diff --git a/sys/kern/kern_rwlock.c b/sys/kern/kern_rwlock.c
index 6449390bda3..181ea10077d 100644
--- a/sys/kern/kern_rwlock.c
+++ b/sys/kern/kern_rwlock.c
@@ -1,4 +1,4 @@
-/*	$OpenBSD: kern_rwlock.c,v 1.51 2024/11/27 01:02:03 dlg Exp $	*/
+/*	$OpenBSD: kern_rwlock.c,v 1.52 2024/12/03 12:50:16 claudio Exp $	*/
 
 /*
  * Copyright (c) 2002, 2003 Artur Grabowski <art@openbsd.org>
@@ -26,12 +26,10 @@
 #include <sys/atomic.h>
 #include <sys/witness.h>
 
-#ifdef RWDIAG
-#include <sys/kernel.h>	/* for hz */
-#define RW_SLEEP_TMO	10 * hz
-#else
-#define RW_SLEEP_TMO	0
-#endif
+void	rw_do_exit(struct rwlock *, unsigned long);
+
+/* XXX - temporary measure until proc0 is properly aligned */
+#define RW_PROC(p) (((long)p) & ~RWLOCK_MASK)
 
 /*
  * Other OSes implement more sophisticated mechanism to determine how long the
@@ -42,131 +40,166 @@
 #define RW_SPINS	1000
 
 #ifdef MULTIPROCESSOR
-#define rw_cas(p, e, n)	atomic_cas_ulong(p, e, n)
-#define rw_inc(p)	atomic_inc_int(p)
-#define rw_dec(p)	atomic_dec_int(p)
+#define rw_cas(p, o, n)	(atomic_cas_ulong(p, o, n) != o)
 #else
-static inline unsigned long
-rw_cas(volatile unsigned long *p, unsigned long e, unsigned long n)
+static inline int
+rw_cas(volatile unsigned long *p, unsigned long o, unsigned long n)
 {
-	unsigned long o = *p;
+	if (*p != o)
+		return (1);
+	*p = n;
 
-	if (o == e)
-		*p = n;
-
-	return (o);
-}
-
-static inline void
-rw_inc(volatile unsigned int *p)
-{
-	++(*p);
-}
-
-static inline void
-rw_dec(volatile unsigned int *p)
-{
-	(*p)--;
+	return (0);
 }
 #endif
 
-static int	rw_do_enter_read(struct rwlock *, int);
-static void	rw_do_exit_read(struct rwlock *, unsigned long);
-static int	rw_do_enter_write(struct rwlock *, int);
-static int	rw_downgrade(struct rwlock *, int);
-
-static void	rw_exited(struct rwlock *);
-
-static unsigned long
-rw_self(void)
-{
-	unsigned long self = (unsigned long)curproc;
-
-	CLR(self, RWLOCK_MASK);
-	SET(self, RWLOCK_WRLOCK);
-
-	return (self);
-}
+/*
+ * Magic wand for lock operations. Every operation checks if certain
+ * flags are set and if they aren't, it increments the lock with some
+ * value (that might need some computing in a few cases). If the operation
+ * fails, we need to set certain flags while waiting for the lock.
+ *
+ * RW_WRITE	The lock must be completely empty. We increment it with
+ *		RWLOCK_WRLOCK and the proc pointer of the holder.
+ *		Sets RWLOCK_WAIT|RWLOCK_WRWANT while waiting.
+ * RW_READ	RWLOCK_WRLOCK|RWLOCK_WRWANT may not be set. We increment
+ *		with RWLOCK_READ_INCR. RWLOCK_WAIT while waiting.
+ */
+static const struct rwlock_op {
+	unsigned long inc;
+	unsigned long check;
+	unsigned long wait_set;
+	long proc_mult;
+	int wait_prio;
+} rw_ops[] = {
+	{	/* RW_WRITE */
+		RWLOCK_WRLOCK,
+		ULONG_MAX,
+		RWLOCK_WAIT | RWLOCK_WRWANT,
+		1,
+		PLOCK - 4
+	},
+	{	/* RW_READ */
+		RWLOCK_READ_INCR,
+		RWLOCK_WRLOCK | RWLOCK_WRWANT,
+		RWLOCK_WAIT,
+		0,
+		PLOCK
+	},
+	{	/* Sparse Entry. */
+		0,
+	},
+	{	/* RW_DOWNGRADE */
+		RWLOCK_READ_INCR - RWLOCK_WRLOCK,
+		0,
+		0,
+		-1,
+		PLOCK
+	},
+};
 
 void
 rw_enter_read(struct rwlock *rwl)
 {
-	rw_do_enter_read(rwl, 0);
+	unsigned long owner = rwl->rwl_owner;
+
+	if (__predict_false((owner & (RWLOCK_WRLOCK | RWLOCK_WRWANT)) ||
+	    rw_cas(&rwl->rwl_owner, owner, owner + RWLOCK_READ_INCR)))
+		rw_enter(rwl, RW_READ);
+	else {
+		membar_enter_after_atomic();
+		WITNESS_CHECKORDER(&rwl->rwl_lock_obj, LOP_NEWORDER, NULL);
+		WITNESS_LOCK(&rwl->rwl_lock_obj, 0);
+	}
 }
 
 void
 rw_enter_write(struct rwlock *rwl)
 {
-	rw_do_enter_write(rwl, 0);
+	struct proc *p = curproc;
+
+	if (__predict_false(rw_cas(&rwl->rwl_owner, 0,
+	    RW_PROC(p) | RWLOCK_WRLOCK)))
+		rw_enter(rwl, RW_WRITE);
+	else {
+		membar_enter_after_atomic();
+		WITNESS_CHECKORDER(&rwl->rwl_lock_obj,
+		    LOP_EXCLUSIVE | LOP_NEWORDER, NULL);
+		WITNESS_LOCK(&rwl->rwl_lock_obj, LOP_EXCLUSIVE);
+	}
 }
 
 void
 rw_exit_read(struct rwlock *rwl)
 {
-	/* maybe we're the last one? */
-	rw_do_exit_read(rwl, RWLOCK_READ_INCR);
-}
-
-static void
-rw_do_exit_read(struct rwlock *rwl, unsigned long owner)
-{
-	unsigned long decr;
-	unsigned long nowner;
+	unsigned long owner;
 
+	rw_assert_rdlock(rwl);
 	WITNESS_UNLOCK(&rwl->rwl_lock_obj, 0);
 
-	for (;;) {
-		decr = owner - RWLOCK_READ_INCR;
-		nowner = rw_cas(&rwl->rwl_owner, owner, decr);
-		if (owner == nowner)
-			break;
-
-		if (__predict_false(ISSET(nowner, RWLOCK_WRLOCK))) {
-			panic("%s rwlock %p: exit read on write locked lock"
-			    " (owner 0x%lx)", rwl->rwl_name, rwl, nowner);
-		}
-		if (__predict_false(nowner == 0)) {
-			panic("%s rwlock %p: exit read on unlocked lock",
-			    rwl->rwl_name, rwl);
-		}
-
-		owner = nowner;
-	}
-
-	/* read lock didn't change anything, so no barrier needed? */
-
-	if (decr == 0) {
-		/* last one out */
-		rw_exited(rwl);
-	}
+	membar_exit_before_atomic();
+	owner = rwl->rwl_owner;
+	if (__predict_false((owner & RWLOCK_WAIT) ||
+	    rw_cas(&rwl->rwl_owner, owner, owner - RWLOCK_READ_INCR)))
+		rw_do_exit(rwl, 0);
 }
 
 void
 rw_exit_write(struct rwlock *rwl)
 {
-	unsigned long self = rw_self();
 	unsigned long owner;
 
+	rw_assert_wrlock(rwl);
 	WITNESS_UNLOCK(&rwl->rwl_lock_obj, LOP_EXCLUSIVE);
 
 	membar_exit_before_atomic();
-	owner = rw_cas(&rwl->rwl_owner, self, 0);
-	if (__predict_false(owner != self)) {
-		panic("%s rwlock %p: exit write when lock not held "
-		    "(owner 0x%lx, self 0x%lx)", rwl->rwl_name, rwl,
-		    owner, self);
-	}
-
-	rw_exited(rwl);
+	owner = rwl->rwl_owner;
+	if (__predict_false((owner & RWLOCK_WAIT) ||
+	    rw_cas(&rwl->rwl_owner, owner, 0)))
+		rw_do_exit(rwl, RWLOCK_WRLOCK);
 }
 
+#ifdef DIAGNOSTIC
+/*
+ * Put the diagnostic functions here to keep the main code free
+ * from ifdef clutter.
+ */
+static void
+rw_enter_diag(struct rwlock *rwl, int flags)
+{
+	switch (flags & RW_OPMASK) {
+	case RW_WRITE:
+	case RW_READ:
+		if (RW_PROC(curproc) == RW_PROC(rwl->rwl_owner))
+			panic("rw_enter: %s locking against myself",
+			    rwl->rwl_name);
+		break;
+	case RW_DOWNGRADE:
+		/*
+		 * If we're downgrading, we must hold the write lock.
+		 */
+		if ((rwl->rwl_owner & RWLOCK_WRLOCK) == 0)
+			panic("rw_enter: %s downgrade of non-write lock",
+			    rwl->rwl_name);
+		if (RW_PROC(curproc) != RW_PROC(rwl->rwl_owner))
+			panic("rw_enter: %s downgrade, not holder",
+			    rwl->rwl_name);
+		break;
+
+	default:
+		panic("rw_enter: unknown op 0x%x", flags);
+	}
+}
+
+#else
+#define rw_enter_diag(r, f)
+#endif
+
 static void
 _rw_init_flags_witness(struct rwlock *rwl, const char *name, int lo_flags,
     const struct lock_type *type)
 {
 	rwl->rwl_owner = 0;
-	rwl->rwl_waiters = 0;
-	rwl->rwl_readers = 0;
 	rwl->rwl_name = name;
 
 #ifdef WITNESS
@@ -190,246 +223,90 @@ _rw_init_flags(struct rwlock *rwl, const char *name, int flags,
 int
 rw_enter(struct rwlock *rwl, int flags)
 {
-	int op = flags & RW_OPMASK;
-	int error;
-
-	switch (op) {
-	case RW_WRITE:
-		error = rw_do_enter_write(rwl, flags);
-		break;
-	case RW_READ:
-		error = rw_do_enter_read(rwl, flags);
-		break;
-	case RW_DOWNGRADE:
-		error = rw_downgrade(rwl, flags);
-		break;
-	default:
-		panic("%s rwlock %p: %s unexpected op 0x%x",
-		    rwl->rwl_name, rwl, __func__, op);
-		/* NOTREACHED */
-	}
-
-	return (error);
-}
-
-static int
-rw_do_enter_write(struct rwlock *rwl, int flags)
-{
-	unsigned long self = rw_self();
-	unsigned long owner;
-	int prio;
-	int error;
-
-#ifdef WITNESS
-	int lop_flags = LOP_NEWORDER | LOP_EXCLUSIVE;
-	if (ISSET(flags, RW_DUPOK))
-		lop_flags |= LOP_DUPOK;
-
-	if (!ISSET(flags, RW_NOSLEEP))
-		WITNESS_CHECKORDER(&rwl->rwl_lock_obj, lop_flags, NULL);
-#endif
-
-	owner = rw_cas(&rwl->rwl_owner, 0, self);
-	if (owner == 0) {
-		/* wow, we won. so easy */
-		goto locked;
-	}
-	if (__predict_false(owner == self)) {
-		panic("%s rwlock %p: enter write deadlock",
-		    rwl->rwl_name, rwl);
-	}
-
+	const struct rwlock_op *op;
+	unsigned long inc, o;
 #ifdef MULTIPROCESSOR
 	/*
 	 * If process holds the kernel lock, then we want to give up on CPU
 	 * as soon as possible so other processes waiting for the kernel lock
 	 * can progress. Hence no spinning if we hold the kernel lock.
 	 */
-	if (!_kernel_lock_held()) {
-		int spins;
+	unsigned int spin = (_kernel_lock_held()) ? 0 : RW_SPINS;
+#endif
+	int error, prio;
+#ifdef WITNESS
+	int lop_flags;
 
+	lop_flags = LOP_NEWORDER;
+	if (flags & RW_WRITE)
+		lop_flags |= LOP_EXCLUSIVE;
+	if (flags & RW_DUPOK)
+		lop_flags |= LOP_DUPOK;
+	if ((flags & RW_NOSLEEP) == 0 && (flags & RW_DOWNGRADE) == 0)
+		WITNESS_CHECKORDER(&rwl->rwl_lock_obj, lop_flags, NULL);
+#endif
+
+	op = &rw_ops[(flags & RW_OPMASK) - 1];
+
+	inc = op->inc + RW_PROC(curproc) * op->proc_mult;
+retry:
+	while (__predict_false(((o = rwl->rwl_owner) & op->check) != 0)) {
+		unsigned long set = o | op->wait_set;
+		int do_sleep;
+
+		/* Avoid deadlocks after panic or in DDB */
+		if (panicstr || db_active)
+			return (0);
+
+#ifdef MULTIPROCESSOR
 		/*
 		 * It makes sense to try to spin just in case the lock
 		 * is acquired by writer.
 		 */
-
-		for (spins = 0; spins < RW_SPINS; spins++) {
+		if ((o & RWLOCK_WRLOCK) && (spin != 0)) {
+			spin--;
 			CPU_BUSY_CYCLE();
-			owner = atomic_load_long(&rwl->rwl_owner);
-			if (owner != 0)
-				continue;
-
-			owner = rw_cas(&rwl->rwl_owner, 0, self);
-			if (owner == 0) {
-				/* ok, we won now. */
-				goto locked;
-			}
-		}
-	}
-#endif
-
-	if (ISSET(flags, RW_NOSLEEP))
-		return (EBUSY);
-
-	prio = PLOCK - 4;
-	if (ISSET(flags, RW_INTR))
-		prio |= PCATCH;
-
-	rw_inc(&rwl->rwl_waiters);
-	membar_producer();
-	do {
-		sleep_setup(&rwl->rwl_waiters, prio, rwl->rwl_name);
-		membar_consumer();
-		owner = atomic_load_long(&rwl->rwl_owner);
-		error = sleep_finish(RW_SLEEP_TMO, owner != 0);
-#ifdef RWDIAG
-		if (error == EWOULDBLOCK) {
-			printf("%s rwlock %p: %s timeout owner 0x%lx "
-			    "(self 0x%lx)", rwl->rwl_name, rwl, __func__,
-			    owner, self);
-			db_enter();
+			continue;
 		}
 #endif
-		if (ISSET(flags, RW_INTR) && (error != 0)) {
-			rw_dec(&rwl->rwl_waiters);
+
+		rw_enter_diag(rwl, flags);
+
+		if (flags & RW_NOSLEEP)
+			return (EBUSY);
+
+		prio = op->wait_prio;
+		if (flags & RW_INTR)
+			prio |= PCATCH;
+		sleep_setup(rwl, prio, rwl->rwl_name);
+
+		do_sleep = !rw_cas(&rwl->rwl_owner, o, set);
+
+		error = sleep_finish(0, do_sleep);
+		if ((flags & RW_INTR) &&
+		    (error != 0))
 			return (error);
-		}
-		if (ISSET(flags, RW_SLEEPFAIL)) {
-			rw_dec(&rwl->rwl_waiters);
-			rw_exited(rwl);
+		if (flags & RW_SLEEPFAIL)
 			return (EAGAIN);
-		}
+	}
 
-		owner = rw_cas(&rwl->rwl_owner, 0, self);
-	} while (owner != 0);
-	rw_dec(&rwl->rwl_waiters);
-
-locked:
+	if (__predict_false(rw_cas(&rwl->rwl_owner, o, o + inc)))
+		goto retry;
 	membar_enter_after_atomic();
-	WITNESS_LOCK(&rwl->rwl_lock_obj, lop_flags);
 
-	return (0);
-}
+	/*
+	 * If old lock had RWLOCK_WAIT and RWLOCK_WRLOCK set, it means we
+	 * downgraded a write lock and had possible read waiter, wake them
+	 * to let them retry the lock.
+	 */
+	if (__predict_false((o & (RWLOCK_WRLOCK|RWLOCK_WAIT)) ==
+	    (RWLOCK_WRLOCK|RWLOCK_WAIT)))
+		wakeup(rwl);
 
-static int
-rw_read_incr(struct rwlock *rwl, unsigned long owner)
-{
-	unsigned long incr;
-	unsigned long nowner;
-
-	do {
-		incr = owner + RWLOCK_READ_INCR;
-		nowner = rw_cas(&rwl->rwl_owner, owner, incr);
-		if (nowner == owner)
-			return (1);
-
-		owner = nowner;
-	} while (!ISSET(owner, RWLOCK_WRLOCK));
-
-	return (0);
-}
-
-static int
-rw_do_enter_read(struct rwlock *rwl, int flags)
-{
-	unsigned long owner;
-	int error;
-	int prio;
-
-#ifdef WITNESS
-	int lop_flags = LOP_NEWORDER;
-	if (ISSET(flags, RW_DUPOK))
-		lop_flags |= LOP_DUPOK;
-	if (!ISSET(flags, RW_NOSLEEP))
-		WITNESS_CHECKORDER(&rwl->rwl_lock_obj, lop_flags, NULL);
-#endif
-
-	owner = rw_cas(&rwl->rwl_owner, 0, RWLOCK_READ_INCR);
-	if (owner == 0) {
-		/* ermagerd, we won! */
-		goto locked;
-	}
-
-	if (ISSET(owner, RWLOCK_WRLOCK)) {
-		if (__predict_false(owner == rw_self())) {
-			panic("%s rwlock %p: enter read deadlock",
-			    rwl->rwl_name, rwl);
-		}
-	} else if (atomic_load_int(&rwl->rwl_waiters) == 0) {
-		if (rw_read_incr(rwl, owner)) {
-			/* nailed it */
-			goto locked;
-		}
-	}
-
-	if (ISSET(flags, RW_NOSLEEP))
-		return (EBUSY);
-
-	prio = PLOCK;
-	if (ISSET(flags, RW_INTR))
-		prio |= PCATCH;
-
-	rw_inc(&rwl->rwl_readers);
-	membar_producer();
-	do {
-		sleep_setup(&rwl->rwl_readers, prio, rwl->rwl_name);
-		membar_consumer();
-		error = sleep_finish(RW_SLEEP_TMO,
-		    atomic_load_int(&rwl->rwl_waiters) > 0 ||
-		    ISSET(atomic_load_long(&rwl->rwl_owner), RWLOCK_WRLOCK));
-#ifdef RWDIAG
-		if (error == EWOULDBLOCK) {
-			printf("%s rwlock %p: %s timeout owner 0x%lx\n",
-			    rwl->rwl_name, rwl, __func__, owner);
-			db_enter();
-		}
-#endif
-		if (ISSET(flags, RW_INTR) && (error != 0))
-			goto fail;
-		if (ISSET(flags, RW_SLEEPFAIL)) {
-			error = EAGAIN;
-			goto fail;
-		}
-	} while (!rw_read_incr(rwl, 0));
-	rw_dec(&rwl->rwl_readers);
-
-locked:
-	membar_enter_after_atomic();
-	WITNESS_LOCK(&rwl->rwl_lock_obj, lop_flags);
-
-	return (0);
-fail:
-	rw_dec(&rwl->rwl_readers);
-	return (error);
-}
-
-static int
-rw_downgrade(struct rwlock *rwl, int flags)
-{
-	unsigned long self = rw_self();
-	unsigned long owner;
-
-	membar_exit_before_atomic();
-	owner = atomic_cas_ulong(&rwl->rwl_owner, self, RWLOCK_READ_INCR);
-	if (__predict_false(owner != self)) {
-		panic("%s rwlock %p: downgrade when lock not held "
-		    "(owner 0x%lx, self 0x%lx)", rwl->rwl_name, rwl,
-		    owner, self);
-	}
-
-#ifdef WITNESS
-	{
-		int lop_flags = LOP_NEWORDER;
-		if (ISSET(flags, RW_DUPOK))
-			lop_flags |= LOP_DUPOK;
+	if (flags & RW_DOWNGRADE)
 		WITNESS_DOWNGRADE(&rwl->rwl_lock_obj, lop_flags);
-	}
-#endif
-
-	membar_consumer();
-	if (atomic_load_int(&rwl->rwl_waiters) == 0 &&
-	    atomic_load_int(&rwl->rwl_readers) > 0)
-		wakeup(&rwl->rwl_readers);
+	else
+		WITNESS_LOCK(&rwl->rwl_lock_obj, lop_flags);
 
 	return (0);
 }
@@ -437,38 +314,53 @@ rw_downgrade(struct rwlock *rwl, int flags)
 void
 rw_exit(struct rwlock *rwl)
 {
-	unsigned long owner;
+	unsigned long wrlock;
 
-	owner = atomic_load_long(&rwl->rwl_owner);
-	if (__predict_false(owner == 0)) {
-		panic("%s rwlock %p: exit on unlocked lock",
-		    rwl->rwl_name, rwl);
-	}
+	/* Avoid deadlocks after panic or in DDB */
+	if (panicstr || db_active)
+		return;
 
-	if (ISSET(owner, RWLOCK_WRLOCK))
-		rw_exit_write(rwl);
+	wrlock = rwl->rwl_owner & RWLOCK_WRLOCK;
+	if (wrlock)
+		rw_assert_wrlock(rwl);
 	else
-		rw_do_exit_read(rwl, owner);
+		rw_assert_rdlock(rwl);
+	WITNESS_UNLOCK(&rwl->rwl_lock_obj, wrlock ? LOP_EXCLUSIVE : 0);
+
+	membar_exit_before_atomic();
+	rw_do_exit(rwl, wrlock);
 }
 
-static void
-rw_exited(struct rwlock *rwl)
+/* membar_exit_before_atomic() has to precede call of this function. */
+void
+rw_do_exit(struct rwlock *rwl, unsigned long wrlock)
 {
-	membar_consumer();
-	if (atomic_load_int(&rwl->rwl_waiters) > 0)
-		wakeup_one(&rwl->rwl_waiters);
-	else if (atomic_load_int(&rwl->rwl_readers) > 0)
-		wakeup(&rwl->rwl_readers);
+	unsigned long owner, set;
+
+	do {
+		owner = rwl->rwl_owner;
+		if (wrlock)
+			set = 0;
+		else
+			set = (owner - RWLOCK_READ_INCR) &
+			    ~(RWLOCK_WAIT|RWLOCK_WRWANT);
+		/*
+		 * Potential MP race here. If the owner had WRWANT set, we
+		 * cleared it and a reader can sneak in before a writer.
+		 */
+	} while (__predict_false(rw_cas(&rwl->rwl_owner, owner, set)));
+
+	if (owner & RWLOCK_WAIT)
+		wakeup(rwl);
 }
 
 int
 rw_status(struct rwlock *rwl)
 {
-	unsigned long owner;
+	unsigned long owner = rwl->rwl_owner;
 
-	owner = atomic_load_long(&rwl->rwl_owner);
-	if (ISSET(owner, RWLOCK_WRLOCK)) {
-		if (rw_self() == owner)
+	if (owner & RWLOCK_WRLOCK) {
+		if (RW_PROC(curproc) == RW_PROC(owner))
 			return RW_WRITE;
 		else
 			return RW_WRITE_OTHER;
@@ -488,10 +380,11 @@ rw_assert_wrlock(struct rwlock *rwl)
 #ifdef WITNESS
 	witness_assert(&rwl->rwl_lock_obj, LA_XLOCKED);
 #else
-	if (atomic_load_long(&rwl->rwl_owner) != rw_self()) {
-		panic("%s rwlock %p: lock not held by this process",
-		    rwl->rwl_name, rwl);
-	}
+	if (!(rwl->rwl_owner & RWLOCK_WRLOCK))
+		panic("%s: lock not held", rwl->rwl_name);
+
+	if (RW_PROC(curproc) != RW_PROC(rwl->rwl_owner))
+		panic("%s: lock not held by this process", rwl->rwl_name);
 #endif
 }
 
@@ -504,8 +397,8 @@ rw_assert_rdlock(struct rwlock *rwl)
 #ifdef WITNESS
 	witness_assert(&rwl->rwl_lock_obj, LA_SLOCKED);
 #else
-	if (rw_status(rwl) != RW_READ)
-		panic("%s rwlock %p: lock not shared", rwl->rwl_name, rwl);
+	if (!RW_PROC(rwl->rwl_owner) || (rwl->rwl_owner & RWLOCK_WRLOCK))
+		panic("%s: lock not shared", rwl->rwl_name);
 #endif
 }
 
@@ -520,11 +413,9 @@ rw_assert_anylock(struct rwlock *rwl)
 #else
 	switch (rw_status(rwl)) {
 	case RW_WRITE_OTHER:
-		panic("%s rwlock %p: lock held by different process "
-		    "(self %lx, owner %lx)", rwl->rwl_name, rwl,
-		    rw_self(), rwl->rwl_owner);
+		panic("%s: lock held by different process", rwl->rwl_name);
 	case 0:
-		panic("%s rwlock %p: lock not held", rwl->rwl_name, rwl);
+		panic("%s: lock not held", rwl->rwl_name);
 	}
 #endif
 }
@@ -538,8 +429,8 @@
 #ifdef WITNESS
 	witness_assert(&rwl->rwl_lock_obj, LA_UNLOCKED);
 #else
-	if (atomic_load_long(&rwl->rwl_owner) == rw_self())
-		panic("%s rwlock %p: lock held", rwl->rwl_name, rwl);
+	if (RW_PROC(curproc) == RW_PROC(rwl->rwl_owner))
+		panic("%s: lock held", rwl->rwl_name);
 #endif
 }
 #endif
@@ -559,7 +450,7 @@ rrw_enter(struct rrwlock *rrwl, int flags)
 {
 	int	rv;
 
-	if (atomic_load_long(&rrwl->rrwl_lock.rwl_owner) == rw_self()) {
+	if (RW_PROC(rrwl->rrwl_lock.rwl_owner) == RW_PROC(curproc)) {
 		if (flags & RW_RECURSEFAIL)
 			return (EDEADLK);
 		else {
@@ -581,7 +472,7 @@ void
 rrw_exit(struct rrwlock *rrwl)
 {
 
-	if (atomic_load_long(&rrwl->rrwl_lock.rwl_owner) == rw_self()) {
+	if (RW_PROC(rrwl->rrwl_lock.rwl_owner) == RW_PROC(curproc)) {
 		KASSERT(rrwl->rrwl_wcnt > 0);
 		rrwl->rrwl_wcnt--;
 		if (rrwl->rrwl_wcnt != 0) {
diff --git a/sys/sys/rwlock.h b/sys/sys/rwlock.h
index cd7649789d1..284a6af4f6e 100644
--- a/sys/sys/rwlock.h
+++ b/sys/sys/rwlock.h
@@ -1,4 +1,4 @@
-/*	$OpenBSD: rwlock.h,v 1.29 2024/11/27 01:02:03 dlg Exp $	*/
+/*	$OpenBSD: rwlock.h,v 1.30 2024/12/03 12:50:16 claudio Exp $	*/
 /*
  * Copyright (c) 2002 Artur Grabowski <art@openbsd.org>
  *
@@ -60,8 +60,6 @@ struct proc;
 
 struct rwlock {
	volatile unsigned long	 rwl_owner;
-	volatile unsigned int	 rwl_waiters;
-	volatile unsigned int	 rwl_readers;
 	const char		*rwl_name;
 #ifdef WITNESS
 	struct lock_object	 rwl_lock_obj;
@@ -93,12 +91,14 @@ struct rwlock {
 
 #ifdef WITNESS
 #define RWLOCK_INITIALIZER(name) \
-	{ 0, 0, 0, name, .rwl_lock_obj = RWLOCK_LO_INITIALIZER(name, 0) }
+	{ 0, name, .rwl_lock_obj = RWLOCK_LO_INITIALIZER(name, 0) }
 #else
 #define RWLOCK_INITIALIZER(name) \
-	{ 0, 0, 0, name }
+	{ 0, name }
 #endif
 
+#define RWLOCK_WAIT		0x01UL
+#define RWLOCK_WRWANT		0x02UL
 #define RWLOCK_WRLOCK		0x04UL
 #define RWLOCK_MASK		0x07UL
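
Note on the owner-word encoding (not part of the patch): the scheme this revert restores packs the entire lock state into a single unsigned long. RWLOCK_WAIT, RWLOCK_WRWANT and RWLOCK_WRLOCK occupy the low three bits, and everything above RWLOCK_MASK holds either the reader count (in steps of RWLOCK_READ_INCR) or the RWLOCK_MASK-aligned proc pointer of the write owner. The sketch below is a minimal userland model of just that encoding, using C11 atomics; try_enter_read(), try_enter_write() and the bare owner word are hypothetical stand-ins rather than kernel API, and the rw_ops table, spinning and sleep/wakeup slow path of rw_enter() are deliberately left out.

/*
 * Hypothetical userland model of the rwlock owner-word encoding.
 * Build with: cc -std=c11 -o rwmodel rwmodel.c
 */
#include <stdatomic.h>
#include <stdio.h>

#define WAIT		0x01UL		/* somebody sleeps on the lock */
#define WRWANT		0x02UL		/* writer waiting; stalls new readers */
#define WRLOCK		0x04UL		/* write locked; owner in upper bits */
#define MASK		0x07UL
#define READ_INCR	(MASK + 1)	/* one reader == 0x08 */

static _Atomic unsigned long owner;

/* Read fast path: refuse if a writer holds or wants the lock. */
static int
try_enter_read(void)
{
	unsigned long o = atomic_load(&owner);

	if (o & (WRLOCK | WRWANT))
		return (0);
	/* One CAS either bumps the reader count or reports interference. */
	return (atomic_compare_exchange_strong(&owner, &o, o + READ_INCR));
}

/* Write fast path: only a completely empty word may be claimed. */
static int
try_enter_write(unsigned long self)
{
	unsigned long expected = 0;

	return (atomic_compare_exchange_strong(&owner, &expected,
	    (self & ~MASK) | WRLOCK));
}

int
main(void)
{
	/* Any suitably aligned address stands in for curproc here. */
	unsigned long self = (unsigned long)&owner;

	printf("reader 1: %d\n", try_enter_read());	/* 1, word is 0x08 */
	printf("reader 2: %d\n", try_enter_read());	/* 1, word is 0x10 */
	printf("writer:   %d\n", try_enter_write(self));	/* 0, readers hold it */
	return (0);
}

Read through this model, the fast paths in the patch are single CAS operations on one word: rw_enter_read() adds RWLOCK_READ_INCR when neither RWLOCK_WRLOCK nor RWLOCK_WRWANT is set, rw_exit_write() swaps the word back to zero, and both fall through to the rw_enter()/rw_do_exit() slow paths only when RWLOCK_WAIT indicates a sleeper needs a wakeup().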