
Merge branch 'freebsd/current/master' into hardened/current/master

* freebsd/current/master:
  vfs: remove production kernel checks and mp == NULL support from vdrop
  mac: use a sleepable rmlock instead of an sx lock
  Add read-mostly sleepable locks
hardened/current/master · HardenedBSD Sync Service · commit 8a60940898
6 changed files with 342 additions and 50 deletions:

  1. share/man/man9/rmlock.9            +67   -12
  2. sys/kern/kern_rmlock.c             +239  -0
  3. sys/kern/vfs_subr.c                +13   -36
  4. sys/security/mac/mac_framework.c   +6    -2
  5. sys/sys/_rmlock.h                  +10   -0
  6. sys/sys/rmlock.h                   +7    -0

share/man/man9/rmlock.9 (+67 -12)

@@ -26,7 +26,7 @@
.\" $FreeBSD$
.\"
.\" Based on rwlock.9 man page
-.Dd November 11, 2017
+.Dd December 27, 2019
.Dt RMLOCK 9
.Os
.Sh NAME
@@ -43,7 +43,13 @@
.Nm rm_sleep ,
.Nm rm_assert ,
.Nm RM_SYSINIT ,
-.Nm RM_SYSINIT_FLAGS
+.Nm RM_SYSINIT_FLAGS ,
+.Nm rms_init ,
+.Nm rms_destroy ,
+.Nm rms_rlock ,
+.Nm rms_wlock ,
+.Nm rms_runlock ,
+.Nm rms_wunlock
.Nd kernel reader/writer lock optimized for read-mostly access patterns
.Sh SYNOPSIS
.In sys/param.h
@@ -77,6 +83,18 @@
.In sys/kernel.h
.Fn RM_SYSINIT "name" "struct rmlock *rm" "const char *desc"
.Fn RM_SYSINIT_FLAGS "name" "struct rmlock *rm" "const char *desc" "int flags"
.Ft void
.Fn rms_init "struct rmslock *rms" "const char *name"
.Ft void
.Fn rms_destroy "struct rmslock *rms"
.Ft void
.Fn rms_rlock "struct rmslock *rms"
.Ft void
.Fn rms_wlock "struct rmslock *rms"
.Ft void
.Fn rms_runlock "struct rmslock *rms"
.Ft void
.Fn rms_wunlock "struct rmslock *rms"
.Sh DESCRIPTION
Read-mostly locks allow shared access to protected data by multiple threads,
or exclusive access by a single thread.
@@ -113,22 +131,22 @@ Readers can recurse if the lock is initialized with the
option;
however, writers are never allowed to recurse.
.Pp
-Sleepable read-mostly locks are created by passing
+Sleeping for writers can be allowed by passing
.Dv RM_SLEEPABLE
to
.Fn rm_init_flags .
-Unlike normal read-mostly locks,
-sleepable read-mostly locks follow the same lock ordering rules as
+It changes lock ordering rules to the same as for
.Xr sx 9
locks.
-Sleepable read-mostly locks do not propagate priority to writers,
-but they do propagate priority to readers.
-Writers are permitted to sleep while holding a read-mostly lock,
-but readers are not.
-Unlike other sleepable locks such as
+They do not propagate priority to writers, but they do propagate priority to
+readers. Note that readers are not permitted to sleep regardless of the flag.
+.Pp
+Sleepable read-mostly locks (created with
+.Fn rms_init )
+allow sleeping for both readers and writers, but don't do priority propagation
+for either. They follow
.Xr sx 9
-locks,
-readers must use try operations on other sleepable locks to avoid sleeping.
+lock ordering.
.Ss Macros and Functions
.Bl -tag -width indent
.It Fn rm_init "struct rmlock *rm" "const char *name"
@@ -286,6 +304,43 @@ Assert that the current thread does not hold a recursive lock of
.Fa rm .
.El
.El
.Bl -tag -width indent
.It Fn rms_init "struct rmslock *rms" "const char *name"
Initialize the sleepable read-mostly lock
.Fa rms .
The
.Fa name
description is used as the
.Fa wmesg
parameter to the
.Xr msleep 9
routine.
This function must be called before any other operations on the lock.
.It Fn rms_rlock "struct rmslock *rms"
Lock
.Fa rms
as a reader.
If any thread holds this lock exclusively, the current thread blocks.
.It Fn rms_wlock "struct rmslock *rms"
Lock
.Fa rms
as a writer.
If the lock is already taken, the current thread blocks.
The
.Fn rms_wlock
function cannot be called recursively.
.It Fn rms_runlock "struct rmslock *rms"
This function releases a shared lock previously acquired by
.Fn rms_rlock .
.It Fn rms_wunlock "struct rmslock *rms"
This function releases an exclusive lock previously acquired by
.Fn rms_wlock .
.It Fn rms_destroy "struct rmslock *rms"
This function destroys a lock previously initialized with
.Fn rms_init .
The
.Fa rms
lock must be unlocked.
.El
.Sh SEE ALSO
.Xr locking 9 ,
.Xr mutex 9 ,
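
The new manual page text above documents the rms_* API without an example, so here is a minimal usage sketch (not part of the commit) showing the documented calls together. The struct cfg type and all cfg_* names are hypothetical, introduced only for illustration; readers and writers of such a lock are both allowed to sleep while holding it.

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/rmlock.h>

struct cfg;                             /* hypothetical read-mostly data */

static struct rmslock cfg_lock;         /* hypothetical lock */
static struct cfg *cfg_data;

static void
cfg_setup(void)
{

        rms_init(&cfg_lock, "cfg_lock");        /* before any other use */
}

static struct cfg *
cfg_read_begin(void)
{

        rms_rlock(&cfg_lock);           /* reader; may sleep while held */
        return (cfg_data);
}

static void
cfg_read_end(void)
{

        rms_runlock(&cfg_lock);
}

static void
cfg_replace(struct cfg *new_cfg)
{

        rms_wlock(&cfg_lock);           /* rare writer; may also sleep */
        cfg_data = new_cfg;
        rms_wunlock(&cfg_lock);
}

static void
cfg_teardown(void)
{

        rms_destroy(&cfg_lock);         /* lock must be unheld */
}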


sys/kern/kern_rmlock.c (+239 -0)

@@ -53,6 +53,7 @@ __FBSDID("$FreeBSD$");
#include <sys/turnstile.h>
#include <sys/lock_profile.h>
#include <machine/cpu.h>
#include <vm/uma.h>

#ifdef DDB
#include <ddb/ddb.h>
@@ -853,3 +854,241 @@ db_show_rm(const struct lock_object *lock)
        lc->lc_ddb_show(&rm->rm_wlock_object);
}
#endif

/*
 * Read-mostly sleepable locks.
 *
 * These primitives allow both readers and writers to sleep. However, neither
 * readers nor writers are tracked and consequently there is no priority
 * propagation.
 *
 * They are intended to be used only when write-locking is almost never needed
 * (e.g., they can guard against unloading a kernel module) while read-locking
 * happens all the time.
 *
 * Concurrent writers take turns taking the lock while going off cpu. If this is
 * of concern for your use case, this is not the right primitive.
 *
 * Neither rms_rlock nor rms_runlock use fences. Instead, compiler barriers are
 * inserted to prevent reordering of the generated code. Execution ordering is
 * provided with the use of an IPI handler.
 */

void
rms_init(struct rmslock *rms, const char *name)
{

        rms->writers = 0;
        rms->readers = 0;
        mtx_init(&rms->mtx, name, NULL, MTX_DEF | MTX_NEW);
        rms->readers_pcpu = uma_zalloc_pcpu(pcpu_zone_int, M_WAITOK | M_ZERO);
        rms->readers_influx = uma_zalloc_pcpu(pcpu_zone_int, M_WAITOK | M_ZERO);
}

void
rms_destroy(struct rmslock *rms)
{

        MPASS(rms->writers == 0);
        MPASS(rms->readers == 0);
        mtx_destroy(&rms->mtx);
        uma_zfree_pcpu(pcpu_zone_int, rms->readers_pcpu);
        uma_zfree_pcpu(pcpu_zone_int, rms->readers_influx);
}

static void __noinline
rms_rlock_fallback(struct rmslock *rms)
{

        (*zpcpu_get(rms->readers_influx)) = 0;
        critical_exit();

        mtx_lock(&rms->mtx);
        MPASS(*zpcpu_get(rms->readers_pcpu) == 0);
        while (rms->writers > 0)
                msleep(&rms->readers, &rms->mtx, PUSER - 1,
                    mtx_name(&rms->mtx), 0);
        (*zpcpu_get(rms->readers_pcpu))++;
        mtx_unlock(&rms->mtx);
}

void
rms_rlock(struct rmslock *rms)
{
        int *influx;

        WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, __func__);

        critical_enter();
        influx = zpcpu_get(rms->readers_influx);
        __compiler_membar();
        *influx = 1;
        __compiler_membar();
        if (__predict_false(rms->writers > 0)) {
                rms_rlock_fallback(rms);
                return;
        }
        __compiler_membar();
        (*zpcpu_get(rms->readers_pcpu))++;
        __compiler_membar();
        *influx = 0;
        critical_exit();
}

static void __noinline
rms_runlock_fallback(struct rmslock *rms)
{

        (*zpcpu_get(rms->readers_influx)) = 0;
        critical_exit();

        mtx_lock(&rms->mtx);
        MPASS(*zpcpu_get(rms->readers_pcpu) == 0);
        MPASS(rms->writers > 0);
        MPASS(rms->readers > 0);
        rms->readers--;
        if (rms->readers == 0)
                wakeup_one(&rms->writers);
        mtx_unlock(&rms->mtx);
}

void
rms_runlock(struct rmslock *rms)
{
        int *influx;

        WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, __func__);

        critical_enter();
        influx = zpcpu_get(rms->readers_influx);
        __compiler_membar();
        *influx = 1;
        __compiler_membar();
        if (__predict_false(rms->writers > 0)) {
                rms_runlock_fallback(rms);
                return;
        }
        __compiler_membar();
        (*zpcpu_get(rms->readers_pcpu))--;
        __compiler_membar();
        *influx = 0;
        critical_exit();
}

struct rmslock_ipi {
        struct rmslock *rms;
        cpuset_t signal;
};

static void
rms_wlock_IPI(void *arg)
{
        struct rmslock_ipi *rmsipi;
        struct rmslock *rms;
        int readers;

        rmsipi = arg;
        rms = rmsipi->rms;

        if (*zpcpu_get(rms->readers_influx))
                return;
        readers = zpcpu_replace(rms->readers_pcpu, 0);
        if (readers != 0)
                atomic_add_int(&rms->readers, readers);
        CPU_CLR_ATOMIC(curcpu, &rmsipi->signal);
}

static void
rms_wlock_switch(struct rmslock *rms)
{
        struct rmslock_ipi rmsipi;
        int *in_op;
        int cpu;

        MPASS(rms->readers == 0);
        MPASS(rms->writers == 1);

        rmsipi.rms = rms;

        /*
         * Publishes rms->writers. rlock and runlock will get this ordered
         * via IPI in the worst case.
         */
        atomic_thread_fence_rel();

        /*
         * Collect reader counts from all CPUs using an IPI. The handler can
         * find itself running while the interrupted CPU was doing either
         * rlock or runlock, in which case it will fail.
         *
         * Successful attempts clear the cpu id in the bitmap.
         *
         * In case of failure we wait until each failing CPU is observed
         * outside of the marked (readers_influx) section before making the
         * next attempt. Note that threads having the var set have preemption
         * disabled. Setting of readers_influx only uses compiler barriers,
         * making these loads unreliable, which is fine -- the IPI handler
         * will always see the correct result.
         *
         * We retry until all counts are collected. Forward progress is
         * guaranteed by the fact that the total number of threads which can
         * be caught like this is finite and they all are going to block on
         * their own.
         */
        CPU_COPY(&all_cpus, &rmsipi.signal);
        for (;;) {
                smp_rendezvous_cpus(
                    rmsipi.signal,
                    smp_no_rendezvous_barrier,
                    rms_wlock_IPI,
                    smp_no_rendezvous_barrier,
                    &rmsipi);

                if (CPU_EMPTY(&rmsipi.signal))
                        break;

                CPU_FOREACH(cpu) {
                        if (!CPU_ISSET(cpu, &rmsipi.signal))
                                continue;
                        in_op = zpcpu_get_cpu(rms->readers_influx, cpu);
                        while (atomic_load_int(in_op))
                                cpu_spinwait();
                }
        }
}

void
rms_wlock(struct rmslock *rms)
{

        WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, __func__);

        mtx_lock(&rms->mtx);
        rms->writers++;
        if (rms->writers > 1) {
                msleep(&rms->writers, &rms->mtx, (PUSER - 1) | PDROP,
                    mtx_name(&rms->mtx), 0);
                MPASS(rms->readers == 0);
                return;
        }

        rms_wlock_switch(rms);

        if (rms->readers > 0)
                msleep(&rms->writers, &rms->mtx, (PUSER - 1) | PDROP,
                    mtx_name(&rms->mtx), 0);
        else
                mtx_unlock(&rms->mtx);
        MPASS(rms->readers == 0);
}

void
rms_wunlock(struct rmslock *rms)
{

        mtx_lock(&rms->mtx);
        MPASS(rms->writers >= 1);
        MPASS(rms->readers == 0);
        rms->writers--;
        if (rms->writers > 0)
                wakeup_one(&rms->writers);
        else
                wakeup(&rms->readers);
        mtx_unlock(&rms->mtx);
}
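
The comment at the top of this new block names guarding against kernel module unload as the intended use: read paths that run all the time and may sleep, with a rare writer on (de)registration. The following is a hedged sketch of such a consumer; every example_* name is invented here and nothing below is part of the commit.

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/rmlock.h>

struct example_handler {
        void    (*eh_func)(void *arg);          /* may sleep */
        LIST_ENTRY(example_handler) eh_link;
};

static LIST_HEAD(, example_handler) example_handlers =
    LIST_HEAD_INITIALIZER(example_handlers);
static struct rmslock example_rms;

static void
example_sysinit(void *dummy __unused)
{

        rms_init(&example_rms, "example_rms");
}
SYSINIT(example_rms, SI_SUB_LOCK, SI_ORDER_ANY, example_sysinit, NULL);

/* Hot path: runs all the time, handlers are allowed to sleep. */
void
example_dispatch(void *arg)
{
        struct example_handler *eh;

        rms_rlock(&example_rms);
        LIST_FOREACH(eh, &example_handlers, eh_link)
                eh->eh_func(arg);
        rms_runlock(&example_rms);
}

/* Cold path: module load/unload; excludes all readers while it runs. */
void
example_register(struct example_handler *eh)
{

        rms_wlock(&example_rms);
        LIST_INSERT_HEAD(&example_handlers, eh, eh_link);
        rms_wunlock(&example_rms);
}

void
example_deregister(struct example_handler *eh)
{

        rms_wlock(&example_rms);
        LIST_REMOVE(eh, eh_link);
        rms_wunlock(&example_rms);
}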

sys/kern/vfs_subr.c (+13 -36)

@@ -3225,35 +3225,20 @@ vdrop_deactivate(struct vnode *vp)
("vdrop: freeing when we shouldn't"));
if ((vp->v_iflag & VI_OWEINACT) == 0) {
mp = vp->v_mount;
if (mp != NULL) {
mtx_lock(&mp->mnt_listmtx);
if (vp->v_iflag & VI_ACTIVE) {
vp->v_iflag &= ~VI_ACTIVE;
TAILQ_REMOVE(&mp->mnt_activevnodelist,
vp, v_actfreelist);
mp->mnt_activevnodelistsize--;
}
TAILQ_INSERT_TAIL(&mp->mnt_tmpfreevnodelist,
vp, v_actfreelist);
mp->mnt_tmpfreevnodelistsize++;
vp->v_iflag |= VI_FREE;
vp->v_mflag |= VMP_TMPMNTFREELIST;
VI_UNLOCK(vp);
if (mp->mnt_tmpfreevnodelistsize >=
mnt_free_list_batch)
vnlru_return_batch_locked(mp);
mtx_unlock(&mp->mnt_listmtx);
} else {
VNASSERT((vp->v_iflag & VI_ACTIVE) == 0, vp,
("vdrop: active vnode not on per mount vnode list"));
mtx_lock(&vnode_free_list_mtx);
TAILQ_INSERT_TAIL(&vnode_free_list, vp,
v_actfreelist);
freevnodes++;
vp->v_iflag |= VI_FREE;
VI_UNLOCK(vp);
mtx_unlock(&vnode_free_list_mtx);
mtx_lock(&mp->mnt_listmtx);
if (vp->v_iflag & VI_ACTIVE) {
vp->v_iflag &= ~VI_ACTIVE;
TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist);
mp->mnt_activevnodelistsize--;
}
TAILQ_INSERT_TAIL(&mp->mnt_tmpfreevnodelist, vp, v_actfreelist);
mp->mnt_tmpfreevnodelistsize++;
vp->v_iflag |= VI_FREE;
vp->v_mflag |= VMP_TMPMNTFREELIST;
VI_UNLOCK(vp);
if (mp->mnt_tmpfreevnodelistsize >= mnt_free_list_batch)
vnlru_return_batch_locked(mp);
mtx_unlock(&mp->mnt_listmtx);
} else {
VI_UNLOCK(vp);
counter_u64_add(free_owe_inact, 1);
@@ -3266,10 +3251,6 @@ vdrop(struct vnode *vp)

        ASSERT_VI_UNLOCKED(vp, __func__);
        CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
-       if (__predict_false((int)vp->v_holdcnt <= 0)) {
-               vn_printf(vp, "vdrop: holdcnt %d", vp->v_holdcnt);
-               panic("vdrop: wrong holdcnt");
-       }
        if (refcount_release_if_not_last(&vp->v_holdcnt))
                return;
        VI_LOCK(vp);
@@ -3282,10 +3263,6 @@ vdropl(struct vnode *vp)

        ASSERT_VI_LOCKED(vp, __func__);
        CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
-       if (__predict_false((int)vp->v_holdcnt <= 0)) {
-               vn_printf(vp, "vdrop: holdcnt %d", vp->v_holdcnt);
-               panic("vdrop: wrong holdcnt");
-       }
        if (!refcount_release(&vp->v_holdcnt)) {
                VI_UNLOCK(vp);
                return;
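
With the debug hold-count checks removed, vdrop and vdropl lean on the refcount(9) fast path visible in the surviving context above: refcount_release_if_not_last(9) drops a reference without taking the interlock unless this would be the last reference. A generic sketch of that pattern follows; struct obj, obj_release, and obj_destroy are hypothetical names, not the vnode code.

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/refcount.h>

struct obj {
        struct mtx      lock;
        u_int           refcnt;         /* managed with refcount(9) */
        /* ... object state ... */
};

static void     obj_destroy(struct obj *o);     /* hypothetical teardown */

static void
obj_release(struct obj *o)
{

        /*
         * Fast path: decrements and returns true only when this is not the
         * last reference, so the object lock is not needed.
         */
        if (refcount_release_if_not_last(&o->refcnt))
                return;
        /*
         * Slow path: another thread may still gain a reference before we
         * take the lock, so re-check with refcount_release() and only tear
         * the object down when the count really drops to zero.
         */
        mtx_lock(&o->lock);
        if (refcount_release(&o->refcnt))
                obj_destroy(o);
        else
                mtx_unlock(&o->lock);
}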


sys/security/mac/mac_framework.c (+6 -2)

@@ -176,6 +176,7 @@ MALLOC_DEFINE(M_MACTEMP, "mactemp", "MAC temporary label storage");
#ifndef MAC_STATIC
static struct rmlock mac_policy_rm; /* Non-sleeping entry points. */
static struct sx mac_policy_sx; /* Sleeping entry points. */
+static struct rmslock mac_policy_rms;
#endif

struct mac_policy_list_head mac_policy_list;
@@ -209,7 +210,7 @@ mac_policy_slock_sleep(void)
        if (!mac_late)
                return;

-       sx_slock(&mac_policy_sx);
+       rms_rlock(&mac_policy_rms);
#endif
}

@@ -233,7 +234,7 @@ mac_policy_sunlock_sleep(void)
        if (!mac_late)
                return;

-       sx_sunlock(&mac_policy_sx);
+       rms_runlock(&mac_policy_rms);
#endif
}

@@ -249,6 +250,7 @@ mac_policy_xlock(void)
                return;

        sx_xlock(&mac_policy_sx);
+       rms_wlock(&mac_policy_rms);
        rm_wlock(&mac_policy_rm);
#endif
}
@@ -262,6 +264,7 @@ mac_policy_xunlock(void)
                return;

        rm_wunlock(&mac_policy_rm);
+       rms_wunlock(&mac_policy_rms);
        sx_xunlock(&mac_policy_sx);
#endif
}
@@ -294,6 +297,7 @@ mac_init(void)
        rm_init_flags(&mac_policy_rm, "mac_policy_rm", RM_NOWITNESS |
            RM_RECURSE);
        sx_init_flags(&mac_policy_sx, "mac_policy_sx", SX_NOWITNESS);
+       rms_init(&mac_policy_rms, "mac_policy_rms");
#endif
}
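
In this file the existing rm lock keeps covering the non-sleeping MAC entry points, the new rms lock replaces the sx lock on the sleeping read paths, and the policy load/unload path write-locks everything so that both classes of readers are excluded. A hedged sketch of that layering for a hypothetical subsystem follows; all foo_* names are invented, and the extra sx lock that mac_framework.c still keeps for its write side is omitted here.

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/rmlock.h>

static struct rmlock foo_rm;            /* non-sleeping readers */
static struct rmslock foo_rms;          /* readers that may sleep */

static void
foo_lock_init(void)
{

        rm_init(&foo_rm, "foo_rm");
        rms_init(&foo_rms, "foo_rms");
}

static void
foo_read_fast(void)
{
        struct rm_priotracker tracker;

        rm_rlock(&foo_rm, &tracker);
        /* ... must not sleep ... */
        rm_runlock(&foo_rm, &tracker);
}

static void
foo_read_sleepable(void)
{

        rms_rlock(&foo_rms);
        /* ... may sleep, e.g. M_WAITOK allocations ... */
        rms_runlock(&foo_rms);
}

static void
foo_modify(void)
{

        rms_wlock(&foo_rms);
        rm_wlock(&foo_rm);
        /* ... update the protected registrations ... */
        rm_wunlock(&foo_rm);
        rms_wunlock(&foo_rms);
}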



sys/sys/_rmlock.h (+10 -0)

@@ -68,4 +68,14 @@ struct rm_priotracker {
        LIST_ENTRY(rm_priotracker) rmp_qentry;
};

#include <sys/_mutex.h>

struct rmslock {
        struct mtx mtx;
        int     writers;
        int     readers;
        int     *readers_pcpu;
        int     *readers_influx;
};

#endif /* !_SYS__RMLOCK_H_ */

sys/sys/rmlock.h (+7 -0)

@@ -133,5 +133,12 @@ struct rm_args {
#define rm_assert(rm, what)
#endif

void rms_init(struct rmslock *rms, const char *name);
void rms_destroy(struct rmslock *rms);
void rms_rlock(struct rmslock *rms);
void rms_runlock(struct rmslock *rms);
void rms_wlock(struct rmslock *rms);
void rms_wunlock(struct rmslock *rms);

#endif /* _KERNEL */
#endif /* !_SYS_RMLOCK_H_ */
