/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2004 Poul-Henning Kamp
 * Copyright (c) 1994,1997 John S. Dyson
 * Copyright (c) 2013 The FreeBSD Foundation
 * All rights reserved.
 *
 * Portions of this software were developed by Konstantin Belousov
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * this file contains a new buffer I/O scheme implementing a coherent
 * VM object and buffer cache scheme.  Pains have been taken to make
 * sure that the performance degradation associated with schemes such
 * as this is not realized.
 *
 * Author:  John S. Dyson
 * Significant help during the development and debugging phases
 * had been provided by David Greenman, also of the FreeBSD core team.
 *
 * see man buf(9) for more info.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/bitset.h>
#include <sys/conf.h>
#include <sys/counter.h>
#include <sys/buf.h>
#include <sys/devicestat.h>
#include <sys/eventhandler.h>
#include <sys/fail.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/proc.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vmem.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <sys/watchdog.h>
#include <geom/geom.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>
#include <vm/vm_map.h>
#include <vm/swap_pager.h>
#include "opt_swap.h"

static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer");

struct	bio_ops bioops;		/* I/O operation notification */

struct	buf_ops buf_ops_bio = {
	.bop_name	=	"buf_ops_bio",
	.bop_write	=	bufwrite,
	.bop_strategy	=	bufstrategy,
	.bop_sync	=	bufsync,
	.bop_bdflush	=	bufbdflush,
};

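/*
 * Buffer queues and buffer domains.  A bufqueue is a locked TAILQ of
 * buffers; a bufdomain groups a set of per-CPU clean subqueues, a dirty
 * queue, and the space accounting and thresholds that govern them.
 */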
struct bufqueue {
	struct mtx_padalign	bq_lock;
	TAILQ_HEAD(, buf)	bq_queue;
	uint8_t			bq_index;
	uint16_t		bq_subqueue;
	int			bq_len;
} __aligned(CACHE_LINE_SIZE);

#define	BQ_LOCKPTR(bq)		(&(bq)->bq_lock)
#define	BQ_LOCK(bq)		mtx_lock(BQ_LOCKPTR((bq)))
#define	BQ_UNLOCK(bq)		mtx_unlock(BQ_LOCKPTR((bq)))
#define	BQ_ASSERT_LOCKED(bq)	mtx_assert(BQ_LOCKPTR((bq)), MA_OWNED)

struct bufdomain {
	struct bufqueue	bd_subq[MAXCPU + 1]; /* Per-cpu sub queues + global */
	struct bufqueue bd_dirtyq;
	struct bufqueue	*bd_cleanq;
	struct mtx_padalign bd_run_lock;
	/* Constants */
	long		bd_maxbufspace;
	long		bd_hibufspace;
	long 		bd_lobufspace;
	long 		bd_bufspacethresh;
	int		bd_hifreebuffers;
	int		bd_lofreebuffers;
	int		bd_hidirtybuffers;
	int		bd_lodirtybuffers;
	int		bd_dirtybufthresh;
	int		bd_lim;
	/* atomics */
	int		bd_wanted;
	int __aligned(CACHE_LINE_SIZE)	bd_numdirtybuffers;
	int __aligned(CACHE_LINE_SIZE)	bd_running;
	long __aligned(CACHE_LINE_SIZE) bd_bufspace;
	int __aligned(CACHE_LINE_SIZE)	bd_freebuffers;
} __aligned(CACHE_LINE_SIZE);

#define	BD_LOCKPTR(bd)		(&(bd)->bd_cleanq->bq_lock)
#define	BD_LOCK(bd)		mtx_lock(BD_LOCKPTR((bd)))
#define	BD_UNLOCK(bd)		mtx_unlock(BD_LOCKPTR((bd)))
#define	BD_ASSERT_LOCKED(bd)	mtx_assert(BD_LOCKPTR((bd)), MA_OWNED)
#define	BD_RUN_LOCKPTR(bd)	(&(bd)->bd_run_lock)
#define	BD_RUN_LOCK(bd)		mtx_lock(BD_RUN_LOCKPTR((bd)))
#define	BD_RUN_UNLOCK(bd)	mtx_unlock(BD_RUN_LOCKPTR((bd)))
#define	BD_DOMAIN(bd)		(bd - bdomain)

static struct buf *buf;		/* buffer header pool */
extern struct buf *swbuf;	/* Swap buffer header pool. */
caddr_t __read_mostly unmapped_buf;

/* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */
struct proc *bufdaemonproc;

static int inmem(struct vnode *vp, daddr_t blkno);
static void vm_hold_free_pages(struct buf *bp, int newbsize);
static void vm_hold_load_pages(struct buf *bp, vm_offset_t from,
		vm_offset_t to);
static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m);
static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off,
		vm_page_t m);
static void vfs_clean_pages_dirty_buf(struct buf *bp);
static void vfs_setdirty_locked_object(struct buf *bp);
static void vfs_vmio_invalidate(struct buf *bp);
static void vfs_vmio_truncate(struct buf *bp, int npages);
static void vfs_vmio_extend(struct buf *bp, int npages, int size);
static int vfs_bio_clcheck(struct vnode *vp, int size,
		daddr_t lblkno, daddr_t blkno);
static void breada(struct vnode *, daddr_t *, int *, int, struct ucred *, int,
		void (*)(struct buf *));
static int buf_flush(struct vnode *vp, struct bufdomain *, int);
static int flushbufqueues(struct vnode *, struct bufdomain *, int, int);
static void buf_daemon(void);
static __inline void bd_wakeup(void);
static int sysctl_runningspace(SYSCTL_HANDLER_ARGS);
static void bufkva_reclaim(vmem_t *, int);
static void bufkva_free(struct buf *);
static int buf_import(void *, void **, int, int, int);
static void buf_release(void *, void **, int);
static void maxbcachebuf_adjust(void);
static inline struct bufdomain *bufdomain(struct buf *);
static void bq_remove(struct bufqueue *bq, struct buf *bp);
static void bq_insert(struct bufqueue *bq, struct buf *bp, bool unlock);
static int buf_recycle(struct bufdomain *, bool kva);
static void bq_init(struct bufqueue *bq, int qindex, int cpu,
	    const char *lockname);
static void bd_init(struct bufdomain *bd);
static int bd_flushall(struct bufdomain *bd);
static int sysctl_bufdomain_long(SYSCTL_HANDLER_ARGS);
static int sysctl_bufdomain_int(SYSCTL_HANDLER_ARGS);

static int sysctl_bufspace(SYSCTL_HANDLER_ARGS);

int vmiodirenable = TRUE;
SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0,
    "Use the VM system for directory writes");
long runningbufspace;
SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0,
    "Amount of presently outstanding async buffer io");
SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD,
    NULL, 0, sysctl_bufspace, "L", "Physical memory used for buffers");
static counter_u64_t bufkvaspace;
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace,
    "Kernel virtual memory used for buffers");
static long maxbufspace;
SYSCTL_PROC(_vfs, OID_AUTO, maxbufspace,
    CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &maxbufspace,
    __offsetof(struct bufdomain, bd_maxbufspace), sysctl_bufdomain_long, "L",
    "Maximum allowed value of bufspace (including metadata)");
static long bufmallocspace;
SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0,
    "Amount of malloced memory for buffers");
static long maxbufmallocspace;
SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace,
    0, "Maximum amount of malloced memory for buffers");
static long lobufspace;
SYSCTL_PROC(_vfs, OID_AUTO, lobufspace,
    CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &lobufspace,
    __offsetof(struct bufdomain, bd_lobufspace), sysctl_bufdomain_long, "L",
    "Minimum amount of buffers we want to have");
long hibufspace;
SYSCTL_PROC(_vfs, OID_AUTO, hibufspace,
    CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &hibufspace,
    __offsetof(struct bufdomain, bd_hibufspace), sysctl_bufdomain_long, "L",
    "Maximum allowed value of bufspace (excluding metadata)");
long bufspacethresh;
SYSCTL_PROC(_vfs, OID_AUTO, bufspacethresh,
    CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &bufspacethresh,
    __offsetof(struct bufdomain, bd_bufspacethresh), sysctl_bufdomain_long, "L",
    "Bufspace consumed before waking the daemon to free some");
static counter_u64_t buffreekvacnt;
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt,
    "Number of times we have freed the KVA space from some buffer");
static counter_u64_t bufdefragcnt;
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt,
    "Number of times we have had to repeat buffer allocation to defragment");
static long lorunningspace;
SYSCTL_PROC(_vfs, OID_AUTO, lorunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE |
    CTLFLAG_RW, &lorunningspace, 0, sysctl_runningspace, "L",
    "Minimum preferred space used for in-progress I/O");
static long hirunningspace;
SYSCTL_PROC(_vfs, OID_AUTO, hirunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE |
    CTLFLAG_RW, &hirunningspace, 0, sysctl_runningspace, "L",
    "Maximum amount of space to use for in-progress I/O");
int dirtybufferflushes;
SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes,
    0, "Number of bdwrite to bawrite conversions to limit dirty buffers");
int bdwriteskip;
SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip,
    0, "Number of buffers supplied to bdwrite with snapshot deadlock risk");
int altbufferflushes;
SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW, &altbufferflushes,
    0, "Number of fsync flushes to limit dirty buffers");
static int recursiveflushes;
SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW, &recursiveflushes,
    0, "Number of flushes skipped due to being recursive");
static int sysctl_numdirtybuffers(SYSCTL_HANDLER_ARGS);
SYSCTL_PROC(_vfs, OID_AUTO, numdirtybuffers,
    CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RD, NULL, 0, sysctl_numdirtybuffers, "I",
    "Number of buffers that are dirty (has unwritten changes) at the moment");
static int lodirtybuffers;
SYSCTL_PROC(_vfs, OID_AUTO, lodirtybuffers,
    CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &lodirtybuffers,
    __offsetof(struct bufdomain, bd_lodirtybuffers), sysctl_bufdomain_int, "I",
    "How many buffers we want to have free before bufdaemon can sleep");
static int hidirtybuffers;
SYSCTL_PROC(_vfs, OID_AUTO, hidirtybuffers,
    CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &hidirtybuffers,
    __offsetof(struct bufdomain, bd_hidirtybuffers), sysctl_bufdomain_int, "I",
    "When the number of dirty buffers is considered severe");
int dirtybufthresh;
SYSCTL_PROC(_vfs, OID_AUTO, dirtybufthresh,
    CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &dirtybufthresh,
    __offsetof(struct bufdomain, bd_dirtybufthresh), sysctl_bufdomain_int, "I",
    "Number of bdwrite to bawrite conversions to clear dirty buffers");
static int numfreebuffers;
SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0,
    "Number of free buffers");
static int lofreebuffers;
SYSCTL_PROC(_vfs, OID_AUTO, lofreebuffers,
    CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &lofreebuffers,
    __offsetof(struct bufdomain, bd_lofreebuffers), sysctl_bufdomain_int, "I",
   "Target number of free buffers");
static int hifreebuffers;
SYSCTL_PROC(_vfs, OID_AUTO, hifreebuffers,
    CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &hifreebuffers,
    __offsetof(struct bufdomain, bd_hifreebuffers), sysctl_bufdomain_int, "I",
   "Threshold for clean buffer recycling");
static counter_u64_t getnewbufcalls;
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RD,
   &getnewbufcalls, "Number of calls to getnewbuf");
static counter_u64_t getnewbufrestarts;
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RD,
    &getnewbufrestarts,
    "Number of times getnewbuf has had to restart a buffer acquisition");
static counter_u64_t mappingrestarts;
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RD,
    &mappingrestarts,
    "Number of times getblk has had to restart a buffer mapping for "
    "unmapped buffer");
static counter_u64_t numbufallocfails;
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW,
    &numbufallocfails, "Number of times buffer allocations failed");
static int flushbufqtarget = 100;
SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0,
    "Amount of work to do in flushbufqueues when helping bufdaemon");
static counter_u64_t notbufdflushes;
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, &notbufdflushes,
    "Number of dirty buffer flushes done by the bufdaemon helpers");
static long barrierwrites;
SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0,
    "Number of barrier writes");
SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD,
    &unmapped_buf_allowed, 0,
    "Permit the use of the unmapped i/o");
int maxbcachebuf = MAXBCACHEBUF;
SYSCTL_INT(_vfs, OID_AUTO, maxbcachebuf, CTLFLAG_RDTUN, &maxbcachebuf, 0,
    "Maximum size of a buffer cache block");

/*
 * This lock synchronizes access to bd_request.
 */
static struct mtx_padalign __exclusive_cache_line bdlock;

/*
 * This lock protects the runningbufreq and synchronizes runningbufwakeup and
 * waitrunningbufspace().
 */
static struct mtx_padalign __exclusive_cache_line rbreqlock;

/*
 * Lock that protects bdirtywait.
 */
static struct mtx_padalign __exclusive_cache_line bdirtylock;

/*
 * Wakeup point for bufdaemon, as well as indicator of whether it is already
 * active.  Set to 1 when the bufdaemon is already "on" the queue, 0 when it
 * is idling.
 */
static int bd_request;

/*
 * Request for the buf daemon to write more buffers than is indicated by
 * lodirtybuf.  This may be necessary to push out excess dependencies or
 * defragment the address space where a simple count of the number of dirty
 * buffers is insufficient to characterize the demand for flushing them.
 */
static int bd_speedupreq;

/*
 * Synchronization (sleep/wakeup) variable for active buffer space requests.
 * Set when wait starts, cleared prior to wakeup().
 * Used in runningbufwakeup() and waitrunningbufspace().
 */
static int runningbufreq;

/*
 * Synchronization for bwillwrite() waiters.
 */
static int bdirtywait;

/*
 * Definitions for the buffer free lists.
 */
#define QUEUE_NONE	0	/* on no queue */
#define QUEUE_EMPTY	1	/* empty buffer headers */
#define QUEUE_DIRTY	2	/* B_DELWRI buffers */
#define QUEUE_CLEAN	3	/* non-B_DELWRI buffers */
#define QUEUE_SENTINEL	4	/* not a queue index, but mark for sentinel */

/* Maximum number of buffer domains. */
#define	BUF_DOMAINS	8

struct bufdomainset bdlodirty;		/* Domains > lodirty */
struct bufdomainset bdhidirty;		/* Domains > hidirty */

/* Configured number of clean queues. */
static int __read_mostly buf_domains;

BITSET_DEFINE(bufdomainset, BUF_DOMAINS);
struct bufdomain __exclusive_cache_line bdomain[BUF_DOMAINS];
struct bufqueue __exclusive_cache_line bqempty;

/*
 * per-cpu empty buffer cache.
 */
uma_zone_t buf_zone;

/*
 * Single global constant for BUF_WMESG, to avoid getting multiple references.
 * buf_wmesg is referred from macros.
 */
const char *buf_wmesg = BUF_WMESG;

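/*
 *	sysctl_runningspace:
 *
 *	Handler for the lorunningspace and hirunningspace tunables.  Rejects
 *	updates that would make lorunningspace exceed hirunningspace; the
 *	check and update are performed under rbreqlock.
 */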
static int
sysctl_runningspace(SYSCTL_HANDLER_ARGS)
{
	long value;
	int error;

	value = *(long *)arg1;
	error = sysctl_handle_long(oidp, &value, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	mtx_lock(&rbreqlock);
	if (arg1 == &hirunningspace) {
		if (value < lorunningspace)
			error = EINVAL;
		else
			hirunningspace = value;
	} else {
		KASSERT(arg1 == &lorunningspace,
		    ("%s: unknown arg1", __func__));
		if (value > hirunningspace)
			error = EINVAL;
		else
			lorunningspace = value;
	}
	mtx_unlock(&rbreqlock);
	return (error);
}

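/*
 *	sysctl_bufdomain_int, sysctl_bufdomain_long:
 *
 *	Handlers for per-domain tunables.  The new value is stored in the
 *	global variable and then divided evenly among all buffer domains at
 *	the field offset passed in arg2.
 */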
static int
sysctl_bufdomain_int(SYSCTL_HANDLER_ARGS)
{
	int error;
	int value;
	int i;

	value = *(int *)arg1;
	error = sysctl_handle_int(oidp, &value, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	*(int *)arg1 = value;
	for (i = 0; i < buf_domains; i++)
		*(int *)(uintptr_t)(((uintptr_t)&bdomain[i]) + arg2) =
		    value / buf_domains;

	return (error);
}

static int
sysctl_bufdomain_long(SYSCTL_HANDLER_ARGS)
{
	long value;
	int error;
	int i;

	value = *(long *)arg1;
	error = sysctl_handle_long(oidp, &value, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	*(long *)arg1 = value;
	for (i = 0; i < buf_domains; i++)
		*(long *)(uintptr_t)(((uintptr_t)&bdomain[i]) + arg2) =
		    value / buf_domains;

	return (error);
}

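/*
 *	sysctl_bufspace:
 *
 *	Report the buffer space in use, summed across all buffer domains.
 *	The COMPAT variant also satisfies an int-sized read from older
 *	binaries.
 */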
#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
    defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
static int
sysctl_bufspace(SYSCTL_HANDLER_ARGS)
{
	long lvalue;
	int ivalue;
	int i;

	lvalue = 0;
	for (i = 0; i < buf_domains; i++)
		lvalue += bdomain[i].bd_bufspace;
	if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long))
		return (sysctl_handle_long(oidp, &lvalue, 0, req));
	if (lvalue > INT_MAX)
		/* On overflow, still write out a long to trigger ENOMEM. */
		return (sysctl_handle_long(oidp, &lvalue, 0, req));
	ivalue = lvalue;
	return (sysctl_handle_int(oidp, &ivalue, 0, req));
}
#else
static int
sysctl_bufspace(SYSCTL_HANDLER_ARGS)
{
	long lvalue;
	int i;

	lvalue = 0;
	for (i = 0; i < buf_domains; i++)
		lvalue += bdomain[i].bd_bufspace;
	return (sysctl_handle_long(oidp, &lvalue, 0, req));
}
#endif

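/*
 *	sysctl_numdirtybuffers:
 *
 *	Report the number of dirty buffers, summed across all buffer domains.
 */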
static int
sysctl_numdirtybuffers(SYSCTL_HANDLER_ARGS)
{
	int value;
	int i;

	value = 0;
	for (i = 0; i < buf_domains; i++)
		value += bdomain[i].bd_numdirtybuffers;
	return (sysctl_handle_int(oidp, &value, 0, req));
}

/*
 *	bdirtywakeup:
 *
 *	Wakeup any bwillwrite() waiters.
 */
static void
bdirtywakeup(void)
{
	mtx_lock(&bdirtylock);
	if (bdirtywait) {
		bdirtywait = 0;
		wakeup(&bdirtywait);
	}
	mtx_unlock(&bdirtylock);
}

/*
 *	bd_clear:
 *
 *	Clear a domain from the appropriate bitsets when dirtybuffers
 *	is decremented.
 */
static void
bd_clear(struct bufdomain *bd)
{

	mtx_lock(&bdirtylock);
	if (bd->bd_numdirtybuffers <= bd->bd_lodirtybuffers)
		BIT_CLR(BUF_DOMAINS, BD_DOMAIN(bd), &bdlodirty);
	if (bd->bd_numdirtybuffers <= bd->bd_hidirtybuffers)
		BIT_CLR(BUF_DOMAINS, BD_DOMAIN(bd), &bdhidirty);
	mtx_unlock(&bdirtylock);
}

/*
 *	bd_set:
 *
 *	Set a domain in the appropriate bitsets when dirtybuffers
 *	is incremented.
 */
static void
bd_set(struct bufdomain *bd)
{

	mtx_lock(&bdirtylock);
	if (bd->bd_numdirtybuffers > bd->bd_lodirtybuffers)
		BIT_SET(BUF_DOMAINS, BD_DOMAIN(bd), &bdlodirty);
	if (bd->bd_numdirtybuffers > bd->bd_hidirtybuffers)
		BIT_SET(BUF_DOMAINS, BD_DOMAIN(bd), &bdhidirty);
	mtx_unlock(&bdirtylock);
}

/*
 *	bdirtysub:
 *
 *	Decrement the numdirtybuffers count by one and wakeup any
 *	threads blocked in bwillwrite().
 */
static void
bdirtysub(struct buf *bp)
{
	struct bufdomain *bd;
	int num;

	bd = bufdomain(bp);
	num = atomic_fetchadd_int(&bd->bd_numdirtybuffers, -1);
	if (num == (bd->bd_lodirtybuffers + bd->bd_hidirtybuffers) / 2)
		bdirtywakeup();
	if (num == bd->bd_lodirtybuffers || num == bd->bd_hidirtybuffers)
		bd_clear(bd);
}

/*
 *	bdirtyadd:
 *
 *	Increment the numdirtybuffers count by one and wakeup the buf 
 *	daemon if needed.
 */
static void
bdirtyadd(struct buf *bp)
{
	struct bufdomain *bd;
	int num;

	/*
	 * Only do the wakeup once as we cross the boundary.  The
	 * buf daemon will keep running until the condition clears.
	 */
	bd = bufdomain(bp);
	num = atomic_fetchadd_int(&bd->bd_numdirtybuffers, 1);
	if (num == (bd->bd_lodirtybuffers + bd->bd_hidirtybuffers) / 2)
		bd_wakeup();
	if (num == bd->bd_lodirtybuffers || num == bd->bd_hidirtybuffers)
		bd_set(bd);
}

/*
 *	bufspace_daemon_wakeup:
 *
 *	Wakeup the daemons responsible for freeing clean bufs.
 */
static void
bufspace_daemon_wakeup(struct bufdomain *bd)
{

	/*
	 * avoid the lock if the daemon is running.
	 */
	if (atomic_fetchadd_int(&bd->bd_running, 1) == 0) {
		BD_RUN_LOCK(bd);
		atomic_store_int(&bd->bd_running, 1);
		wakeup(&bd->bd_running);
		BD_RUN_UNLOCK(bd);
	}
}

/*
 *	bufspace_daemon_wait:
 *
 *	Sleep until the domain falls below a limit or one second passes.
 */
static void
bufspace_daemon_wait(struct bufdomain *bd)
{
	/*
	 * Re-check our limits and sleep.  bd_running must be
	 * cleared prior to checking the limits to avoid missed
	 * wakeups.  The waker will adjust one of bufspace or
	 * freebuffers prior to checking bd_running.
	 */
	BD_RUN_LOCK(bd);
	atomic_store_int(&bd->bd_running, 0);
	if (bd->bd_bufspace < bd->bd_bufspacethresh &&
	    bd->bd_freebuffers > bd->bd_lofreebuffers) {
		msleep(&bd->bd_running, BD_RUN_LOCKPTR(bd), PRIBIO|PDROP,
		    "-", hz);
	} else {
		/* Avoid spurious wakeups while running. */
		atomic_store_int(&bd->bd_running, 1);
		BD_RUN_UNLOCK(bd);
	}
}

/*
 *	bufspace_adjust:
 *
 *	Adjust the reported bufspace for a KVA managed buffer, possibly
 * 	waking any waiters.
 */
static void
bufspace_adjust(struct buf *bp, int bufsize)
{
	struct bufdomain *bd;
	long space;
	int diff;

	KASSERT((bp->b_flags & B_MALLOC) == 0,
	    ("bufspace_adjust: malloc buf %p", bp));
	bd = bufdomain(bp);
	diff = bufsize - bp->b_bufsize;
	if (diff < 0) {
		atomic_subtract_long(&bd->bd_bufspace, -diff);
	} else if (diff > 0) {
		space = atomic_fetchadd_long(&bd->bd_bufspace, diff);
		/* Wake up the daemon on the transition. */
		if (space < bd->bd_bufspacethresh &&
		    space + diff >= bd->bd_bufspacethresh)
			bufspace_daemon_wakeup(bd);
	}
	bp->b_bufsize = bufsize;
}

/*
 *	bufspace_reserve:
 *
 *	Reserve bufspace before calling allocbuf().  metadata has a
 *	different space limit than data.
 */
static int
bufspace_reserve(struct bufdomain *bd, int size, bool metadata)
{
	long limit, new;
	long space;

	if (metadata)
		limit = bd->bd_maxbufspace;
	else
		limit = bd->bd_hibufspace;
	space = atomic_fetchadd_long(&bd->bd_bufspace, size);
	new = space + size;
	if (new > limit) {
		atomic_subtract_long(&bd->bd_bufspace, size);
		return (ENOSPC);
	}

	/* Wake up the daemon on the transition. */
	if (space < bd->bd_bufspacethresh && new >= bd->bd_bufspacethresh)
		bufspace_daemon_wakeup(bd);

	return (0);
}

/*
 *	bufspace_release:
 *
 *	Release reserved bufspace after bufspace_adjust() has consumed it.
 */
static void
bufspace_release(struct bufdomain *bd, int size)
{

	atomic_subtract_long(&bd->bd_bufspace, size);
}

/*
 *	bufspace_wait:
 *
 *	Wait for bufspace, acting as the buf daemon if a locked vnode is
 *	supplied.  bd_wanted must be set prior to polling for space.  The
 *	operation must be re-tried on return.
 */
static void
bufspace_wait(struct bufdomain *bd, struct vnode *vp, int gbflags,
    int slpflag, int slptimeo)
{
	struct thread *td;
	int error, fl, norunbuf;

	if ((gbflags & GB_NOWAIT_BD) != 0)
		return;

	td = curthread;
	BD_LOCK(bd);
	while (bd->bd_wanted) {
		if (vp != NULL && vp->v_type != VCHR &&
		    (td->td_pflags & TDP_BUFNEED) == 0) {
			BD_UNLOCK(bd);
			/*
			 * getblk() is called with a vnode locked, and
			 * some majority of the dirty buffers may as
			 * well belong to the vnode.  Flushing the
			 * buffers there would make a progress that
			 * cannot be achieved by the buf_daemon, that
			 * cannot lock the vnode.
			 */
			norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) |
			    (td->td_pflags & TDP_NORUNNINGBUF);

			/*
			 * Play bufdaemon.  The getnewbuf() function
			 * may be called while the thread owns lock
			 * for another dirty buffer for the same
			 * vnode, which makes it impossible to use
			 * VOP_FSYNC() there, due to the buffer lock
			 * recursion.
			 */
			td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
			fl = buf_flush(vp, bd, flushbufqtarget);
			td->td_pflags &= norunbuf;
			BD_LOCK(bd);
			if (fl != 0)
				continue;
			if (bd->bd_wanted == 0)
				break;
		}
		error = msleep(&bd->bd_wanted, BD_LOCKPTR(bd),
		    (PRIBIO + 4) | slpflag, "newbuf", slptimeo);
		if (error != 0)
			break;
	}
	BD_UNLOCK(bd);
}


/*
 *	bufspace_daemon:
 *
 *	buffer space management daemon.  Tries to maintain some marginal
 *	amount of free buffer space so that requesting processes neither
 *	block nor work to reclaim buffers.
 */
static void
bufspace_daemon(void *arg)
{
	struct bufdomain *bd;

	EVENTHANDLER_REGISTER(shutdown_pre_sync, kthread_shutdown, curthread,
	    SHUTDOWN_PRI_LAST + 100);

	bd = arg;
	for (;;) {
		kthread_suspend_check();

		/*
		 * Free buffers from the clean queue until we meet our
		 * targets.
		 *
		 * Theory of operation:  The buffer cache is most efficient
		 * when some free buffer headers and space are always
		 * available to getnewbuf().  This daemon attempts to prevent
		 * the excessive blocking and synchronization associated
		 * with shortfall.  It goes through three phases according
		 * to demand:
		 *
		 * 1)	The daemon wakes up voluntarily once per-second
		 *	during idle periods when the counters are below
		 *	the wakeup thresholds (bufspacethresh, lofreebuffers).
		 *
		 * 2)	The daemon wakes up as we cross the thresholds
		 *	ahead of any potential blocking.  This may bounce
		 *	slightly according to the rate of consumption and
		 *	release.
		 *
		 * 3)	The daemon and consumers are starved for working
		 *	clean buffers.  This is the 'bufspace' sleep below
		 *	which will inefficiently trade bufs with bqrelse
		 *	until we return to condition 2.
		 */
		while (bd->bd_bufspace > bd->bd_lobufspace ||
		    bd->bd_freebuffers < bd->bd_hifreebuffers) {
			if (buf_recycle(bd, false) != 0) {
				if (bd_flushall(bd))
					continue;
				/*
				 * Speedup dirty if we've run out of clean
				 * buffers.  This is possible in particular
				 * because softdep may hold many bufs locked
				 * pending writes to other bufs which are
				 * marked for delayed write, exhausting
				 * clean space until they are written.
				 */
				bd_speedup();
				BD_LOCK(bd);
				if (bd->bd_wanted) {
					msleep(&bd->bd_wanted, BD_LOCKPTR(bd),
					    PRIBIO|PDROP, "bufspace", hz/10);
				} else
					BD_UNLOCK(bd);
			}
			maybe_yield();
		}
		bufspace_daemon_wait(bd);
	}
}

/*
 *	bufmallocadjust:
 *
 *	Adjust the reported bufspace for a malloc managed buffer, possibly
 *	waking any waiters.
 */
static void
bufmallocadjust(struct buf *bp, int bufsize)
{
	int diff;

	KASSERT((bp->b_flags & B_MALLOC) != 0,
	    ("bufmallocadjust: non-malloc buf %p", bp));
	diff = bufsize - bp->b_bufsize;
	if (diff < 0)
		atomic_subtract_long(&bufmallocspace, -diff);
	else
		atomic_add_long(&bufmallocspace, diff);
	bp->b_bufsize = bufsize;
}

/*
 *	runningwakeup:
 *
 *	Wake up processes that are waiting on asynchronous writes to fall
 *	below lorunningspace.
 */
static void
runningwakeup(void)
{

	mtx_lock(&rbreqlock);
	if (runningbufreq) {
		runningbufreq = 0;
		wakeup(&runningbufreq);
	}
	mtx_unlock(&rbreqlock);
}

/*
 *	runningbufwakeup:
 *
 *	Decrement the outstanding write count accordingly.
 */
void
runningbufwakeup(struct buf *bp)
{
	long space, bspace;

	bspace = bp->b_runningbufspace;
	if (bspace == 0)
		return;
	space = atomic_fetchadd_long(&runningbufspace, -bspace);
	KASSERT(space >= bspace, ("runningbufspace underflow %ld %ld",
	    space, bspace));
	bp->b_runningbufspace = 0;
	/*
	 * Only acquire the lock and wakeup on the transition from exceeding
	 * the threshold to falling below it.
	 */
	if (space < lorunningspace)
		return;
	if (space - bspace > lorunningspace)
		return;
	runningwakeup();
}

/*
 *	waitrunningbufspace()
 *
 *	runningbufspace is a measure of the amount of I/O currently
 *	running.  This routine is used in async-write situations to
 *	prevent creating huge backups of pending writes to a device.
 *	Only asynchronous writes are governed by this function.
 *
 *	This does NOT turn an async write into a sync write.  It waits  
 *	for earlier writes to complete and generally returns before the
 *	caller's write has reached the device.
 */
void
waitrunningbufspace(void)
{

	mtx_lock(&rbreqlock);
	while (runningbufspace > hirunningspace) {
		runningbufreq = 1;
		msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0);
	}
	mtx_unlock(&rbreqlock);
}


/*
 *	vfs_buf_test_cache:
 *
 *	Called when a buffer is extended.  This function clears the B_CACHE
 *	bit if the newly extended portion of the buffer does not contain
 *	valid data.
 */
static __inline void
vfs_buf_test_cache(struct buf *bp, vm_ooffset_t foff, vm_offset_t off,
    vm_offset_t size, vm_page_t m)
{

	VM_OBJECT_ASSERT_LOCKED(m->object);
	if (bp->b_flags & B_CACHE) {
		int base = (foff + off) & PAGE_MASK;
		if (vm_page_is_valid(m, base, size) == 0)
			bp->b_flags &= ~B_CACHE;
	}
}

/* Wake up the buffer daemon if necessary */
static void
bd_wakeup(void)
{

	mtx_lock(&bdlock);
	if (bd_request == 0) {
		bd_request = 1;
		wakeup(&bd_request);
	}
	mtx_unlock(&bdlock);
}

/*
 * Adjust the maxbcachebuf tunable.
 */
static void
maxbcachebuf_adjust(void)
{
	int i;

	/*
	 * maxbcachebuf must be a power of 2 >= MAXBSIZE.
	 */
	i = 2;
	while (i * 2 <= maxbcachebuf)
		i *= 2;
	maxbcachebuf = i;
	if (maxbcachebuf < MAXBSIZE)
		maxbcachebuf = MAXBSIZE;
	if (maxbcachebuf > MAXPHYS)
		maxbcachebuf = MAXPHYS;
	if (bootverbose != 0 && maxbcachebuf != MAXBCACHEBUF)
		printf("maxbcachebuf=%d\n", maxbcachebuf);
}