HardenedBSD src tree https://hardenedbsd.org/
  1. /*-
  2. * SPDX-License-Identifier: BSD-4-Clause
  3. *
  4. * Copyright (c) 1991 Regents of the University of California.
  5. * All rights reserved.
  6. * Copyright (c) 1994 John S. Dyson
  7. * All rights reserved.
  8. * Copyright (c) 1994 David Greenman
  9. * All rights reserved.
  10. * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
  11. * All rights reserved.
  12. *
  13. * This code is derived from software contributed to Berkeley by
  14. * the Systems Programming Group of the University of Utah Computer
  15. * Science Department and William Jolitz of UUNET Technologies Inc.
  16. *
  17. * Redistribution and use in source and binary forms, with or without
  18. * modification, are permitted provided that the following conditions
  19. * are met:
  20. * 1. Redistributions of source code must retain the above copyright
  21. * notice, this list of conditions and the following disclaimer.
  22. * 2. Redistributions in binary form must reproduce the above copyright
  23. * notice, this list of conditions and the following disclaimer in the
  24. * documentation and/or other materials provided with the distribution.
  25. * 3. All advertising materials mentioning features or use of this software
  26. * must display the following acknowledgement:
  27. * This product includes software developed by the University of
  28. * California, Berkeley and its contributors.
  29. * 4. Neither the name of the University nor the names of its contributors
  30. * may be used to endorse or promote products derived from this software
  31. * without specific prior written permission.
  32. *
  33. * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  34. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  35. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  36. * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  37. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  38. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  39. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  40. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  41. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  42. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  43. * SUCH DAMAGE.
  44. *
  45. * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91
  46. */
  47. /*-
  48. * Copyright (c) 2003 Networks Associates Technology, Inc.
  49. * All rights reserved.
  50. * Copyright (c) 2018 The FreeBSD Foundation
  51. * All rights reserved.
  52. *
  53. * This software was developed for the FreeBSD Project by Jake Burkholder,
  54. * Safeport Network Services, and Network Associates Laboratories, the
  55. * Security Research Division of Network Associates, Inc. under
  56. * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
  57. * CHATS research program.
  58. *
  59. * Portions of this software were developed by
  60. * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
  61. * the FreeBSD Foundation.
  62. *
  63. * Redistribution and use in source and binary forms, with or without
  64. * modification, are permitted provided that the following conditions
  65. * are met:
  66. * 1. Redistributions of source code must retain the above copyright
  67. * notice, this list of conditions and the following disclaimer.
  68. * 2. Redistributions in binary form must reproduce the above copyright
  69. * notice, this list of conditions and the following disclaimer in the
  70. * documentation and/or other materials provided with the distribution.
  71. *
  72. * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  73. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  74. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  75. * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  76. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  77. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  78. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  79. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  80. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  81. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  82. * SUCH DAMAGE.
  83. */
  84. #include <sys/cdefs.h>
  85. __FBSDID("$FreeBSD$");
  86. /*
  87. * Manages physical address maps.
  88. *
  89. * Since the information managed by this module is
  90. * also stored by the logical address mapping module,
  91. * this module may throw away valid virtual-to-physical
  92. * mappings at almost any time. However, invalidations
  93. * of virtual-to-physical mappings must be done as
  94. * requested.
  95. *
  96. * In order to cope with hardware architectures which
  97. * make virtual-to-physical map invalidates expensive,
  98. * this module may delay invalidation or protection-reduction
  99. * operations until such time as they are actually
  100. * necessary. This module is given full information as
  101. * to which processors are currently using which maps,
  102. * and to when physical maps must be made correct.
  103. */
  104. #include "opt_apic.h"
  105. #include "opt_cpu.h"
  106. #include "opt_pmap.h"
  107. #include "opt_smp.h"
  108. #include "opt_vm.h"
  109. #include <sys/param.h>
  110. #include <sys/systm.h>
  111. #include <sys/kernel.h>
  112. #include <sys/ktr.h>
  113. #include <sys/lock.h>
  114. #include <sys/malloc.h>
  115. #include <sys/mman.h>
  116. #include <sys/msgbuf.h>
  117. #include <sys/mutex.h>
  118. #include <sys/proc.h>
  119. #include <sys/rwlock.h>
  120. #include <sys/sf_buf.h>
  121. #include <sys/sx.h>
  122. #include <sys/vmmeter.h>
  123. #include <sys/sched.h>
  124. #include <sys/sysctl.h>
  125. #include <sys/smp.h>
  126. #include <sys/vmem.h>
  127. #include <vm/vm.h>
  128. #include <vm/vm_param.h>
  129. #include <vm/vm_kern.h>
  130. #include <vm/vm_page.h>
  131. #include <vm/vm_map.h>
  132. #include <vm/vm_object.h>
  133. #include <vm/vm_extern.h>
  134. #include <vm/vm_pageout.h>
  135. #include <vm/vm_pager.h>
  136. #include <vm/vm_phys.h>
  137. #include <vm/vm_radix.h>
  138. #include <vm/vm_reserv.h>
  139. #include <vm/uma.h>
  140. #ifdef DEV_APIC
  141. #include <sys/bus.h>
  142. #include <machine/intr_machdep.h>
  143. #include <x86/apicvar.h>
  144. #endif
  145. #include <x86/ifunc.h>
  146. #include <machine/bootinfo.h>
  147. #include <machine/cpu.h>
  148. #include <machine/cputypes.h>
  149. #include <machine/md_var.h>
  150. #include <machine/pcb.h>
  151. #include <machine/specialreg.h>
  152. #ifdef SMP
  153. #include <machine/smp.h>
  154. #endif
  155. #ifndef PMAP_SHPGPERPROC
  156. #define PMAP_SHPGPERPROC 200
  157. #endif
  158. #if !defined(DIAGNOSTIC)
  159. #ifdef __GNUC_GNU_INLINE__
  160. #define PMAP_INLINE __attribute__((__gnu_inline__)) inline
  161. #else
  162. #define PMAP_INLINE extern inline
  163. #endif
  164. #else
  165. #define PMAP_INLINE
  166. #endif
  167. #ifdef PV_STATS
  168. #define PV_STAT(x) do { x ; } while (0)
  169. #else
  170. #define PV_STAT(x) do { } while (0)
  171. #endif
  172. #define pa_index(pa) ((pa) >> PDRSHIFT)
  173. #define pa_to_pvh(pa) (&pv_table[pa_index(pa)])
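/*
 * pa_index() converts a physical address to its 2/4MB superpage frame
 * number; pa_to_pvh() returns the matching entry of pv_table, which is
 * sized in pmap_init() with one md_page per potential superpage.
 */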
  174. /*
  175. * Get PDEs and PTEs for user/kernel address space
  176. */
  177. #define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
  178. #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])
  179. #define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0)
  180. #define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0)
  181. #define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0)
  182. #define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0)
  183. #define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0)
  184. #define pmap_pte_set_w(pte, v) ((v) ? atomic_set_int((u_int *)(pte), PG_W) : \
  185. atomic_clear_int((u_int *)(pte), PG_W))
  186. #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v)))
  187. struct pmap kernel_pmap_store;
  188. vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */
  189. vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */
  190. static int pgeflag = 0; /* PG_G or-in */
  191. static int pseflag = 0; /* PG_PS or-in */
  192. static int nkpt = NKPT;
  193. vm_offset_t kernel_vm_end = /* 0 + */ NKPT * NBPDR;
  194. #if defined(PAE) || defined(PAE_TABLES)
  195. pt_entry_t pg_nx;
  196. static uma_zone_t pdptzone;
  197. #endif
  198. static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
  199. static int pat_works = 1;
  200. SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 0,
  201. "Is page attribute table fully functional?");
  202. static int pg_ps_enabled = 1;
  203. SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
  204. &pg_ps_enabled, 0, "Are large page mappings enabled?");
  205. #define PAT_INDEX_SIZE 8
  206. static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */
  207. /*
  208. * pmap_mapdev() support before pmap_init() has run (e.g., the console)
  209. */
  210. #define PMAP_PREINIT_MAPPING_COUNT 8
  211. static struct pmap_preinit_mapping {
  212. vm_paddr_t pa;
  213. vm_offset_t va;
  214. vm_size_t sz;
  215. int mode;
  216. } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
  217. static int pmap_initialized;
  218. static struct rwlock_padalign pvh_global_lock;
  219. /*
  220. * Data for the pv entry allocation mechanism
  221. */
  222. static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
  223. static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
  224. static struct md_page *pv_table;
  225. static int shpgperproc = PMAP_SHPGPERPROC;
  226. struct pv_chunk *pv_chunkbase; /* KVA block for pv_chunks */
  227. int pv_maxchunks; /* How many chunks we have KVA for */
  228. vm_offset_t pv_vafree; /* freelist stored in the PTE */
  229. /*
  230. * All those kernel PT submaps that BSD is so fond of
  231. */
  232. pt_entry_t *CMAP3;
  233. static pd_entry_t *KPTD;
  234. caddr_t ptvmmap = 0;
  235. caddr_t CADDR3;
  236. /*
  237. * Crashdump maps.
  238. */
  239. static caddr_t crashdumpmap;
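/*
 * PMAP1/PADDR1, PMAP2/PADDR2 and PMAP3/PADDR3 are reserved kernel PTE
 * slots and the virtual addresses they map.  They are used to map page
 * table pages of non-current pmaps temporarily: PMAP1 by
 * pmap_pte_quick(), PMAP2 by pmap_pte() (under PMAP2mutex), and PMAP3 by
 * pmap_pte_quick3().
 */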
  240. static pt_entry_t *PMAP1 = NULL, *PMAP2, *PMAP3;
  241. static pt_entry_t *PADDR1 = NULL, *PADDR2, *PADDR3;
  242. #ifdef SMP
  243. static int PMAP1cpu, PMAP3cpu;
  244. static int PMAP1changedcpu;
  245. SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD,
  246. &PMAP1changedcpu, 0,
  247. "Number of times pmap_pte_quick changed CPU with same PMAP1");
  248. #endif
  249. static int PMAP1changed;
  250. SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD,
  251. &PMAP1changed, 0,
  252. "Number of times pmap_pte_quick changed PMAP1");
  253. static int PMAP1unchanged;
  254. SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD,
  255. &PMAP1unchanged, 0,
  256. "Number of times pmap_pte_quick didn't change PMAP1");
  257. static struct mtx PMAP2mutex;
  258. int pti;
  259. /*
  260. * Internal flags for pmap_enter()'s helper functions.
  261. */
  262. #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */
  263. #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */
  264. /*
  265. * Internal flags for pmap_mapdev_internal().
  266. */
  267. #define MAPDEV_SETATTR 0x0000001 /* Modify existing attrs. */
  268. static void free_pv_chunk(struct pv_chunk *pc);
  269. static void free_pv_entry(pmap_t pmap, pv_entry_t pv);
  270. static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try);
  271. static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
  272. static bool pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde,
  273. u_int flags);
  274. #if VM_NRESERVLEVEL > 0
  275. static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
  276. #endif
  277. static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
  278. static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
  279. vm_offset_t va);
  280. static int pmap_pvh_wired_mappings(struct md_page *pvh, int count);
  281. static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
  282. static bool pmap_enter_4mpage(pmap_t pmap, vm_offset_t va, vm_page_t m,
  283. vm_prot_t prot);
  284. static int pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde,
  285. u_int flags, vm_page_t m);
  286. static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
  287. vm_page_t m, vm_prot_t prot, vm_page_t mpte);
  288. static void pmap_flush_page(vm_page_t m);
  289. static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted);
  290. static void pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva,
  291. vm_offset_t eva);
  292. static void pmap_invalidate_cache_range_all(vm_offset_t sva,
  293. vm_offset_t eva);
  294. static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va,
  295. pd_entry_t pde);
  296. static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
  297. static boolean_t pmap_is_modified_pvh(struct md_page *pvh);
  298. static boolean_t pmap_is_referenced_pvh(struct md_page *pvh);
  299. static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
  300. static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde);
  301. static void pmap_pde_attr(pd_entry_t *pde, int cache_bits);
  302. #if VM_NRESERVLEVEL > 0
  303. static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
  304. #endif
  305. static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
  306. vm_prot_t prot);
  307. static void pmap_pte_attr(pt_entry_t *pte, int cache_bits);
  308. static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
  309. struct spglist *free);
  310. static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
  311. struct spglist *free);
  312. static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
  313. static void pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free);
  314. static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
  315. struct spglist *free);
  316. static void pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va);
  317. static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
  318. static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
  319. vm_page_t m);
  320. static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
  321. pd_entry_t newpde);
  322. static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde);
  323. static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags);
  324. static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags);
  325. static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free);
  326. static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va);
  327. static void pmap_pte_release(pt_entry_t *pte);
  328. static int pmap_unuse_pt(pmap_t, vm_offset_t, struct spglist *);
  329. #if defined(PAE) || defined(PAE_TABLES)
  330. static void *pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, int domain,
  331. uint8_t *flags, int wait);
  332. #endif
  333. static void pmap_init_trm(void);
  334. static __inline void pagezero(void *page);
  335. CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
  336. CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
  337. void pmap_cold(void);
  338. extern char _end[];
  339. u_long physfree; /* phys addr of next free page */
  340. u_long vm86phystk; /* PA of vm86/bios stack */
  341. u_long vm86paddr; /* address of vm86 region */
  342. int vm86pa; /* phys addr of vm86 region */
  343. u_long KERNend; /* phys addr end of kernel (just after bss) */
  344. pd_entry_t *IdlePTD; /* phys addr of kernel PTD */
  345. #if defined(PAE) || defined(PAE_TABLES)
  346. pdpt_entry_t *IdlePDPT; /* phys addr of kernel PDPT */
  347. #endif
  348. pt_entry_t *KPTmap; /* address of kernel page tables */
  349. u_long KPTphys; /* phys addr of kernel page tables */
  350. extern u_long tramp_idleptd;
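/*
 * Boot-time bump allocator: returns "cnt" physically contiguous, zeroed
 * pages taken from *physfree and advances the cursor.  The bzero() of
 * the physical address works because this runs before paging is enabled.
 */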
  351. static u_long
  352. allocpages(u_int cnt, u_long *physfree)
  353. {
  354. u_long res;
  355. res = *physfree;
  356. *physfree += PAGE_SIZE * cnt;
  357. bzero((void *)res, PAGE_SIZE * cnt);
  358. return (res);
  359. }
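/*
 * Fill boot page table entries so that "cnt" pages at virtual address
 * "va" map to physical address "pa", read/write.  The NKPT page table
 * pages at KPTphys are physically contiguous and are indexed here as one
 * flat array of PTEs.
 */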
  360. static void
  361. pmap_cold_map(u_long pa, u_long va, u_long cnt)
  362. {
  363. pt_entry_t *pt;
  364. for (pt = (pt_entry_t *)KPTphys + atop(va); cnt > 0;
  365. cnt--, pt++, va += PAGE_SIZE, pa += PAGE_SIZE)
  366. *pt = pa | PG_V | PG_RW | PG_A | PG_M;
  367. }
  368. static void
  369. pmap_cold_mapident(u_long pa, u_long cnt)
  370. {
  371. pmap_cold_map(pa, pa, cnt);
  372. }
  373. _Static_assert(2 * NBPDR == KERNBASE, "Broken double-map of zero PTD");
  374. /*
  375. * Called from locore.s before paging is enabled. Sets up the first
  376. * kernel page table. Since kernel is mapped with PA == VA, this code
  377. * does not require relocations.
  378. */
  379. void
  380. pmap_cold(void)
  381. {
  382. pt_entry_t *pt;
  383. u_long a;
  384. u_int cr3, ncr4;
  385. physfree = (u_long)&_end;
  386. if (bootinfo.bi_esymtab != 0)
  387. physfree = bootinfo.bi_esymtab;
  388. if (bootinfo.bi_kernend != 0)
  389. physfree = bootinfo.bi_kernend;
  390. physfree = roundup2(physfree, NBPDR);
  391. KERNend = physfree;
  392. /* Allocate Kernel Page Tables */
  393. KPTphys = allocpages(NKPT, &physfree);
  394. KPTmap = (pt_entry_t *)KPTphys;
  395. /* Allocate Page Table Directory */
  396. #if defined(PAE) || defined(PAE_TABLES)
  397. /* XXX only need 32 bytes (easier for now) */
  398. IdlePDPT = (pdpt_entry_t *)allocpages(1, &physfree);
  399. #endif
  400. IdlePTD = (pd_entry_t *)allocpages(NPGPTD, &physfree);
  401. /*
  402. * Allocate KSTACK. Leave a guard page between IdlePTD and
  403. * proc0kstack, to control stack overflow for thread0 and
  404. * prevent corruption of the page table. We leak the guard
  405. * physical memory due to 1:1 mappings.
  406. */
  407. allocpages(1, &physfree);
  408. proc0kstack = allocpages(TD0_KSTACK_PAGES, &physfree);
  409. /* vm86/bios stack */
  410. vm86phystk = allocpages(1, &physfree);
  411. /* pgtable + ext + IOPAGES */
  412. vm86paddr = vm86pa = allocpages(3, &physfree);
  413. /* Install page tables into PTD. Page table page 1 is wasted. */
  414. for (a = 0; a < NKPT; a++)
  415. IdlePTD[a] = (KPTphys + ptoa(a)) | PG_V | PG_RW | PG_A | PG_M;
  416. #if defined(PAE) || defined(PAE_TABLES)
  417. /* PAE install PTD pointers into PDPT */
  418. for (a = 0; a < NPGPTD; a++)
  419. IdlePDPT[a] = ((u_int)IdlePTD + ptoa(a)) | PG_V;
  420. #endif
  421. /*
  422. * Install recursive mapping for kernel page tables into
  423. * itself.
  424. */
  425. for (a = 0; a < NPGPTD; a++)
  426. IdlePTD[PTDPTDI + a] = ((u_int)IdlePTD + ptoa(a)) | PG_V |
  427. PG_RW;
  428. /*
  429. * Initialize page table pages mapping physical address zero
  430. * through the (physical) end of the kernel. Many of these
  431. * pages must be reserved, and we reserve them all and map
  432. * them linearly for convenience. We do this even if we've
  433. * enabled PSE above; we'll just switch the corresponding
  434. * kernel PDEs before we turn on paging.
  435. *
  436. * This and all other page table entries allow read and write
  437. * access for various reasons. Kernel mappings never have any
  438. * access restrictions.
  439. */
  440. pmap_cold_mapident(0, atop(NBPDR));
  441. pmap_cold_map(0, NBPDR, atop(NBPDR));
  442. pmap_cold_mapident(KERNBASE, atop(KERNend - KERNBASE));
  443. /* Map page table directory */
  444. #if defined(PAE) || defined(PAE_TABLES)
  445. pmap_cold_mapident((u_long)IdlePDPT, 1);
  446. #endif
  447. pmap_cold_mapident((u_long)IdlePTD, NPGPTD);
  448. /* Map early KPTmap. It is really pmap_cold_mapident. */
  449. pmap_cold_map(KPTphys, (u_long)KPTmap, NKPT);
  450. /* Map proc0kstack */
  451. pmap_cold_mapident(proc0kstack, TD0_KSTACK_PAGES);
  452. /* ISA hole already mapped */
  453. pmap_cold_mapident(vm86phystk, 1);
  454. pmap_cold_mapident(vm86pa, 3);
  455. /* Map page 0 into the vm86 page table */
  456. *(pt_entry_t *)vm86pa = 0 | PG_RW | PG_U | PG_A | PG_M | PG_V;
  457. /* ...likewise for the ISA hole for vm86 */
  458. for (pt = (pt_entry_t *)vm86pa + atop(ISA_HOLE_START), a = 0;
  459. a < atop(ISA_HOLE_LENGTH); a++, pt++)
  460. *pt = (ISA_HOLE_START + ptoa(a)) | PG_RW | PG_U | PG_A |
  461. PG_M | PG_V;
  462. /* Enable PSE, PGE, VME, and PAE if configured. */
  463. ncr4 = 0;
  464. if ((cpu_feature & CPUID_PSE) != 0) {
  465. ncr4 |= CR4_PSE;
  466. pseflag = PG_PS;
  467. /*
  468. * Superpage mapping of the kernel text. Existing 4k
  469. * page table pages are wasted.
  470. */
  471. for (a = KERNBASE; a < KERNend; a += NBPDR)
  472. IdlePTD[a >> PDRSHIFT] = a | PG_PS | PG_A | PG_M |
  473. PG_RW | PG_V;
  474. }
  475. if ((cpu_feature & CPUID_PGE) != 0) {
  476. ncr4 |= CR4_PGE;
  477. pgeflag = PG_G;
  478. }
  479. ncr4 |= (cpu_feature & CPUID_VME) != 0 ? CR4_VME : 0;
  480. #if defined(PAE) || defined(PAE_TABLES)
  481. ncr4 |= CR4_PAE;
  482. #endif
  483. if (ncr4 != 0)
  484. load_cr4(rcr4() | ncr4);
  485. /* Now enable paging */
  486. #if defined(PAE) || defined(PAE_TABLES)
  487. cr3 = (u_int)IdlePDPT;
  488. #else
  489. cr3 = (u_int)IdlePTD;
  490. #endif
  491. tramp_idleptd = cr3;
  492. load_cr3(cr3);
  493. load_cr0(rcr0() | CR0_PG);
  494. /*
  495. * Now running relocated at KERNBASE where the system is
  496. * linked to run.
  497. */
  498. /*
  499. * Remove the lowest part of the double mapping of low memory
  500. * to get some null pointer checks.
  501. */
  502. IdlePTD[0] = 0;
  503. load_cr3(cr3); /* invalidate TLB */
  504. }
  505. /*
  506. * Bootstrap the system enough to run with virtual memory.
  507. *
  508. * On the i386 this is called after pmap_cold() created initial
  509. * kernel page table and enabled paging, and just syncs the pmap
  510. * module with what has already been done.
  511. */
  512. void
  513. pmap_bootstrap(vm_paddr_t firstaddr)
  514. {
  515. vm_offset_t va;
  516. pt_entry_t *pte, *unused;
  517. struct pcpu *pc;
  518. u_long res;
  519. int i;
  520. res = atop(firstaddr - (vm_paddr_t)KERNLOAD);
  521. /*
  522. * Add a physical memory segment (vm_phys_seg) corresponding to the
  523. * preallocated kernel page table pages so that vm_page structures
  524. * representing these pages will be created. The vm_page structures
  525. * are required for promotion of the corresponding kernel virtual
  526. * addresses to superpage mappings.
  527. */
  528. vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt));
  529. /*
  530. * Initialize the first available kernel virtual address.
  531. * However, using "firstaddr" may waste a few pages of the
  532. * kernel virtual address space, because pmap_cold() may not
  533. * have mapped every physical page that it allocated.
  534. * Preferably, pmap_cold() would provide a first unused
  535. * virtual address in addition to "firstaddr".
  536. */
  537. virtual_avail = (vm_offset_t)firstaddr;
  538. virtual_end = VM_MAX_KERNEL_ADDRESS;
  539. /*
  540. * Initialize the kernel pmap (which is statically allocated).
  541. * Count bootstrap data as being resident in case any of this data is
  542. * later unmapped (using pmap_remove()) and freed.
  543. */
  544. PMAP_LOCK_INIT(kernel_pmap);
  545. kernel_pmap->pm_pdir = IdlePTD;
  546. #if defined(PAE) || defined(PAE_TABLES)
  547. kernel_pmap->pm_pdpt = IdlePDPT;
  548. #endif
  549. CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */
  550. kernel_pmap->pm_stats.resident_count = res;
  551. TAILQ_INIT(&kernel_pmap->pm_pvchunk);
  552. /*
  553. * Initialize the global pv list lock.
  554. */
  555. rw_init(&pvh_global_lock, "pmap pv global");
  556. /*
  557. * Reserve some special page table entries/VA space for temporary
  558. * mapping of pages.
  559. */
  560. #define SYSMAP(c, p, v, n) \
  561. v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
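/*
 * SYSMAP(c, p, v, n) carves "n" pages out of the KVA cursor "va": it
 * stores the current address, cast to type "c", in "v", saves a pointer
 * to the first of the corresponding page table entries in "p", and
 * advances both cursors.  E.g. SYSMAP(caddr_t, CMAP3, CADDR3, 1) below
 * reserves one page of KVA at CADDR3 whose PTE is CMAP3.
 */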
  562. va = virtual_avail;
  563. pte = vtopte(va);
  564. /*
  565. * Initialize temporary map objects on the current CPU for use
  566. * during early boot.
  567. * CMAP1/CMAP2 are used for zeroing and copying pages.
  568. * CMAP3 is used for the boot-time memory test.
  569. */
  570. pc = get_pcpu();
  571. mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF);
  572. SYSMAP(caddr_t, pc->pc_cmap_pte1, pc->pc_cmap_addr1, 1)
  573. SYSMAP(caddr_t, pc->pc_cmap_pte2, pc->pc_cmap_addr2, 1)
  574. SYSMAP(vm_offset_t, pte, pc->pc_qmap_addr, 1)
  575. SYSMAP(caddr_t, CMAP3, CADDR3, 1);
  576. /*
  577. * Crashdump maps.
  578. */
  579. SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)
  580. /*
  581. * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
  582. */
  583. SYSMAP(caddr_t, unused, ptvmmap, 1)
  584. /*
  585. * msgbufp is used to map the system message buffer.
  586. */
  587. SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(msgbufsize)))
  588. /*
  589. * KPTmap is used by pmap_kextract().
  590. *
  591. * KPTmap is first initialized by pmap_cold(). However, that initial
  592. * KPTmap can only support NKPT page table pages. Here, a larger
  593. * KPTmap is created that can support KVA_PAGES page table pages.
  594. */
  595. SYSMAP(pt_entry_t *, KPTD, KPTmap, KVA_PAGES)
  596. for (i = 0; i < NKPT; i++)
  597. KPTD[i] = (KPTphys + ptoa(i)) | PG_RW | PG_V;
  598. /*
  599. * PADDR1 and PADDR2 are used by pmap_pte_quick() and pmap_pte(),
  600. * respectively.
  601. */
  602. SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1)
  603. SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1)
  604. SYSMAP(pt_entry_t *, PMAP3, PADDR3, 1)
  605. mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF);
  606. virtual_avail = va;
  607. /*
  608. * Initialize the PAT MSR if present.
  609. * pmap_init_pat() clears and sets CR4_PGE, which, as a
  610. * side-effect, invalidates stale PG_G TLB entries that might
  611. * have been created in our pre-boot environment. We assume
  612. * that PAT support implies PGE and in reverse, PGE presence
  613. * comes with PAT. Both features were added for Pentium Pro.
  614. */
  615. pmap_init_pat();
  616. }
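/*
 * Allocate per-CPU KVA windows: two pages apiece for the non-sleepable
 * and sleepable copyout windows, one page for pmap_extract_and_hold(),
 * and, on the APs, the CMAP1/CMAP2/qmap pages that pmap_bootstrap()
 * already set up for the BSP.
 */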
  617. static void
  618. pmap_init_reserved_pages(void)
  619. {
  620. struct pcpu *pc;
  621. vm_offset_t pages;
  622. int i;
  623. CPU_FOREACH(i) {
  624. pc = pcpu_find(i);
  625. mtx_init(&pc->pc_copyout_mlock, "cpmlk", NULL, MTX_DEF |
  626. MTX_NEW);
  627. pc->pc_copyout_maddr = kva_alloc(ptoa(2));
  628. if (pc->pc_copyout_maddr == 0)
  629. panic("unable to allocate non-sleepable copyout KVA");
  630. sx_init(&pc->pc_copyout_slock, "cpslk");
  631. pc->pc_copyout_saddr = kva_alloc(ptoa(2));
  632. if (pc->pc_copyout_saddr == 0)
  633. panic("unable to allocate sleepable copyout KVA");
  634. pc->pc_pmap_eh_va = kva_alloc(ptoa(1));
  635. if (pc->pc_pmap_eh_va == 0)
  636. panic("unable to allocate pmap_extract_and_hold KVA");
  637. pc->pc_pmap_eh_ptep = (char *)vtopte(pc->pc_pmap_eh_va);
  638. /*
  639. * Skip if the mappings have already been initialized,
  640. * i.e. this is the BSP.
  641. */
  642. if (pc->pc_cmap_addr1 != 0)
  643. continue;
  644. mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF);
  645. pages = kva_alloc(PAGE_SIZE * 3);
  646. if (pages == 0)
  647. panic("unable to allocate CMAP KVA");
  648. pc->pc_cmap_pte1 = vtopte(pages);
  649. pc->pc_cmap_pte2 = vtopte(pages + PAGE_SIZE);
  650. pc->pc_cmap_addr1 = (caddr_t)pages;
  651. pc->pc_cmap_addr2 = (caddr_t)(pages + PAGE_SIZE);
  652. pc->pc_qmap_addr = pages + ptoa(2);
  653. }
  654. }
  655. SYSINIT(rpages_init, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_reserved_pages, NULL);
  656. /*
  657. * Setup the PAT MSR.
  658. */
  659. void
  660. pmap_init_pat(void)
  661. {
  662. int pat_table[PAT_INDEX_SIZE];
  663. uint64_t pat_msr;
  664. u_long cr0, cr4;
  665. int i;
  666. /* Set default PAT index table. */
  667. for (i = 0; i < PAT_INDEX_SIZE; i++)
  668. pat_table[i] = -1;
  669. pat_table[PAT_WRITE_BACK] = 0;
  670. pat_table[PAT_WRITE_THROUGH] = 1;
  671. pat_table[PAT_UNCACHEABLE] = 3;
  672. pat_table[PAT_WRITE_COMBINING] = 3;
  673. pat_table[PAT_WRITE_PROTECTED] = 3;
  674. pat_table[PAT_UNCACHED] = 3;
  675. /*
  676. * Bail if this CPU doesn't implement PAT.
  677. * We assume that PAT support implies PGE.
  678. */
  679. if ((cpu_feature & CPUID_PAT) == 0) {
  680. for (i = 0; i < PAT_INDEX_SIZE; i++)
  681. pat_index[i] = pat_table[i];
  682. pat_works = 0;
  683. return;
  684. }
  685. /*
  686. * Due to some Intel errata, we can only safely use the lower 4
  687. * PAT entries.
  688. *
  689. * Intel Pentium III Processor Specification Update
  690. * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B
  691. * or Mode C Paging)
  692. *
  693. * Intel Pentium IV Processor Specification Update
  694. * Errata N46 (PAT Index MSB May Be Calculated Incorrectly)
  695. */
  696. if (cpu_vendor_id == CPU_VENDOR_INTEL &&
  697. !(CPUID_TO_FAMILY(cpu_id) == 6 && CPUID_TO_MODEL(cpu_id) >= 0xe))
  698. pat_works = 0;
  699. /* Initialize default PAT entries. */
  700. pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
  701. PAT_VALUE(1, PAT_WRITE_THROUGH) |
  702. PAT_VALUE(2, PAT_UNCACHED) |
  703. PAT_VALUE(3, PAT_UNCACHEABLE) |
  704. PAT_VALUE(4, PAT_WRITE_BACK) |
  705. PAT_VALUE(5, PAT_WRITE_THROUGH) |
  706. PAT_VALUE(6, PAT_UNCACHED) |
  707. PAT_VALUE(7, PAT_UNCACHEABLE);
  708. if (pat_works) {
  709. /*
  710. * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
  711. * Program 5 and 6 as WP and WC.
  712. * Leave 4 and 7 as WB and UC.
  713. */
  714. pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6));
  715. pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) |
  716. PAT_VALUE(6, PAT_WRITE_COMBINING);
  717. pat_table[PAT_UNCACHED] = 2;
  718. pat_table[PAT_WRITE_PROTECTED] = 5;
  719. pat_table[PAT_WRITE_COMBINING] = 6;
  720. } else {
  721. /*
  722. * Just replace PAT Index 2 with WC instead of UC-.
  723. */
  724. pat_msr &= ~PAT_MASK(2);
  725. pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
  726. pat_table[PAT_WRITE_COMBINING] = 2;
  727. }
  728. /* Disable PGE. */
  729. cr4 = rcr4();
  730. load_cr4(cr4 & ~CR4_PGE);
  731. /* Disable caches (CD = 1, NW = 0). */
  732. cr0 = rcr0();
  733. load_cr0((cr0 & ~CR0_NW) | CR0_CD);
  734. /* Flushes caches and TLBs. */
  735. wbinvd();
  736. invltlb();
  737. /* Update PAT and index table. */
  738. wrmsr(MSR_PAT, pat_msr);
  739. for (i = 0; i < PAT_INDEX_SIZE; i++)
  740. pat_index[i] = pat_table[i];
  741. /* Flush caches and TLBs again. */
  742. wbinvd();
  743. invltlb();
  744. /* Restore caches and PGE. */
  745. load_cr0(cr0);
  746. load_cr4(cr4);
  747. }
  748. /*
  749. * Initialize a vm_page's machine-dependent fields.
  750. */
  751. void
  752. pmap_page_init(vm_page_t m)
  753. {
  754. TAILQ_INIT(&m->md.pv_list);
  755. m->md.pat_mode = PAT_WRITE_BACK;
  756. }
  757. #if defined(PAE) || defined(PAE_TABLES)
  758. static void *
  759. pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags,
  760. int wait)
  761. {
  762. /* Inform UMA that this allocator uses kernel_map/object. */
  763. *flags = UMA_SLAB_KERNEL;
  764. return ((void *)kmem_alloc_contig_domainset(DOMAINSET_FIXED(domain),
  765. bytes, wait, 0x0ULL, 0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT));
  766. }
  767. #endif
  768. /*
  769. * Abuse the pte nodes for unmapped kva to thread a kva freelist through.
  770. * Requirements:
  771. * - Must deal with pages in order to ensure that none of the PG_* bits
  772. * are ever set, PG_V in particular.
  773. * - Assumes we can write to ptes without pte_store() atomic ops, even
  774. * on PAE systems. This should be ok.
  775. * - Assumes nothing will ever test these addresses for 0 to indicate
  776. * no mapping instead of correctly checking PG_V.
  777. * - Assumes a vm_offset_t will fit in a pte (true for i386).
  778. * Because PG_V is never set, there can be no mappings to invalidate.
  779. */
  780. static vm_offset_t
  781. pmap_ptelist_alloc(vm_offset_t *head)
  782. {
  783. pt_entry_t *pte;
  784. vm_offset_t va;
  785. va = *head;
  786. if (va == 0)
  787. panic("pmap_ptelist_alloc: exhausted ptelist KVA");
  788. pte = vtopte(va);
  789. *head = *pte;
  790. if (*head & PG_V)
  791. panic("pmap_ptelist_alloc: va with PG_V set!");
  792. *pte = 0;
  793. return (va);
  794. }
  795. static void
  796. pmap_ptelist_free(vm_offset_t *head, vm_offset_t va)
  797. {
  798. pt_entry_t *pte;
  799. if (va & PG_V)
  800. panic("pmap_ptelist_free: freeing va with PG_V set!");
  801. pte = vtopte(va);
  802. *pte = *head; /* virtual! PG_V is 0 though */
  803. *head = va;
  804. }
  805. static void
  806. pmap_ptelist_init(vm_offset_t *head, void *base, int npages)
  807. {
  808. int i;
  809. vm_offset_t va;
  810. *head = 0;
  811. for (i = npages - 1; i >= 0; i--) {
  812. va = (vm_offset_t)base + i * PAGE_SIZE;
  813. pmap_ptelist_free(head, va);
  814. }
  815. }
  816. /*
  817. * Initialize the pmap module.
  818. * Called by vm_init, to initialize any structures that the pmap
  819. * system needs to map virtual memory.
  820. */
  821. void
  822. pmap_init(void)
  823. {
  824. struct pmap_preinit_mapping *ppim;
  825. vm_page_t mpte;
  826. vm_size_t s;
  827. int i, pv_npg;
  828. /*
  829. * Initialize the vm page array entries for the kernel pmap's
  830. * page table pages.
  831. */
  832. PMAP_LOCK(kernel_pmap);
  833. for (i = 0; i < NKPT; i++) {
  834. mpte = PHYS_TO_VM_PAGE(KPTphys + ptoa(i));
  835. KASSERT(mpte >= vm_page_array &&
  836. mpte < &vm_page_array[vm_page_array_size],
  837. ("pmap_init: page table page is out of range"));
  838. mpte->pindex = i + KPTDI;
  839. mpte->phys_addr = KPTphys + ptoa(i);
  840. mpte->wire_count = 1;
  841. /*
  842. * Collect the page table pages that were replaced by a 2/4MB
  843. * page. They are filled with equivalent 4KB page mappings.
  844. */
  845. if (pseflag != 0 &&
  846. KERNBASE <= i << PDRSHIFT && i << PDRSHIFT < KERNend &&
  847. pmap_insert_pt_page(kernel_pmap, mpte, true))
  848. panic("pmap_init: pmap_insert_pt_page failed");
  849. }
  850. PMAP_UNLOCK(kernel_pmap);
  851. vm_wire_add(NKPT);
  852. /*
  853. * Initialize the address space (zone) for the pv entries. Set a
  854. * high water mark so that the system can recover from excessive
  855. * numbers of pv entries.
  856. */
  857. TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
  858. pv_entry_max = shpgperproc * maxproc + vm_cnt.v_page_count;
  859. TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
  860. pv_entry_max = roundup(pv_entry_max, _NPCPV);
  861. pv_entry_high_water = 9 * (pv_entry_max / 10);
  862. /*
  863. * If the kernel is running on a virtual machine, then it must assume
  864. * that MCA is enabled by the hypervisor. Moreover, the kernel must
  865. * be prepared for the hypervisor changing the vendor and family that
  866. * are reported by CPUID. Consequently, the workaround for AMD Family
  867. * 10h Erratum 383 is enabled if the processor's feature set does not
  868. * include at least one feature that is only supported by older Intel
  869. * or newer AMD processors.
  870. */
  871. if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 &&
  872. (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI |
  873. CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP |
  874. AMDID2_FMA4)) == 0)
  875. workaround_erratum383 = 1;
  876. /*
  877. * Are large page mappings supported and enabled?
  878. */
  879. TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
  880. if (pseflag == 0)
  881. pg_ps_enabled = 0;
  882. else if (pg_ps_enabled) {
  883. KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
  884. ("pmap_init: can't assign to pagesizes[1]"));
  885. pagesizes[1] = NBPDR;
  886. }
  887. /*
  888. * Calculate the size of the pv head table for superpages.
  889. * Handle the possibility that "vm_phys_segs[...].end" is zero.
  890. */
  891. pv_npg = trunc_4mpage(vm_phys_segs[vm_phys_nsegs - 1].end -
  892. PAGE_SIZE) / NBPDR + 1;
  893. /*
  894. * Allocate memory for the pv head table for superpages.
  895. */
  896. s = (vm_size_t)(pv_npg * sizeof(struct md_page));
  897. s = round_page(s);
  898. pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO);
  899. for (i = 0; i < pv_npg; i++)
  900. TAILQ_INIT(&pv_table[i].pv_list);
  901. pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc);
  902. pv_chunkbase = (struct pv_chunk *)kva_alloc(PAGE_SIZE * pv_maxchunks);
  903. if (pv_chunkbase == NULL)
  904. panic("pmap_init: not enough kvm for pv chunks");
  905. pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks);
  906. #if defined(PAE) || defined(PAE_TABLES)
  907. pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL,
  908. NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1,
  909. UMA_ZONE_VM | UMA_ZONE_NOFREE);
  910. uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf);
  911. #endif
  912. pmap_initialized = 1;
  913. pmap_init_trm();
  914. if (!bootverbose)
  915. return;
  916. for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
  917. ppim = pmap_preinit_mapping + i;
  918. if (ppim->va == 0)
  919. continue;
  920. printf("PPIM %u: PA=%#jx, VA=%#x, size=%#x, mode=%#x\n", i,
  921. (uintmax_t)ppim->pa, ppim->va, ppim->sz, ppim->mode);
  922. }
  923. }
  924. SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0,
  925. "Max number of PV entries");
  926. SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0,
  927. "Page share factor per proc");
  928. static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
  929. "2/4MB page mapping counters");
  930. static u_long pmap_pde_demotions;
  931. SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
  932. &pmap_pde_demotions, 0, "2/4MB page demotions");
  933. static u_long pmap_pde_mappings;
  934. SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
  935. &pmap_pde_mappings, 0, "2/4MB page mappings");
  936. static u_long pmap_pde_p_failures;
  937. SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
  938. &pmap_pde_p_failures, 0, "2/4MB page promotion failures");
  939. static u_long pmap_pde_promotions;
  940. SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
  941. &pmap_pde_promotions, 0, "2/4MB page promotions");
  942. /***************************************************
  943. * Low level helper routines.....
  944. ***************************************************/
  945. boolean_t
  946. pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode)
  947. {
  948. return (mode >= 0 && mode < PAT_INDEX_SIZE &&
  949. pat_index[(int)mode] >= 0);
  950. }
  951. /*
  952. * Determine the appropriate bits to set in a PTE or PDE for a specified
  953. * caching mode.
  954. */
  955. int
  956. pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde)
  957. {
  958. int cache_bits, pat_flag, pat_idx;
  959. if (!pmap_is_valid_memattr(pmap, mode))
  960. panic("Unknown caching mode %d\n", mode);
  961. /* The PAT bit is different for PTE's and PDE's. */
  962. pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;
  963. /* Map the caching mode to a PAT index. */
  964. pat_idx = pat_index[mode];
  965. /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
  966. cache_bits = 0;
  967. if (pat_idx & 0x4)
  968. cache_bits |= pat_flag;
  969. if (pat_idx & 0x2)
  970. cache_bits |= PG_NC_PCD;
  971. if (pat_idx & 0x1)
  972. cache_bits |= PG_NC_PWT;
  973. return (cache_bits);
  974. }
  975. bool
  976. pmap_ps_enabled(pmap_t pmap __unused)
  977. {
  978. return (pg_ps_enabled);
  979. }
  980. /*
  981. * The caller is responsible for maintaining TLB consistency.
  982. */
  983. static void
  984. pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde)
  985. {
  986. pd_entry_t *pde;
  987. pde = pmap_pde(kernel_pmap, va);
  988. pde_store(pde, newpde);
  989. }
  990. /*
  991. * After changing the page size for the specified virtual address in the page
  992. * table, flush the corresponding entries from the processor's TLB. Only the
  993. * calling processor's TLB is affected.
  994. *
  995. * The calling thread must be pinned to a processor.
  996. */
  997. static void
  998. pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde)
  999. {
  1000. if ((newpde & PG_PS) == 0)
  1001. /* Demotion: flush a specific 2MB page mapping. */
  1002. invlpg(va);
  1003. else /* if ((newpde & PG_G) == 0) */
  1004. /*
  1005. * Promotion: flush every 4KB page mapping from the TLB
  1006. * because there are too many to flush individually.
  1007. */
  1008. invltlb();
  1009. }
  1010. void
  1011. invltlb_glob(void)
  1012. {
  1013. invltlb();
  1014. }
  1015. #ifdef SMP
  1016. static void
  1017. pmap_curcpu_cb_dummy(pmap_t pmap __unused, vm_offset_t addr1 __unused,
  1018. vm_offset_t addr2 __unused)
  1019. {
  1020. }
  1021. /*
  1022. * For SMP, these functions have to use the IPI mechanism for coherence.
  1023. *
  1024. * N.B.: Before calling any of the following TLB invalidation functions,
  1025. * the calling processor must ensure that all stores updating a non-
  1026. * kernel page table are globally performed. Otherwise, another
  1027. * processor could cache an old, pre-update entry without being
  1028. * invalidated. This can happen one of two ways: (1) The pmap becomes
  1029. * active on another processor after its pm_active field is checked by
  1030. * one of the following functions but before a store updating the page
  1031. * table is globally performed. (2) The pmap becomes active on another
  1032. * processor before its pm_active field is checked but due to
  1033. * speculative loads one of the following functions still reads the
  1034. * pmap as inactive on the other processor.
  1035. *
  1036. * The kernel page table is exempt because its pm_active field is
  1037. * immutable. The kernel page table is always active on every
  1038. * processor.
  1039. */
  1040. void
  1041. pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
  1042. {
  1043. cpuset_t *mask, other_cpus;
  1044. u_int cpuid;
  1045. sched_pin();
  1046. if (pmap == kernel_pmap) {
  1047. invlpg(va);
  1048. mask = &all_cpus;
  1049. } else if (!CPU_CMP(&pmap->pm_active, &all_cpus)) {
  1050. mask = &all_cpus;
  1051. } else {
  1052. cpuid = PCPU_GET(cpuid);
  1053. other_cpus = all_cpus;
  1054. CPU_CLR(cpuid, &other_cpus);
  1055. CPU_AND(&other_cpus, &pmap->pm_active);
  1056. mask = &other_cpus;
  1057. }
  1058. smp_masked_invlpg(*mask, va, pmap, pmap_curcpu_cb_dummy);
  1059. sched_unpin();
  1060. }
  1061. /* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */
  1062. #define PMAP_INVLPG_THRESHOLD (4 * 1024 * PAGE_SIZE)
  1063. void
  1064. pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
  1065. {
  1066. cpuset_t *mask, other_cpus;
  1067. vm_offset_t addr;
  1068. u_int cpuid;
  1069. if (eva - sva >= PMAP_INVLPG_THRESHOLD) {
  1070. pmap_invalidate_all(pmap);
  1071. return;
  1072. }
  1073. sched_pin();
  1074. if (pmap == kernel_pmap) {
  1075. for (addr = sva; addr < eva; addr += PAGE_SIZE)
  1076. invlpg(addr);
  1077. mask = &all_cpus;
  1078. } else if (!CPU_CMP(&pmap->pm_active, &all_cpus)) {
  1079. mask = &all_cpus;
  1080. } else {
  1081. cpuid = PCPU_GET(cpuid);
  1082. other_cpus = all_cpus;
  1083. CPU_CLR(cpuid, &other_cpus);
  1084. CPU_AND(&other_cpus, &pmap->pm_active);
  1085. mask = &other_cpus;
  1086. }
  1087. smp_masked_invlpg_range(*mask, sva, eva, pmap, pmap_curcpu_cb_dummy);
  1088. sched_unpin();
  1089. }
  1090. void
  1091. pmap_invalidate_all(pmap_t pmap)
  1092. {
  1093. cpuset_t *mask, other_cpus;
  1094. u_int cpuid;
  1095. sched_pin();
  1096. if (pmap == kernel_pmap) {
  1097. invltlb();
  1098. mask = &all_cpus;
  1099. } else if (!CPU_CMP(&pmap->pm_active, &all_cpus)) {
  1100. mask = &all_cpus;
  1101. } else {
  1102. cpuid = PCPU_GET(cpuid);
  1103. other_cpus = all_cpus;
  1104. CPU_CLR(cpuid, &other_cpus);
  1105. CPU_AND(&other_cpus, &pmap->pm_active);
  1106. mask = &other_cpus;
  1107. }
  1108. smp_masked_invltlb(*mask, pmap, pmap_curcpu_cb_dummy);
  1109. sched_unpin();
  1110. }
  1111. static void
  1112. pmap_invalidate_cache_curcpu_cb(pmap_t pmap __unused,
  1113. vm_offset_t addr1 __unused, vm_offset_t addr2 __unused)
  1114. {
  1115. wbinvd();
  1116. }
  1117. void
  1118. pmap_invalidate_cache(void)
  1119. {
  1120. smp_cache_flush(pmap_invalidate_cache_curcpu_cb);
  1121. }
  1122. struct pde_action {
  1123. cpuset_t invalidate; /* processors that invalidate their TLB */
  1124. vm_offset_t va;
  1125. pd_entry_t *pde;
  1126. pd_entry_t newpde;
  1127. u_int store; /* processor that updates the PDE */
  1128. };
  1129. static void
  1130. pmap_update_pde_kernel(void *arg)
  1131. {
  1132. struct pde_action *act = arg;
  1133. pd_entry_t *pde;
  1134. if (act->store == PCPU_GET(cpuid)) {
  1135. pde = pmap_pde(kernel_pmap, act->va);
  1136. pde_store(pde, act->newpde);
  1137. }
  1138. }
  1139. static void
  1140. pmap_update_pde_user(void *arg)
  1141. {
  1142. struct pde_action *act = arg;
  1143. if (act->store == PCPU_GET(cpuid))
  1144. pde_store(act->pde, act->newpde);
  1145. }
  1146. static void
  1147. pmap_update_pde_teardown(void *arg)
  1148. {
  1149. struct pde_action *act = arg;
  1150. if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
  1151. pmap_update_pde_invalidate(act->va, act->newpde);
  1152. }
  1153. /*
  1154. * Change the page size for the specified virtual address in a way that
  1155. * prevents any possibility of the TLB ever having two entries that map the
  1156. * same virtual address using different page sizes. This is the recommended
  1157. * workaround for Erratum 383 on AMD Family 10h processors. It prevents a
  1158. * machine check exception for a TLB state that is improperly diagnosed as a
  1159. * hardware error.
  1160. */
  1161. static void
  1162. pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
  1163. {
  1164. struct pde_action act;
  1165. cpuset_t active, other_cpus;
  1166. u_int cpuid;
  1167. sched_pin();
  1168. cpuid = PCPU_GET(cpuid);
  1169. other_cpus = all_cpus;
  1170. CPU_CLR(cpuid, &other_cpus);
  1171. if (pmap == kernel_pmap)
  1172. active = all_cpus;
  1173. else
  1174. active = pmap->pm_active;
  1175. if (CPU_OVERLAP(&active, &other_cpus)) {
  1176. act.store = cpuid;
  1177. act.invalidate = active;
  1178. act.va = va;
  1179. act.pde = pde;
  1180. act.newpde = newpde;
  1181. CPU_SET(cpuid, &active);
  1182. smp_rendezvous_cpus(active,
  1183. smp_no_rendezvous_barrier, pmap == kernel_pmap ?
  1184. pmap_update_pde_kernel : pmap_update_pde_user,
  1185. pmap_update_pde_teardown, &act);
  1186. } else {
  1187. if (pmap == kernel_pmap)
  1188. pmap_kenter_pde(va, newpde);
  1189. else
  1190. pde_store(pde, newpde);
  1191. if (CPU_ISSET(cpuid, &active))
  1192. pmap_update_pde_invalidate(va, newpde);
  1193. }
  1194. sched_unpin();
  1195. }
  1196. #else /* !SMP */
  1197. /*
  1198. * Normal, non-SMP, 486+ invalidation functions.
  1199. * We inline these within pmap.c for speed.
  1200. */
  1201. PMAP_INLINE void
  1202. pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
  1203. {
  1204. if (pmap == kernel_pmap)
  1205. invlpg(va);
  1206. }
  1207. PMAP_INLINE void
  1208. pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
  1209. {
  1210. vm_offset_t addr;
  1211. if (pmap == kernel_pmap)
  1212. for (addr = sva; addr < eva; addr += PAGE_SIZE)
  1213. invlpg(addr);
  1214. }
  1215. PMAP_INLINE void
  1216. pmap_invalidate_all(pmap_t pmap)
  1217. {
  1218. if (pmap == kernel_pmap)
  1219. invltlb();
  1220. }
  1221. PMAP_INLINE void
  1222. pmap_invalidate_cache(void)
  1223. {
  1224. wbinvd();
  1225. }
  1226. static void
  1227. pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
  1228. {
  1229. if (pmap == kernel_pmap)
  1230. pmap_kenter_pde(va, newpde);
  1231. else
  1232. pde_store(pde, newpde);
  1233. if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
  1234. pmap_update_pde_invalidate(va, newpde);
  1235. }
  1236. #endif /* !SMP */
  1237. static void
  1238. pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde)
  1239. {
  1240. /*
  1241. * When the PDE has PG_PROMOTED set, the 2- or 4MB page mapping was
  1242. * created by a promotion that did not invalidate the 512 or 1024 4KB
  1243. * page mappings that might exist in the TLB. Consequently, at this
  1244. * point, the TLB may hold both 4KB and 2- or 4MB page mappings for
  1245. * the address range [va, va + NBPDR). Therefore, the entire range
  1246. * must be invalidated here. In contrast, when PG_PROMOTED is clear,
  1247. * the TLB will not hold any 4KB page mappings for the address range
  1248. * [va, va + NBPDR), and so a single INVLPG suffices to invalidate the
  1249. * 2- or 4MB page mapping from the TLB.
  1250. */
  1251. if ((pde & PG_PROMOTED) != 0)
  1252. pmap_invalidate_range(pmap, va, va + NBPDR - 1);
  1253. else
  1254. pmap_invalidate_page(pmap, va);
  1255. }
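/*
 * Resolver for pmap_invalidate_cache_range(): self-snooping CPUs
 * (CPUID_SS) need no explicit flush beyond the alignment check, CPUs
 * with CLFLUSH flush line by line, and everything else falls back to a
 * full cache invalidation.
 */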
  1256. DEFINE_IFUNC(, void, pmap_invalidate_cache_range, (vm_offset_t, vm_offset_t),
  1257. static)
  1258. {
  1259. if ((cpu_feature & CPUID_SS) != 0)
  1260. return (pmap_invalidate_cache_range_selfsnoop);
  1261. if ((cpu_feature & CPUID_CLFSH) != 0)
  1262. return (pmap_force_invalidate_cache_range);
  1263. return (pmap_invalidate_cache_range_all);
  1264. }
  1265. #define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024)
  1266. static void
  1267. pmap_invalidate_cache_range_check_align(vm_offset_t sva, vm_offset_t eva)
  1268. {
  1269. KASSERT((sva & PAGE_MASK) == 0,
  1270. ("pmap_invalidate_cache_range: sva not page-aligned"));
  1271. KASSERT((eva & PAGE_MASK) == 0,
  1272. ("pmap_invalidate_cache_range: eva not page-aligned"));
  1273. }
  1274. static void
  1275. pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, vm_offset_t eva)
  1276. {
  1277. pmap_invalidate_cache_range_check_align(sva, eva);
  1278. }
  1279. void
  1280. pmap_force_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
  1281. {
  1282. sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1);
  1283. if (eva - sva >= PMAP_CLFLUSH_THRESHOLD) {
  1284. /*
  1285. * The supplied range is bigger than 2MB.
  1286. * Globally invalidate cache.
  1287. */
  1288. pmap_invalidate_cache();
  1289. return;
  1290. }
  1291. #ifdef DEV_APIC
  1292. /*
  1293. * XXX: Some CPUs fault, hang, or trash the local APIC
  1294. * registers if we use CLFLUSH on the local APIC
  1295. * range. The local APIC is always uncached, so we
  1296. * don't need to flush for that range anyway.
  1297. */
  1298. if (pmap_kextract(sva) == lapic_paddr)
  1299. return;
  1300. #endif
  1301. if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) {
  1302. /*
  1303. * Do per-cache line flush. Use the sfence
  1304. * instruction to ensure that previous stores are
  1305. * included in the write-back. The processor
  1306. * propagates flush to other processors in the cache
  1307. * coherence domain.
  1308. */
  1309. sfence();
  1310. for (; sva < eva; sva += cpu_clflush_line_size)
  1311. clflushopt(sva);
  1312. sfence();
  1313. } else {
  1314. /*
  1315. * Writes are ordered by CLFLUSH on Intel CPUs.
  1316. */
  1317. if (cpu_vendor_id != CPU_VENDOR_INTEL)
  1318. mfence();
  1319. for (; sva < eva; sva += cpu_clflush_line_size)
  1320. clflush(sva);
  1321. if (cpu_vendor_id != CPU_VENDOR_INTEL)
  1322. mfence();
  1323. }
  1324. }
  1325. static void
  1326. pmap_invalidate_cache_range_all(vm_offset_t sva, vm_offset_t eva)
  1327. {
  1328. pmap_invalidate_cache_range_check_align(sva, eva);
  1329. pmap_invalidate_cache();
  1330. }
  1331. void
  1332. pmap_invalidate_cache_pages(vm_page_t *pages, int count)
  1333. {
  1334. int i;
  1335. if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
  1336. (cpu_feature & CPUID_CLFSH) == 0) {
  1337. pmap_invalidate_cache();
  1338. } else {
  1339. for (i = 0; i < count; i++)
  1340. pmap_flush_page(pages[i]);
  1341. }
  1342. }
  1343. /*
  1344. * Are we current address space or kernel?
  1345. */
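/*
 * With the 4/4 address-space split the kernel always runs on its own
 * page tables, so a user pmap is never the current one while pmap code
 * executes; only the kernel pmap qualifies.
 */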
  1346. static __inline int
  1347. pmap_is_current(pmap_t pmap)
  1348. {
  1349. return (pmap == kernel_pmap);
  1350. }
  1351. /*
  1352. * If the given pmap is not the current or kernel pmap, the returned pte must
  1353. * be released by passing it to pmap_pte_release().
  1354. */
  1355. pt_entry_t *
  1356. pmap_pte(pmap_t pmap, vm_offset_t va)
  1357. {
  1358. pd_entry_t newpf;
  1359. pd_entry_t *pde;
  1360. pde = pmap_pde(pmap, va);
  1361. if (*pde & PG_PS)
  1362. return (pde);
  1363. if (*pde != 0) {
  1364. /* are we current address space or kernel? */
  1365. if (pmap_is_current(pmap))
  1366. return (vtopte(va));
  1367. mtx_lock(&PMAP2mutex);
  1368. newpf = *pde & PG_FRAME;
  1369. if ((*PMAP2 & PG_FRAME) != newpf) {
  1370. *PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M;
  1371. pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
  1372. }
  1373. return (PADDR2 + (i386_btop(va) & (NPTEPG - 1)));
  1374. }
  1375. return (NULL);
  1376. }
  1377. /*
  1378. * Releases a pte that was obtained from pmap_pte(). Be prepared for the pte
  1379. * being NULL.
  1380. */
  1381. static __inline void
  1382. pmap_pte_release(pt_entry_t *pte)
  1383. {
  1384. if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2)
  1385. mtx_unlock(&PMAP2mutex);
  1386. }
  1387. /*
  1388. * NB: The sequence of updating a page table followed by accesses to the
  1389. * corresponding pages is subject to the situation described in the "AMD64
  1390. * Architecture Programmer's Manual Volume 2: System Programming" rev. 3.23,
  1391. * "7.3.1 Special Coherency Considerations". Therefore, issuing the INVLPG
  1392. * right after modifying the PTE bits is crucial.
  1393. */
  1394. static __inline void
  1395. invlcaddr(void *caddr)
  1396. {
  1397. invlpg((u_int)caddr);
  1398. }
  1399. /*
  1400. * Super fast pmap_pte routine best used when scanning
  1401. * the pv lists. This eliminates many coarse-grained
  1402. * invltlb calls. Note that many of the pv list
  1403. * scans are across different pmaps. It is very wasteful
  1404. * to do an entire invltlb for checking a single mapping.
  1405. *
  1406. * If the given pmap is not the current pmap, pvh_global_lock
  1407. * must be held and curthread pinned to a CPU.
  1408. */
  1409. static pt_entry_t *
  1410. pmap_pte_quick(pmap_t pmap, vm_offset_t va)
  1411. {
  1412. pd_entry_t newpf;
  1413. pd_entry_t *pde;
  1414. pde = pmap_pde(pmap, va);
  1415. if (*pde & PG_PS)
  1416. return (pde);
  1417. if (*pde != 0) {
  1418. /* are we current address space or kernel? */
  1419. if (pmap_is_current(pmap))
  1420. return (vtopte(va));
  1421. rw_assert(&pvh_global_lock, RA_WLOCKED);
  1422. KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
  1423. newpf = *pde & PG_FRAME;
  1424. if ((*PMAP1 & PG_FRAME) != newpf) {
  1425. *PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M;
  1426. #ifdef SMP
  1427. PMAP1cpu = PCPU_GET(cpuid);
  1428. #endif
  1429. invlcaddr(PADDR1);
  1430. PMAP1changed++;
  1431. } else
  1432. #ifdef SMP
  1433. if (PMAP1cpu != PCPU_GET(cpuid)) {
  1434. PMAP1cpu = PCPU_GET(cpuid);
  1435. invlcaddr(PADDR1);
  1436. PMAP1changedcpu++;
  1437. } else
  1438. #endif
  1439. PMAP1unchanged++;
  1440. return (PADDR1 + (i386_btop(va) & (NPTEPG - 1)));
  1441. }
  1442. return (0);
  1443. }
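/*
 * pmap_pte_quick3() works like pmap_pte_quick() but uses the
 * PMAP3/PADDR3 slot, so a second non-current page table page can be
 * mapped while PMAP1/PADDR1 is busy.  It shares the PMAP1changed*
 * statistics counters.
 */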
  1444. static pt_entry_t *
  1445. pmap_pte_quick3(pmap_t pmap, vm_offset_t va)
  1446. {
  1447. pd_entry_t newpf;
  1448. pd_entry_t *pde;
  1449. pde = pmap_pde(pmap, va);
  1450. if (*pde & PG_PS)
  1451. return (pde);
  1452. if (*pde != 0) {
  1453. rw_assert(&pvh_global_lock, RA_WLOCKED);
  1454. KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
  1455. newpf = *pde & PG_FRAME;
  1456. if ((*PMAP3 & PG_FRAME) != newpf) {
  1457. *PMAP3 = newpf | PG_RW | PG_V | PG_A | PG_M;
  1458. #ifdef SMP
  1459. PMAP3cpu = PCPU_GET(cpuid);
  1460. #endif
  1461. invlcaddr(PADDR3);
  1462. PMAP1changed++;
  1463. } else
  1464. #ifdef SMP
  1465. if (PMAP3cpu != PCPU_GET(cpuid)) {
  1466. PMAP3cpu = PCPU_GET(cpuid);
  1467. invlcaddr(PADDR3);
  1468. PMAP1changedcpu++;
  1469. } else
  1470. #endif
  1471. PMAP1unchanged++;
  1472. return (PADDR3 + (i386_btop(va) & (NPTEPG - 1)));
  1473. }
  1474. return (0);
  1475. }
  1476. static pt_entry_t
  1477. pmap_pte_ufast(pmap_t pmap, vm_offset_t va, pd_entry_t pde)
  1478. {
  1479. pt_entry_t *eh_ptep, pte, *ptep;
  1480. PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  1481. pde &= PG_FRAME;
  1482. critical_enter();
  1483. eh_ptep = (pt_entry_t *)PCPU_GET(pmap_eh_ptep);
  1484. if ((*eh_ptep & PG_FRAME) != pde) {
  1485. *eh_ptep = pde | PG_RW | PG_V | PG_A | PG_M;
  1486. invlcaddr((void *)PCPU_GET(pmap_eh_va));
  1487. }
  1488. ptep = (pt_entry_t *)PCPU_GET(pmap_eh_va) + (i386_btop(va) &
  1489. (NPTEPG - 1));
  1490. pte = *ptep;
  1491. critical_exit();
  1492. return (pte);
  1493. }
  1494. /*
  1495. * Routine: pmap_extract
  1496. * Function:
  1497. * Extract the physical page address associated
  1498. * with the given map/virtual_address pair.
  1499. */
  1500. vm_paddr_t
  1501. pmap_extract(pmap_t pmap, vm_offset_t va)
  1502. {
  1503. vm_paddr_t rtval;
  1504. pt_entry_t pte;
  1505. pd_entry_t pde;
  1506. rtval = 0;
  1507. PMAP_LOCK(pmap);
  1508. pde = pmap->pm_pdir[va >> PDRSHIFT];
  1509. if (pde != 0) {
  1510. if ((pde & PG_PS) != 0)
  1511. rtval = (pde & PG_PS_FRAME) | (va & PDRMASK);
  1512. else {
  1513. pte = pmap_pte_ufast(pmap, va, pde);
  1514. rtval = (pte & PG_FRAME) | (va & PAGE_MASK);
  1515. }
  1516. }
  1517. PMAP_UNLOCK(pmap);
  1518. return (rtval);
  1519. }
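/*
 * Worked example of the address composition above, assuming the non-PAE
 * layout (PDRSHIFT == 22, PDRMASK == 0x3fffff); the numbers are invented
 * for illustration.  For a 4MB mapping whose PG_PS_FRAME is 0x08000000,
 * va 0x00c01234 extracts to
 *
 *	rtval = 0x08000000 | (0x00c01234 & PDRMASK) = 0x08001234
 *
 * while a 4KB mapping whose PG_FRAME is 0x1c45f000 and whose va has page
 * offset 0x234 extracts to 0x1c45f000 | 0x234 = 0x1c45f234.
 */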
  1520. /*
  1521. * Routine: pmap_extract_and_hold
  1522. * Function:
  1523. * Atomically extract and hold the physical page
  1524. * with the given pmap and virtual address pair
  1525. * if that mapping permits the given protection.
  1526. */
  1527. vm_page_t
  1528. pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
  1529. {
  1530. pd_entry_t pde;
  1531. pt_entry_t pte;
  1532. vm_page_t m;
  1533. vm_paddr_t pa;
  1534. pa = 0;
  1535. m = NULL;
  1536. PMAP_LOCK(pmap);
  1537. retry:
  1538. pde = *pmap_pde(pmap, va);
  1539. if (pde != 0) {
  1540. if (pde & PG_PS) {
  1541. if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
  1542. if (vm_page_pa_tryrelock(pmap, (pde &
  1543. PG_PS_FRAME) | (va & PDRMASK), &pa))
  1544. goto retry;
  1545. m = PHYS_TO_VM_PAGE(pa);
  1546. }
  1547. } else {
  1548. pte = pmap_pte_ufast(pmap, va, pde);
  1549. if (pte != 0 &&
  1550. ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
  1551. if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME,
  1552. &pa))
  1553. goto retry;
  1554. m = PHYS_TO_VM_PAGE(pa);
  1555. }
  1556. }
  1557. if (m != NULL)
  1558. vm_page_hold(m);
  1559. }
  1560. PA_UNLOCK_COND(pa);
  1561. PMAP_UNLOCK(pmap);
  1562. return (m);
  1563. }
  1564. /***************************************************
  1565. * Low level mapping routines.....
  1566. ***************************************************/
  1567. /*
  1568. * Add a wired page to the kva.
  1569. * Note: not SMP coherent.
  1570. *
  1571. * This function may be used before pmap_bootstrap() is called.
  1572. */
  1573. PMAP_INLINE void
  1574. pmap_kenter(vm_offset_t va, vm_paddr_t pa)
  1575. {
  1576. pt_entry_t *pte;
  1577. pte = vtopte(va);
  1578. pte_store(pte, pa | PG_RW | PG_V);
  1579. }
  1580. static __inline void
  1581. pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
  1582. {
  1583. pt_entry_t *pte;
  1584. pte = vtopte(va);
  1585. pte_store(pte, pa | PG_RW | PG_V | pmap_cache_bits(kernel_pmap,
  1586. mode, 0));
  1587. }
  1588. /*
  1589. * Remove a page from the kernel pagetables.
  1590. * Note: not SMP coherent.
  1591. *
  1592. * This function may be used before pmap_bootstrap() is called.
  1593. */
  1594. PMAP_INLINE void
  1595. pmap_kremove(vm_offset_t va)
  1596. {
  1597. pt_entry_t *pte;
  1598. pte = vtopte(va);
  1599. pte_clear(pte);
  1600. }
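/*
 * Sketch, not from this file: because pmap_kenter() and pmap_kremove() are
 * not SMP coherent, a caller that changes a kernel mapping which may
 * already be cached in a TLB is expected to invalidate it explicitly, e.g.
 *
 *	pmap_kenter(va, pa);
 *	pmap_invalidate_page(kernel_pmap, va);
 *
 * where "va" and "pa" are placeholders.  Callers that establish many
 * mappings batch the flush with pmap_invalidate_range(), as pmap_map()
 * below does.
 */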
  1601. /*
  1602. * Used to map a range of physical addresses into kernel
  1603. * virtual address space.
  1604. *
  1605. * The value passed in '*virt' is a suggested virtual address for
  1606. * the mapping. Architectures which can support a direct-mapped
  1607. * physical to virtual region can return the appropriate address
  1608. * within that region, leaving '*virt' unchanged. Other
  1609. * architectures should map the pages starting at '*virt' and
  1610. * update '*virt' with the first usable address after the mapped
  1611. * region.
  1612. */
  1613. vm_offset_t
  1614. pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
  1615. {
  1616. vm_offset_t va, sva;
  1617. vm_paddr_t superpage_offset;
  1618. pd_entry_t newpde;
  1619. va = *virt;
  1620. /*
  1621. * Does the physical address range's size and alignment permit at
  1622. * least one superpage mapping to be created?
  1623. */
  1624. superpage_offset = start & PDRMASK;
  1625. if ((end - start) - ((NBPDR - superpage_offset) & PDRMASK) >= NBPDR) {
  1626. /*
  1627. * Increase the starting virtual address so that its alignment
  1628. * does not preclude the use of superpage mappings.
  1629. */
  1630. if ((va & PDRMASK) < superpage_offset)
  1631. va = (va & ~PDRMASK) + superpage_offset;
  1632. else if ((va & PDRMASK) > superpage_offset)
  1633. va = ((va + PDRMASK) & ~PDRMASK) + superpage_offset;
  1634. }
  1635. sva = va;
  1636. while (start < end) {
  1637. if ((start & PDRMASK) == 0 && end - start >= NBPDR &&
  1638. pseflag != 0) {
  1639. KASSERT((va & PDRMASK) == 0,
  1640. ("pmap_map: misaligned va %#x", va));
  1641. newpde = start | PG_PS | PG_RW | PG_V;
  1642. pmap_kenter_pde(va, newpde);
  1643. va += NBPDR;
  1644. start += NBPDR;
  1645. } else {
  1646. pmap_kenter(va, start);
  1647. va += PAGE_SIZE;
  1648. start += PAGE_SIZE;
  1649. }
  1650. }
  1651. pmap_invalidate_range(kernel_pmap, sva, va);
  1652. *virt = va;
  1653. return (sva);
  1654. }
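/*
 * Hypothetical caller sketch (all names are placeholders) showing the
 * '*virt' protocol described in the comment above pmap_map(): the caller
 * passes a suggested kernel virtual address in and reads the updated
 * value back out.
 *
 *	vm_offset_t va, sva;
 *
 *	va = next_free_kva;
 *	sva = pmap_map(&va, start_pa, end_pa, VM_PROT_READ | VM_PROT_WRITE);
 *	next_free_kva = va;
 *
 * The physical range [start_pa, end_pa) is then accessible at [sva, va).
 */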
  1655. /*
1656. * Add a list of wired pages to the kva.
1657. * This routine is only used for temporary
1658. * kernel mappings that do not need to have
1659. * page modification or references recorded.
1660. * Note that old mappings are simply written
1661. * over. The pages *must* be wired.
  1662. * Note: SMP coherent. Uses a ranged shootdown IPI.
  1663. */
  1664. void
  1665. pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
  1666. {
  1667. pt_entry_t *endpte, oldpte, pa, *pte;
  1668. vm_page_t m;
  1669. oldpte = 0;
  1670. pte = vtopte(sva);
  1671. endpte = pte + count;
  1672. while (pte < endpte) {
  1673. m = *ma++;
  1674. pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(kernel_pmap,
  1675. m->md.pat_mode, 0);
  1676. if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) {
  1677. oldpte |= *pte;
  1678. #if defined(PAE) || defined(PAE_TABLES)
  1679. pte_store(pte, pa | pg_nx | PG_RW | PG_V);
  1680. #else
  1681. pte_store(pte, pa | PG_RW | PG_V);
  1682. #endif
  1683. }
  1684. pte++;
  1685. }
  1686. if (__predict_false((oldpte & PG_V) != 0))
  1687. pmap_invalidate_range(kernel_pmap, sva, sva + count *
  1688. PAGE_SIZE);
  1689. }
  1690. /*
  1691. * This routine tears out page mappings from the
  1692. * kernel -- it is meant only for temporary mappings.
  1693. * Note: SMP coherent. Uses a ranged shootdown IPI.
  1694. */
  1695. void
  1696. pmap_qremove(vm_offset_t sva, int count)
  1697. {
  1698. vm_offset_t va;
  1699. va = sva;
  1700. while (count-- > 0) {
  1701. pmap_kremove(va);
  1702. va += PAGE_SIZE;
  1703. }
  1704. pmap_invalidate_range(kernel_pmap, sva, va);
  1705. }
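/*
 * Minimal usage sketch, assuming the caller already owns a KVA window of
 * "npages" pages at "sva" (for example from kva_alloc()) and an array "ma"
 * of wired pages: map, access, then tear the mappings down again.
 * pmap_qremove() issues the ranged TLB shootdown itself; pmap_qenter()
 * does so only when it overwrites mappings that were already valid.
 *
 *	pmap_qenter(sva, ma, npages);
 *	... access the pages through sva ...
 *	pmap_qremove(sva, npages);
 */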
  1706. /***************************************************
  1707. * Page table page management routines.....
  1708. ***************************************************/
  1709. /*
  1710. * Schedule the specified unused page table page to be freed. Specifically,
  1711. * add the page to the specified list of pages that will be released to the
  1712. * physical memory manager after the TLB has been updated.
  1713. */
  1714. static __inline void
  1715. pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
  1716. boolean_t set_PG_ZERO)
  1717. {
  1718. if (set_PG_ZERO)
  1719. m->flags |= PG_ZERO;
  1720. else
  1721. m->flags &= ~PG_ZERO;
  1722. SLIST_INSERT_HEAD(free, m, plinks.s.ss);
  1723. }
  1724. /*
  1725. * Inserts the specified page table page into the specified pmap's collection
  1726. * of idle page table pages. Each of a pmap's page table pages is responsible
  1727. * for mapping a distinct range of virtual addresses. The pmap's collection is
  1728. * ordered by this virtual address range.
  1729. *
  1730. * If "promoted" is false, then the page table page "mpte" must be zero filled.
  1731. */
  1732. static __inline int
  1733. pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted)
  1734. {
  1735. PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  1736. mpte->valid = promoted ? VM_PAGE_BITS_ALL : 0;
  1737. return (vm_radix_insert(&pmap->pm_root, mpte));
  1738. }
  1739. /*
1740. * Removes the page table page mapping the specified virtual address from the
1741. * specified pmap's collection of idle page table pages, and returns it.
1742. * Returns NULL if there is no page table page corresponding to the
1743. * specified virtual address.
  1744. */
  1745. static __inline vm_page_t
  1746. pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
  1747. {
  1748. PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  1749. return (vm_radix_remove(&pmap->pm_root, va >> PDRSHIFT));
  1750. }
  1751. /*
  1752. * Decrements a page table page's wire count, which is used to record the
  1753. * number of valid page table entries within the page. If the wire count
  1754. * drops to zero, then the page table page is unmapped. Returns TRUE if the
  1755. * page table page was unmapped and FALSE otherwise.
  1756. */
  1757. static inline boolean_t
  1758. pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free)
  1759. {
  1760. --m->wire_count;
  1761. if (m->wire_count == 0) {
  1762. _pmap_unwire_ptp(pmap, m, free);
  1763. return (TRUE);
  1764. } else
  1765. return (FALSE);
  1766. }
  1767. static void
  1768. _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free)
  1769. {
  1770. /*
  1771. * unmap the page table page
  1772. */
  1773. pmap->pm_pdir[m->pindex] = 0;
  1774. --pmap->pm_stats.resident_count;
  1775. /*
1776. * There is no need to invalidate the recursive mapping since
1777. * we never instantiate such a mapping for usermode pmaps,
1778. * and we never remove page table pages from the kernel pmap.
1779. * Put the page on a list so that it is released only after all
1780. * TLB shootdowns are done.
  1781. */
  1782. MPASS(pmap != kernel_pmap);
  1783. pmap_add_delayed_free_list(m, free, TRUE);
  1784. }
  1785. /*
  1786. * After removing a page table entry, this routine is used to
  1787. * conditionally free the page, and manage the hold/wire counts.
  1788. */
  1789. static int
  1790. pmap_unuse_pt(pmap_t pmap, vm_offset_t va, struct spglist *free)
  1791. {
  1792. pd_entry_t ptepde;
  1793. vm_page_t mpte;
  1794. if (pmap == kernel_pmap)
  1795. return (0);
  1796. ptepde = *pmap_pde(pmap, va);
  1797. mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
  1798. return (pmap_unwire_ptp(pmap, mpte, free));
  1799. }
  1800. /*
  1801. * Initialize the pmap for the swapper process.
  1802. */
  1803. void
  1804. pmap_pinit0(pmap_t pmap)
  1805. {
  1806. PMAP_LOCK_INIT(pmap);
  1807. pmap->pm_pdir = IdlePTD;
  1808. #if defined(PAE) || defined(PAE_TABLES)
  1809. pmap->pm_pdpt = IdlePDPT;
  1810. #endif
  1811. pmap->pm_root.rt_root = 0;
  1812. CPU_ZERO(&pmap->pm_active);
  1813. TAILQ_INIT(&pmap->pm_pvchunk);
  1814. bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
  1815. pmap_activate_boot(pmap);
  1816. }
  1817. /*
  1818. * Initialize a preallocated and zeroed pmap structure,
  1819. * such as one in a vmspace structure.
  1820. */
  1821. int
  1822. pmap_pinit(pmap_t pmap)
  1823. {
  1824. vm_page_t m;
  1825. int i;
  1826. /*
  1827. * No need to allocate page table space yet but we do need a valid
  1828. * page directory table.
  1829. */
  1830. if (pmap->pm_pdir == NULL) {
  1831. pmap->pm_pdir = (pd_entry_t *)kva_alloc(NBPTD);
  1832. if (pmap->pm_pdir == NULL)
  1833. return (0);
  1834. #if defined(PAE) || defined(PAE_TABLES)
  1835. pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO);
  1836. KASSERT(((vm_offset_t)pmap->pm_pdpt &
  1837. ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0,
  1838. ("pmap_pinit: pdpt misaligned"));
  1839. KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30),
  1840. ("pmap_pinit: pdpt above 4g"));
  1841. #endif
  1842. pmap->pm_root.rt_root = 0;
  1843. }
  1844. KASSERT(vm_radix_is_empty(&pmap->pm_root),
  1845. ("pmap_pinit: pmap has reserved page table page(s)"));
  1846. /*
  1847. * allocate the page directory page(s)
  1848. */
  1849. for (i = 0; i < NPGPTD;) {
  1850. m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
  1851. VM_ALLOC_WIRED | VM_ALLOC_ZERO);
  1852. if (m == NULL) {
  1853. vm_wait(NULL);
  1854. } else {
  1855. pmap->pm_ptdpg[i] = m;
  1856. #if defined(PAE) || defined(PAE_TABLES)
  1857. pmap->pm_pdpt[i] = VM_PAGE_TO_PHYS(m) | PG_V;
  1858. #endif
  1859. i++;
  1860. }
  1861. }
  1862. pmap_qenter((vm_offset_t)pmap->pm_pdir, pmap->pm_ptdpg, NPGPTD);
  1863. for (i = 0; i < NPGPTD; i++)
  1864. if ((pmap->pm_ptdpg[i]->flags & PG_ZERO) == 0)
  1865. pagezero(pmap->pm_pdir + (i * NPDEPG));
  1866. /* Install the trampoline mapping. */
  1867. pmap->pm_pdir[TRPTDI] = PTD[TRPTDI];
  1868. CPU_ZERO(&pmap->pm_active);
  1869. TAILQ_INIT(&pmap->pm_pvchunk);
  1870. bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
  1871. return (1);
  1872. }
  1873. /*
1874. * This routine is called if the page table page is not currently
1875. * mapped: it allocates a new page table page and installs it.
  1876. */
  1877. static vm_page_t
  1878. _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags)
  1879. {
  1880. vm_paddr_t ptepa;
  1881. vm_page_t m;
  1882. /*
  1883. * Allocate a page table page.
  1884. */
  1885. if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
  1886. VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
  1887. if ((flags & PMAP_ENTER_NOSLEEP) == 0) {
  1888. PMAP_UNLOCK(pmap);
  1889. rw_wunlock(&pvh_global_lock);
  1890. vm_wait(NULL);
  1891. rw_wlock(&pvh_global_lock);
  1892. PMAP_LOCK(pmap);
  1893. }
  1894. /*
  1895. * Indicate the need to retry. While waiting, the page table
  1896. * page may have been allocated.
  1897. */
  1898. return (NULL);
  1899. }
  1900. if ((m->flags & PG_ZERO) == 0)
  1901. pmap_zero_page(m);
  1902. /*
  1903. * Map the pagetable page into the process address space, if
  1904. * it isn't already there.
  1905. */
  1906. pmap->pm_stats.resident_count++;
  1907. ptepa = VM_PAGE_TO_PHYS(m);
  1908. pmap->pm_pdir[ptepindex] =
  1909. (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M);
  1910. return (m);
  1911. }
  1912. static vm_page_t
  1913. pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags)
  1914. {
  1915. u_int ptepindex;
  1916. pd_entry_t ptepa;
  1917. vm_page_t m;
  1918. /*
  1919. * Calculate pagetable page index
  1920. */
  1921. ptepindex = va >> PDRSHIFT;
  1922. retry:
  1923. /*
  1924. * Get the page directory entry
  1925. */
  1926. ptepa = pmap->pm_pdir[ptepindex];
  1927. /*
  1928. * This supports switching from a 4MB page to a
  1929. * normal 4K page.
  1930. */
  1931. if (ptepa & PG_PS) {
  1932. (void)pmap_demote_pde(pmap, &pmap->pm_pdir[ptepindex], va);
  1933. ptepa = pmap->pm_pdir[ptepindex];
  1934. }
  1935. /*
1936. * If the page table page is mapped, we just increment its
1937. * wire count and return it.
  1938. */
  1939. if (ptepa) {
  1940. m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
  1941. m->wire_count++;
  1942. } else {
  1943. /*
1944. * The page table page is not mapped, or it has been
1945. * deallocated; allocate a new one.
  1946. */
  1947. m = _pmap_allocpte(pmap, ptepindex, flags);
  1948. if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0)
  1949. goto retry;
  1950. }
  1951. return (m);
  1952. }
  1953. /***************************************************
  1954. * Pmap allocation/deallocation routines.
  1955. ***************************************************/
  1956. /*
  1957. * Release any resources held by the given physical map.
  1958. * Called when a pmap initialized by pmap_pinit is being released.
  1959. * Should only be called if the map contains no valid mappings.
  1960. */
  1961. void
  1962. pmap_release(pmap_t pmap)
  1963. {
  1964. vm_page_t m;
  1965. int i;
  1966. KASSERT(pmap->pm_stats.resident_count == 0,
  1967. ("pmap_release: pmap resident count %ld != 0",
  1968. pmap->pm_stats.resident_count));
  1969. KASSERT(vm_radix_is_empty(&pmap->pm_root),
  1970. ("pmap_release: pmap has reserved page table page(s)"));
  1971. KASSERT(CPU_EMPTY(&pmap->pm_active),
  1972. ("releasing active pmap %p", pmap));
  1973. pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD);
  1974. for (i = 0; i < NPGPTD; i++) {
  1975. m = pmap->pm_ptdpg[i];
  1976. #if defined(PAE) || defined(PAE_TABLES)
  1977. KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME),
  1978. ("pmap_release: got wrong ptd page"));
  1979. #endif
  1980. vm_page_unwire_noq(m);
  1981. vm_page_free(m);
  1982. }
  1983. }
  1984. static int
  1985. kvm_size(SYSCTL_HANDLER_ARGS)
  1986. {
  1987. unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;
  1988. return (sysctl_handle_long(oidp, &ksize, 0, req));
  1989. }
  1990. SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
  1991. 0, 0, kvm_size, "IU", "Size of KVM");
  1992. static int
  1993. kvm_free(SYSCTL_HANDLER_ARGS)
  1994. {
  1995. unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
  1996. return (sysctl_handle_long(oidp, &kfree, 0, req));
  1997. }
  1998. SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
  1999. 0, 0, kvm_free, "IU", "Amount of KVM free");
  2000. /*
2001. * Grow the number of kernel page table entries, if needed.
  2002. */
  2003. void
  2004. pmap_growkernel(vm_offset_t addr)
  2005. {
  2006. vm_paddr_t ptppaddr;
  2007. vm_page_t nkpg;
  2008. pd_entry_t newpdir;
  2009. mtx_assert(&kernel_map->system_mtx, MA_OWNED);
  2010. addr = roundup2(addr, NBPDR);
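/*
 * Worked example of this rounding (non-PAE, NBPDR == 4MB == 0x400000,
 * values invented): addr == 0xc1234567 rounds up to 0xc1400000, so
 * kernel_vm_end below advances in whole 4MB steps.
 */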
  2011. if (addr - 1 >= vm_map_max(kernel_map))
  2012. addr = vm_map_max(kernel_map);
  2013. while (kernel_vm_end < addr) {
  2014. if (pdir_pde(PTD, kernel_vm_end)) {
  2015. kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
  2016. if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
  2017. kernel_vm_end = vm_map_max(kernel_map);
  2018. break;
  2019. }
  2020. continue;
  2021. }
  2022. nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT,
  2023. VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
  2024. VM_ALLOC_ZERO);
  2025. if (nkpg == NULL)
  2026. panic("pmap_growkernel: no memory to grow kernel");
  2027. nkpt++;
  2028. if ((nkpg->flags & PG_ZERO) == 0)
  2029. pmap_zero_page(nkpg);
  2030. ptppaddr = VM_PAGE_TO_PHYS(nkpg);
  2031. newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
  2032. pdir_pde(KPTD, kernel_vm_end) = newpdir;
  2033. pmap_kenter_pde(kernel_vm_end, newpdir);
  2034. kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
  2035. if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
  2036. kernel_vm_end = vm_map_max(kernel_map);
  2037. break;
  2038. }
  2039. }
  2040. }
  2041. /***************************************************
  2042. * page management routines.
  2043. ***************************************************/
  2044. CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
  2045. CTASSERT(_NPCM == 11);
  2046. CTASSERT(_NPCPV == 336);
  2047. static __inline struct pv_chunk *
  2048. pv_to_chunk(pv_entry_t pv)
  2049. {
  2050. return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
  2051. }
  2052. #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
  2053. #define PC_FREE0_9 0xfffffffful /* Free values for index 0 through 9 */
  2054. #define PC_FREE10 0x0000fffful /* Free values for index 10 */
  2055. static const uint32_t pc_freemask[_NPCM] = {
  2056. PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
  2057. PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
  2058. PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
  2059. PC_FREE0_9, PC_FREE10
  2060. };
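/*
 * Worked example of the chunk bitmap layout implied by the constants
 * above (_NPCM == 11 32-bit fields covering _NPCPV == 336 entries): a pv
 * entry at index "idx" within a chunk lives at field idx / 32, bit
 * idx % 32, so idx == 100 maps to pc_map[3], bit 4.  Only the low 16 bits
 * of the final field are usable, which is why PC_FREE10 is 0x0000ffff
 * (10 * 32 + 16 == 336).
 */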
  2061. SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
  2062. "Current number of pv entries");
  2063. #ifdef PV_STATS
  2064. static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
  2065. SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
  2066. "Current number of pv entry chunks");
  2067. SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
  2068. "Current number of pv entry chunks allocated");
  2069. SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
  2070. "Current number of pv entry chunks frees");
  2071. SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
  2072. "Number of times tried to get a chunk page but failed.");
  2073. static long pv_entry_frees, pv_entry_allocs;
  2074. static int pv_entry_spare;
  2075. SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
  2076. "Current number of pv entry frees");
  2077. SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
  2078. "Current number of pv entry allocs");
  2079. SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
  2080. "Current number of spare pv entries");
  2081. #endif
  2082. /*
  2083. * We are in a serious low memory condition. Resort to
  2084. * drastic measures to free some pages so we can allocate
  2085. * another pv entry chunk.
  2086. */
  2087. static vm_page_t
  2088. pmap_pv_reclaim(pmap_t locked_pmap)
  2089. {
  2090. struct pch newtail;
  2091. struct pv_chunk *pc;
  2092. struct md_page *pvh;
  2093. pd_entry_t *pde;
  2094. pmap_t pmap;
  2095. pt_entry_t *pte, tpte;
  2096. pv_entry_t pv;
  2097. vm_offset_t va;
  2098. vm_page_t m, m_pc;
  2099. struct spglist free;
  2100. uint32_t inuse;
  2101. int bit, field, freed;
  2102. PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
  2103. pmap = NULL;
  2104. m_pc = NULL;
  2105. SLIST_INIT(&free);
  2106. TAILQ_INIT(&newtail);
  2107. while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 ||
  2108. SLIST_EMPTY(&free))) {
  2109. TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
  2110. if (pmap != pc->pc_pmap) {
  2111. if (pmap != NULL) {
  2112. pmap_invalidate_all(pmap);
  2113. if (pmap != locked_pmap)
  2114. PMAP_UNLOCK(pmap);
  2115. }
  2116. pmap = pc->pc_pmap;
  2117. /* Avoid deadlock and lock recursion. */
  2118. if (pmap > locked_pmap)
  2119. PMAP_LOCK(pmap);
  2120. else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) {
  2121. pmap = NULL;
  2122. TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
  2123. continue;
  2124. }
  2125. }
  2126. /*
  2127. * Destroy every non-wired, 4 KB page mapping in the chunk.
  2128. */
  2129. freed = 0;
  2130. for (field = 0; field < _NPCM; field++) {
  2131. for (inuse = ~pc->pc_map[field] & pc_freemask[field];
  2132. inuse != 0; inuse &= ~(1UL << bit)) {
  2133. bit = bsfl(inuse);
  2134. pv = &pc->pc_pventry[field * 32 + bit];
  2135. va = pv->pv_va;
  2136. pde = pmap_pde(pmap, va);
  2137. if ((*pde & PG_PS) != 0)
  2138. continue;
  2139. pte = pmap_pte(pmap, va);
  2140. tpte = *pte;
  2141. if ((tpte & PG_W) == 0)
  2142. tpte = pte_load_clear(pte);
  2143. pmap_pte_release(pte);
  2144. if ((tpte & PG_W) != 0)
  2145. continue;
  2146. KASSERT(tpte != 0,
  2147. ("pmap_pv_reclaim: pmap %p va %x zero pte",
  2148. pmap, va));
  2149. if ((tpte & PG_G) != 0)
  2150. pmap_invalidate_page(pmap, va);
  2151. m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
  2152. if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
  2153. vm_page_dirty(m);
  2154. if ((tpte & PG_A) != 0)
  2155. vm_page_aflag_set(m, PGA_REFERENCED);
  2156. TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
  2157. if (TAILQ_EMPTY(&m->md.pv_list) &&
  2158. (m->flags & PG_FICTITIOUS) == 0) {
  2159. pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
  2160. if (TAILQ_EMPTY(&pvh->pv_list)) {
  2161. vm_page_aflag_clear(m,
  2162. PGA_WRITEABLE);
  2163. }
  2164. }
  2165. pc->pc_map[field] |= 1UL << bit;
  2166. pmap_unuse_pt(pmap, va, &free);
  2167. freed++;
  2168. }
  2169. }
  2170. if (freed == 0) {
  2171. TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
  2172. continue;
  2173. }
  2174. /* Every freed mapping is for a 4 KB page. */
  2175. pmap->pm_stats.resident_count -= freed;
  2176. PV_STAT(pv_entry_frees += freed);
  2177. PV_STAT(pv_entry_spare += freed);
  2178. pv_entry_count -= freed;
  2179. TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
  2180. for (field = 0; field < _NPCM; field++)
  2181. if (pc->pc_map[field] != pc_freemask[field]) {
  2182. TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
  2183. pc_list);
  2184. TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
  2185. /*
  2186. * One freed pv entry in locked_pmap is
  2187. * sufficient.
  2188. */
  2189. if (pmap == locked_pmap)
  2190. goto out;
  2191. break;
  2192. }
  2193. if (field == _NPCM) {
  2194. PV_STAT(pv_entry_spare -= _NPCPV);
  2195. PV_STAT(pc_chunk_count--);
  2196. PV_STAT(pc_chunk_frees++);
  2197. /* Entire chunk is free; return it. */
  2198. m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
  2199. pmap_qremove((vm_offset_t)pc, 1);
  2200. pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
  2201. break;
  2202. }
  2203. }
  2204. out:
  2205. TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru);
  2206. if (pmap != NULL) {
  2207. pmap_invalidate_all(pmap);
  2208. if (pmap != locked_pmap)
  2209. PMAP_UNLOCK(pmap);
  2210. }
  2211. if (m_pc == NULL && pv_vafree != 0 && SLIST_EMPTY(&free)) {
  2212. m_pc = SLIST_FIRST(&free);
  2213. SLIST_REMOVE_HEAD(&free, plinks.s.ss);
  2214. /* Recycle a freed page table page. */
  2215. m_pc->wire_count = 1;
  2216. }
  2217. vm_page_free_pages_toq(&free, true);
  2218. return (m_pc);
  2219. }
  2220. /*
  2221. * free the pv_entry back to the free list
  2222. */
  2223. static void
  2224. free_pv_entry(pmap_t pmap, pv_entry_t pv)
  2225. {
  2226. struct pv_chunk *pc;
  2227. int idx, field, bit;
  2228. rw_assert(&pvh_global_lock, RA_WLOCKED);
  2229. PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  2230. PV_STAT(pv_entry_frees++);
  2231. PV_STAT(pv_entry_spare++);
  2232. pv_entry_count--;
  2233. pc = pv_to_chunk(pv);
  2234. idx = pv - &pc->pc_pventry[0];
  2235. field = idx / 32;
  2236. bit = idx % 32;
  2237. pc->pc_map[field] |= 1ul << bit;
  2238. for (idx = 0; idx < _NPCM; idx++)
  2239. if (pc->pc_map[idx] != pc_freemask[idx]) {
  2240. /*
  2241. * 98% of the time, pc is already at the head of the
  2242. * list. If it isn't already, move it to the head.
  2243. */
  2244. if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) !=
  2245. pc)) {
  2246. TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
  2247. TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
  2248. pc_list);
  2249. }
  2250. return;
  2251. }
  2252. TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
  2253. free_pv_chunk(pc);
  2254. }
  2255. static void
  2256. free_pv_chunk(struct pv_chunk *pc)
  2257. {
  2258. vm_page_t m;
  2259. TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
  2260. PV_STAT(pv_entry_spare -= _NPCPV);
  2261. PV_STAT(pc_chunk_count--);
  2262. PV_STAT(pc_chunk_frees++);
  2263. /* entire chunk is free, return it */
  2264. m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
  2265. pmap_qremove((vm_offset_t)pc, 1);
  2266. vm_page_unwire_noq(m);
  2267. vm_page_free(m);
  2268. pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
  2269. }
  2270. /*
  2271. * get a new pv_entry, allocating a block from the system
  2272. * when needed.
  2273. */
  2274. static pv_entry_t
  2275. get_pv_entry(pmap_t pmap, boolean_t try)
  2276. {
  2277. static const struct timeval printinterval = { 60, 0 };
  2278. static struct timeval lastprint;
  2279. int bit, field;
  2280. pv_entry_t pv;
  2281. struct pv_chunk *pc;
  2282. vm_page_t m;
  2283. rw_assert(&pvh_global_lock, RA_WLOCKED);
  2284. PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  2285. PV_STAT(pv_entry_allocs++);
  2286. pv_entry_count++;
  2287. if (pv_entry_count > pv_entry_high_water)
  2288. if (ratecheck(&lastprint, &printinterval))
  2289. printf("Approaching the limit on PV entries, consider "
  2290. "increasing either the vm.pmap.shpgperproc or the "
  2291. "vm.pmap.pv_entries tunable.\n");
  2292. retry:
  2293. pc = TAILQ_FIRST(&pmap->pm_pvchunk);
  2294. if (pc != NULL) {
  2295. for (field = 0; field < _NPCM; field++) {
  2296. if (pc->pc_map[field]) {
  2297. bit = bsfl(pc->pc_map[field]);
  2298. break;
  2299. }
  2300. }
  2301. if (field < _NPCM) {
  2302. pv = &pc->pc_pventry[field * 32 + bit];
  2303. pc->pc_map[field] &= ~(1ul << bit);
  2304. /* If this was the last item, move it to tail */
  2305. for (field = 0; field < _NPCM; field++)
  2306. if (pc->pc_map[field] != 0) {
  2307. PV_STAT(pv_entry_spare--);
  2308. return (pv); /* not full, return */
  2309. }
  2310. TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
  2311. TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
  2312. PV_STAT(pv_entry_spare--);
  2313. return (pv);
  2314. }
  2315. }
  2316. /*
  2317. * Access to the ptelist "pv_vafree" is synchronized by the pvh
  2318. * global lock. If "pv_vafree" is currently non-empty, it will
  2319. * remain non-empty until pmap_ptelist_alloc() completes.
  2320. */
  2321. if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
  2322. VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
  2323. if (try) {
  2324. pv_entry_count--;
  2325. PV_STAT(pc_chunk_tryfail++);
  2326. return (NULL);
  2327. }
  2328. m = pmap_pv_reclaim(pmap);
  2329. if (m == NULL)
  2330. goto retry;
  2331. }
  2332. PV_STAT(pc_chunk_count++);
  2333. PV_STAT(pc_chunk_allocs++);
  2334. pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree);
  2335. pmap_qenter((vm_offset_t)pc, &m, 1);
  2336. pc->pc_pmap = pmap;
  2337. pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */
  2338. for (field = 1; field < _NPCM; field++)
  2339. pc->pc_map[field] = pc_freemask[field];
  2340. TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
  2341. pv = &pc->pc_pventry[0];
  2342. TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
  2343. PV_STAT(pv_entry_spare += _NPCPV - 1);
  2344. return (pv);
  2345. }
  2346. static __inline pv_entry_t
  2347. pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
  2348. {
  2349. pv_entry_t pv;
  2350. rw_assert(&pvh_global_lock, RA_WLOCKED);
  2351. TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
  2352. if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
  2353. TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
  2354. break;
  2355. }
  2356. }
  2357. return (pv);
  2358. }
  2359. static void
  2360. pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
  2361. {
  2362. struct md_page *pvh;
  2363. pv_entry_t pv;
  2364. vm_offset_t va_last;
  2365. vm_page_t m;
  2366. rw_assert(&pvh_global_lock, RA_WLOCKED);
  2367. KASSERT((pa & PDRMASK) == 0,
  2368. ("pmap_pv_demote_pde: pa is not 4mpage aligned"));
  2369. /*
  2370. * Transfer the 4mpage's pv entry for this mapping to the first
  2371. * page's pv list.
  2372. */
  2373. pvh = pa_to_pvh(pa);
  2374. va = trunc_4mpage(va);
  2375. pv = pmap_pvh_remove(pvh, pmap, va);
  2376. KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
  2377. m = PHYS_TO_VM_PAGE(pa);
  2378. TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
  2379. /* Instantiate the remaining NPTEPG - 1 pv entries. */
  2380. va_last = va + NBPDR - PAGE_SIZE;
  2381. do {
  2382. m++;
  2383. KASSERT((m->oflags & VPO_UNMANAGED) == 0,
  2384. ("pmap_pv_demote_pde: page %p is not managed", m));
  2385. va += PAGE_SIZE;
  2386. pmap_insert_entry(pmap, va, m);
  2387. } while (va < va_last);
  2388. }
  2389. #if VM_NRESERVLEVEL > 0
  2390. static void
  2391. pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
  2392. {
  2393. struct md_page *pvh;
  2394. pv_entry_t pv;
  2395. vm_offset_t va_last;
  2396. vm_page_t m;
  2397. rw_assert(&pvh_global_lock, RA_WLOCKED);
  2398. KASSERT((pa & PDRMASK) == 0,
  2399. ("pmap_pv_promote_pde: pa is not 4mpage aligned"));
  2400. /*
  2401. * Transfer the first page's pv entry for this mapping to the
  2402. * 4mpage's pv list. Aside from avoiding the cost of a call
  2403. * to get_pv_entry(), a transfer avoids the possibility that
2404. * get_pv_entry() calls pmap_pv_reclaim() and that pmap_pv_reclaim()
  2405. * removes one of the mappings that is being promoted.
  2406. */
  2407. m = PHYS_TO_VM_PAGE(pa);
  2408. va = trunc_4mpage(va);
  2409. pv = pmap_pvh_remove(&m->md, pmap, va);
  2410. KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
  2411. pvh = pa_to_pvh(pa);
  2412. TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
  2413. /* Free the remaining NPTEPG - 1 pv entries. */
  2414. va_last = va + NBPDR - PAGE_SIZE;
  2415. do {
  2416. m++;
  2417. va += PAGE_SIZE;
  2418. pmap_pvh_free(&m->md, pmap, va);
  2419. } while (va < va_last);
  2420. }
  2421. #endif /* VM_NRESERVLEVEL > 0 */
  2422. static void
  2423. pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
  2424. {
  2425. pv_entry_t pv;
  2426. pv = pmap_pvh_remove(pvh, pmap, va);
  2427. KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
  2428. free_pv_entry(pmap, pv);
  2429. }
  2430. static void
  2431. pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
  2432. {
  2433. struct md_page *pvh;
  2434. rw_assert(&pvh_global_lock, RA_WLOCKED);
  2435. pmap_pvh_free(&m->md, pmap, va);
  2436. if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) {
  2437. pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
  2438. if (TAILQ_EMPTY(&pvh->pv_list))
  2439. vm_page_aflag_clear(m, PGA_WRITEABLE);
  2440. }
  2441. }
  2442. /*
  2443. * Create a pv entry for page at pa for
  2444. * (pmap, va).
  2445. */
  2446. static void
  2447. pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
  2448. {
  2449. pv_entry_t pv;
  2450. rw_assert(&pvh_global_lock, RA_WLOCKED);
  2451. PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  2452. pv = get_pv_entry(pmap, FALSE);
  2453. pv->pv_va = va;
  2454. TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
  2455. }
  2456. /*
  2457. * Conditionally create a pv entry.
  2458. */
  2459. static boolean_t
  2460. pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
  2461. {
  2462. pv_entry_t pv;
  2463. rw_assert(&pvh_global_lock, RA_WLOCKED);
  2464. PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  2465. if (pv_entry_count < pv_entry_high_water &&
  2466. (pv = get_pv_entry(pmap, TRUE)) != NULL) {
  2467. pv->pv_va = va;
  2468. TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
  2469. return (TRUE);
  2470. } else
  2471. return (FALSE);
  2472. }
  2473. /*
  2474. * Create the pv entries for each of the pages within a superpage.
  2475. */
  2476. static bool
  2477. pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags)
  2478. {
  2479. struct md_page *pvh;
  2480. pv_entry_t pv;
  2481. bool noreclaim;
  2482. rw_assert(&pvh_global_lock, RA_WLOCKED);
  2483. noreclaim = (flags & PMAP_ENTER_NORECLAIM) != 0;
  2484. if ((noreclaim && pv_entry_count >= pv_entry_high_water) ||
  2485. (pv = get_pv_entry(pmap, noreclaim)) == NULL)
  2486. return (false);
  2487. pv->pv_va = va;
  2488. pvh = pa_to_pvh(pde & PG_PS_FRAME);
  2489. TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
  2490. return (true);
  2491. }
  2492. /*
  2493. * Fills a page table page with mappings to consecutive physical pages.
  2494. */
  2495. static void
  2496. pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
  2497. {
  2498. pt_entry_t *pte;
  2499. for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
  2500. *pte = newpte;
  2501. newpte += PAGE_SIZE;
  2502. }
  2503. }
  2504. /*
  2505. * Tries to demote a 2- or 4MB page mapping. If demotion fails, the
  2506. * 2- or 4MB page mapping is invalidated.
  2507. */
  2508. static boolean_t
  2509. pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
  2510. {
  2511. pd_entry_t newpde, oldpde;
  2512. pt_entry_t *firstpte, newpte;
  2513. vm_paddr_t mptepa;
  2514. vm_page_t mpte;
  2515. struct spglist free;
  2516. vm_offset_t sva;
  2517. PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  2518. oldpde = *pde;
  2519. KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
  2520. ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
  2521. if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) ==
  2522. NULL) {
  2523. KASSERT((oldpde & PG_W) == 0,
  2524. ("pmap_demote_pde: page table page for a wired mapping"
  2525. " is missing"));
  2526. /*
  2527. * Invalidate the 2- or 4MB page mapping and return
  2528. * "failure" if the mapping was never accessed or the
  2529. * allocation of the new page table page fails.
  2530. */
  2531. if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL,
  2532. va >> PDRSHIFT, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL |
  2533. VM_ALLOC_WIRED)) == NULL) {
  2534. SLIST_INIT(&free);
  2535. sva = trunc_4mpage(va);
  2536. pmap_remove_pde(pmap, pde, sva, &free);
  2537. if ((oldpde & PG_G) == 0)
  2538. pmap_invalidate_pde_page(pmap, sva, oldpde);
  2539. vm_page_free_pages_toq(&free, true);
  2540. CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#x"
  2541. " in pmap %p", va, pmap);
  2542. return (FALSE);
  2543. }
  2544. if (pmap != kernel_pmap) {
  2545. mpte->wire_count = NPTEPG;
  2546. pmap->pm_stats.resident_count++;
  2547. }
  2548. }
  2549. mptepa = VM_PAGE_TO_PHYS(mpte);
  2550. /*
  2551. * If the page mapping is in the kernel's address space, then the
  2552. * KPTmap can provide access to the page table page. Otherwise,
  2553. * temporarily map the page table page (mpte) into the kernel's
  2554. * address space at either PADDR1 or PADDR2.
  2555. */
  2556. if (pmap == kernel_pmap)
  2557. firstpte = &KPTmap[i386_btop(trunc_4mpage(va))];
  2558. else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) {
  2559. if ((*PMAP1 & PG_FRAME) != mptepa) {
  2560. *PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M;
  2561. #ifdef SMP
  2562. PMAP1cpu = PCPU_GET(cpuid);
  2563. #endif
  2564. invlcaddr(PADDR1);
  2565. PMAP1changed++;
  2566. } else
  2567. #ifdef SMP
  2568. if (PMAP1cpu != PCPU_GET(cpuid)) {
  2569. PMAP1cpu = PCPU_GET(cpuid);
  2570. invlcaddr(PADDR1);
  2571. PMAP1changedcpu++;
  2572. } else
  2573. #endif
  2574. PMAP1unchanged++;
  2575. firstpte = PADDR1;
  2576. } else {
  2577. mtx_lock(&PMAP2mutex);
  2578. if ((*PMAP2 & PG_FRAME) != mptepa) {
  2579. *PMAP2 = mptepa | PG_RW | PG_V | PG_A | PG_M;
  2580. pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
  2581. }
  2582. firstpte = PADDR2;
  2583. }
  2584. newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
  2585. KASSERT((oldpde & PG_A) != 0,
  2586. ("pmap_demote_pde: oldpde is missing PG_A"));
  2587. KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
  2588. ("pmap_demote_pde: oldpde is missing PG_M"));
  2589. newpte = oldpde & ~PG_PS;
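/*
 * The PAT index bit occupies a different position in a PDE (PG_PDE_PAT,
 * bit 12) than in a PTE (PG_PTE_PAT, bit 7); if the superpage mapping had
 * it set, move it to the PTE position.
 */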
  2590. if ((newpte & PG_PDE_PAT) != 0)
  2591. newpte ^= PG_PDE_PAT | PG_PTE_PAT;
  2592. /*
  2593. * If the page table page is not leftover from an earlier promotion,
  2594. * initialize it.
  2595. */
  2596. if (mpte->valid == 0)
  2597. pmap_fill_ptp(firstpte, newpte);
  2598. KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
  2599. ("pmap_demote_pde: firstpte and newpte map different physical"
  2600. " addresses"));
  2601. /*
  2602. * If the mapping has changed attributes, update the page table
  2603. * entries.
  2604. */
  2605. if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
  2606. pmap_fill_ptp(firstpte, newpte);
  2607. /*
  2608. * Demote the mapping. This pmap is locked. The old PDE has
  2609. * PG_A set. If the old PDE has PG_RW set, it also has PG_M
  2610. * set. Thus, there is no danger of a race with another
  2611. * processor changing the setting of PG_A and/or PG_M between
  2612. * the read above and the store below.
  2613. */
  2614. if (workaround_erratum383)
  2615. pmap_update_pde(pmap, va, pde, newpde);
  2616. else if (pmap == kernel_pmap)
  2617. pmap_kenter_pde(va, newpde);
  2618. else
  2619. pde_store(pde, newpde);
  2620. if (firstpte == PADDR2)
  2621. mtx_unlock(&PMAP2mutex);
  2622. /*
  2623. * Invalidate the recursive mapping of the page table page.
  2624. */
  2625. pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
  2626. /*
  2627. * Demote the pv entry. This depends on the earlier demotion
  2628. * of the mapping. Specifically, the (re)creation of a per-
2629. * page pv entry might trigger the execution of pmap_pv_reclaim(),
  2630. * which might reclaim a newly (re)created per-page pv entry
  2631. * and destroy the associated mapping. In order to destroy
  2632. * the mapping, the PDE must have already changed from mapping
  2633. * the 2mpage to referencing the page table page.
  2634. */
  2635. if ((oldpde & PG_MANAGED) != 0)
  2636. pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME);
  2637. pmap_pde_demotions++;
  2638. CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#x"
  2639. " in pmap %p", va, pmap);
  2640. return (TRUE);
  2641. }
  2642. /*
  2643. * Removes a 2- or 4MB page mapping from the kernel pmap.
  2644. */
  2645. static void
  2646. pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
  2647. {
  2648. pd_entry_t newpde;
  2649. vm_paddr_t mptepa;
  2650. vm_page_t mpte;
  2651. PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  2652. mpte = pmap_remove_pt_page(pmap, va);
  2653. if (mpte == NULL)
  2654. panic("pmap_remove_kernel_pde: Missing pt page.");
  2655. mptepa = VM_PAGE_TO_PHYS(mpte);
  2656. newpde = mptepa | PG_M | PG_A | PG_RW | PG_V;
  2657. /*
  2658. * If this page table page was unmapped by a promotion, then it
  2659. * contains valid mappings. Zero it to invalidate those mappings.
  2660. */
  2661. if (mpte->valid != 0)
  2662. pagezero((void *)&KPTmap[i386_btop(trunc_4mpage(va))]);
  2663. /*
  2664. * Remove the mapping.
  2665. */
  2666. if (workaround_erratum383)
  2667. pmap_update_pde(pmap, va, pde, newpde);
  2668. else
  2669. pmap_kenter_pde(va, newpde);
  2670. /*
  2671. * Invalidate the recursive mapping of the page table page.
  2672. */
  2673. pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
  2674. }
  2675. /*
  2676. * pmap_remove_pde: do the things to unmap a superpage in a process
  2677. */
  2678. static void
  2679. pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
  2680. struct spglist *free)
  2681. {
  2682. struct md_page *pvh;
  2683. pd_entry_t oldpde;
  2684. vm_offset_t eva, va;
  2685. vm_page_t m, mpte;
  2686. PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  2687. KASSERT((sva & PDRMASK) == 0,
  2688. ("pmap_remove_pde: sva is not 4mpage aligned"));
  2689. oldpde = pte_load_clear(pdq);
  2690. if (oldpde & PG_W)
  2691. pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
  2692. /*
2693. * Machines that don't support invlpg also don't support
  2694. * PG_G.
  2695. */
  2696. if ((oldpde & PG_G) != 0)
  2697. pmap_invalidate_pde_page(kernel_pmap, sva, oldpde);
  2698. pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
  2699. if (oldpde & PG_MANAGED) {
  2700. pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
  2701. pmap_pvh_free(pvh, pmap, sva);
  2702. eva = sva + NBPDR;
  2703. for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
  2704. va < eva; va += PAGE_SIZE, m++) {
  2705. if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
  2706. vm_page_dirty(m);
  2707. if (oldpde & PG_A)
  2708. vm_page_aflag_set(m, PGA_REFERENCED);
  2709. if (TAILQ_EMPTY(&m->md.pv_list) &&
  2710. TAILQ_EMPTY(&pvh->pv_list))
  2711. vm_page_aflag_clear(m, PGA_WRITEABLE);
  2712. }
  2713. }
  2714. if (pmap == kernel_pmap) {
  2715. pmap_remove_kernel_pde(pmap, pdq, sva);
  2716. } else {
  2717. mpte = pmap_remove_pt_page(pmap, sva);
  2718. if (mpte != NULL) {
  2719. KASSERT(mpte->valid == VM_PAGE_BITS_ALL,
  2720. ("pmap_remove_pde: pte page not promoted"));
  2721. pmap->pm_stats.resident_count--;
  2722. KASSERT(mpte->wire_count == NPTEPG,
  2723. ("pmap_remove_pde: pte page wire count error"));
  2724. mpte->wire_count = 0;
  2725. pmap_add_delayed_free_list(mpte, free, FALSE);
  2726. }
  2727. }
  2728. }
  2729. /*
  2730. * pmap_remove_pte: do the things to unmap a page in a process
  2731. */
  2732. static int
  2733. pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
  2734. struct spglist *free)
  2735. {
  2736. pt_entry_t oldpte;
  2737. vm_page_t m;
  2738. rw_assert(&pvh_global_lock, RA_WLOCKED);
  2739. PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  2740. oldpte = pte_load_clear(ptq);
  2741. KASSERT(oldpte != 0,
  2742. ("pmap_remove_pte: pmap %p va %x zero pte", pmap, va));
  2743. if (oldpte & PG_W)
  2744. pmap->pm_stats.wired_count -= 1;
  2745. /*
2746. * Machines that don't support invlpg also don't support
  2747. * PG_G.
  2748. */
  2749. if (oldpte & PG_G)
  2750. pmap_invalidate_page(kernel_pmap, va);
  2751. pmap->pm_stats.resident_count -= 1;
  2752. if (oldpte & PG_MANAGED) {
  2753. m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
  2754. if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
  2755. vm_page_dirty(m);
  2756. if (oldpte & PG_A)
  2757. vm_page_aflag_set(m, PGA_REFERENCED);
  2758. pmap_remove_entry(pmap, m, va);
  2759. }
  2760. return (pmap_unuse_pt(pmap, va, free));
  2761. }
  2762. /*
  2763. * Remove a single page from a process address space
  2764. */
  2765. static void
  2766. pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free)
  2767. {
  2768. pt_entry_t *pte;
  2769. rw_assert(&pvh_global_lock, RA_WLOCKED);
  2770. KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
  2771. PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  2772. if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0)
  2773. return;
  2774. pmap_remove_pte(pmap, pte, va, free);
  2775. pmap_invalidate_page(pmap, va);
  2776. }
  2777. /*
  2778. * Removes the specified range of addresses from the page table page.
  2779. */
  2780. static bool
  2781. pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
  2782. struct spglist *free)
  2783. {
  2784. pt_entry_t *pte;
  2785. bool anyvalid;
  2786. rw_assert(&pvh_global_lock, RA_WLOCKED);
  2787. KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
  2788. PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  2789. anyvalid = false;
  2790. for (pte = pmap_pte_quick(pmap, sva); sva != eva; pte++,
  2791. sva += PAGE_SIZE) {
  2792. if (*pte == 0)
  2793. continue;
  2794. /*
  2795. * The TLB entry for a PG_G mapping is invalidated by
  2796. * pmap_remove_pte().
  2797. */
  2798. if ((*pte & PG_G) == 0)
  2799. anyvalid = true;
  2800. if (pmap_remove_pte(pmap, pte, sva, free))
  2801. break;
  2802. }
  2803. return (anyvalid);
  2804. }
  2805. /*
  2806. * Remove the given range of addresses from the specified map.
  2807. *
  2808. * It is assumed that the start and end are properly
  2809. * rounded to the page size.
  2810. */
  2811. void
  2812. pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
  2813. {
  2814. vm_offset_t pdnxt;
  2815. pd_entry_t ptpaddr;
  2816. struct spglist free;
  2817. int anyvalid;
  2818. /*
  2819. * Perform an unsynchronized read. This is, however, safe.
  2820. */
  2821. if (pmap->pm_stats.resident_count == 0)
  2822. return;
  2823. anyvalid = 0;
  2824. SLIST_INIT(&free);
  2825. rw_wlock(&pvh_global_lock);
  2826. sched_pin();
  2827. PMAP_LOCK(pmap);
  2828. /*
2829. * Special handling for removing a single page: this is a very
2830. * common operation, and it is easy to short-circuit some
2831. * code for it.
  2832. */
  2833. if ((sva + PAGE_SIZE == eva) &&
  2834. ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
  2835. pmap_remove_page(pmap, sva, &free);
  2836. goto out;
  2837. }
  2838. for (; sva < eva; sva = pdnxt) {
  2839. u_int pdirindex;
  2840. /*
  2841. * Calculate index for next page table.
  2842. */
  2843. pdnxt = (sva + NBPDR) & ~PDRMASK;
  2844. if (pdnxt < sva)
  2845. pdnxt = eva;
  2846. if (pmap->pm_stats.resident_count == 0)
  2847. break;
  2848. pdirindex = sva >> PDRSHIFT;
  2849. ptpaddr = pmap->pm_pdir[pdirindex];
  2850. /*
  2851. * Weed out invalid mappings. Note: we assume that the page
  2852. * directory table is always allocated, and in kernel virtual.
  2853. */
  2854. if (ptpaddr == 0)
  2855. continue;
  2856. /*
  2857. * Check for large page.
  2858. */
  2859. if ((ptpaddr & PG_PS) != 0) {
  2860. /*
  2861. * Are we removing the entire large page? If not,
  2862. * demote the mapping and fall through.
  2863. */
  2864. if (sva + NBPDR == pdnxt && eva >= pdnxt) {
  2865. /*
  2866. * The TLB entry for a PG_G mapping is
  2867. * invalidated by pmap_remove_pde().
  2868. */
  2869. if ((ptpaddr & PG_G) == 0)
  2870. anyvalid = 1;
  2871. pmap_remove_pde(pmap,
  2872. &pmap->pm_pdir[pdirindex], sva, &free);
  2873. continue;
  2874. } else if (!pmap_demote_pde(pmap,
  2875. &pmap->pm_pdir[pdirindex], sva)) {
  2876. /* The large page mapping was destroyed. */
  2877. continue;
  2878. }
  2879. }
  2880. /*
  2881. * Limit our scan to either the end of the va represented
  2882. * by the current page table page, or to the end of the
  2883. * range being removed.
  2884. */
  2885. if (pdnxt > eva)
  2886. pdnxt = eva;
  2887. if (pmap_remove_ptes(pmap, sva, pdnxt, &free))
  2888. anyvalid = 1;
  2889. }
  2890. out:
  2891. sched_unpin();
  2892. if (anyvalid)
  2893. pmap_invalidate_all(pmap);
  2894. rw_wunlock(&pvh_global_lock);
  2895. PMAP_UNLOCK(pmap);
  2896. vm_page_free_pages_toq(&free, true);
  2897. }
  2898. /*
  2899. * Routine: pmap_remove_all
  2900. * Function:
  2901. * Removes this physical page from
  2902. * all physical maps in which it resides.
  2903. * Reflects back modify bits to the pager.
  2904. *
  2905. * Notes:
  2906. * Original versions of this routine were very
  2907. * inefficient because they iteratively called
  2908. * pmap_remove (slow...)
  2909. */
  2910. void
  2911. pmap_remove_all(vm_page_t m)
  2912. {
  2913. struct md_page *pvh;
  2914. pv_entry_t pv;
  2915. pmap_t pmap;
  2916. pt_entry_t *pte, tpte;
  2917. pd_entry_t *pde;
  2918. vm_offset_t va;
  2919. struct spglist free;
  2920. KASSERT((m->oflags & VPO_UNMANAGED) == 0,
  2921. ("pmap_remove_all: page %p is not managed", m));
  2922. SLIST_INIT(&free);
  2923. rw_wlock(&pvh_global_lock);
  2924. sched_pin();
  2925. if ((m->flags & PG_FICTITIOUS) != 0)
  2926. goto small_mappings;
  2927. pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
  2928. while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
  2929. va = pv->pv_va;
  2930. pmap = PV_PMAP(pv);
  2931. PMAP_LOCK(pmap);
  2932. pde = pmap_pde(pmap, va);
  2933. (void)pmap_demote_pde(pmap, pde, va);
  2934. PMAP_UNLOCK(pmap);
  2935. }
  2936. small_mappings:
  2937. while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
  2938. pmap = PV_PMAP(pv);
  2939. PMAP_LOCK(pmap);
  2940. pmap->pm_stats.resident_count--;
  2941. pde = pmap_pde(pmap, pv->pv_va);
  2942. KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
  2943. " a 4mpage in page %p's pv list", m));
  2944. pte = pmap_pte_quick(pmap, pv->pv_va);
  2945. tpte = pte_load_clear(pte);
  2946. KASSERT(tpte != 0, ("pmap_remove_all: pmap %p va %x zero pte",
  2947. pmap, pv->pv_va));
  2948. if (tpte & PG_W)
  2949. pmap->pm_stats.wired_count--;
  2950. if (tpte & PG_A)
  2951. vm_page_aflag_set(m, PGA_REFERENCED);
  2952. /*
  2953. * Update the vm_page_t clean and reference bits.
  2954. */
  2955. if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
  2956. vm_page_dirty(m);
  2957. pmap_unuse_pt(pmap, pv->pv_va, &free);
  2958. pmap_invalidate_page(pmap, pv->pv_va);
  2959. TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
  2960. free_pv_entry(pmap, pv);
  2961. PMAP_UNLOCK(pmap);
  2962. }
  2963. vm_page_aflag_clear(m, PGA_WRITEABLE);
  2964. sched_unpin();
  2965. rw_wunlock(&pvh_global_lock);
  2966. vm_page_free_pages_toq(&free, true);
  2967. }
  2968. /*
  2969. * pmap_protect_pde: do the things to protect a 4mpage in a process
  2970. */
  2971. static boolean_t
  2972. pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
  2973. {
  2974. pd_entry_t newpde, oldpde;
  2975. vm_page_t m, mt;
  2976. boolean_t anychanged;
  2977. PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  2978. KASSERT((sva & PDRMASK) == 0,
  2979. ("pmap_protect_pde: sva is not 4mpage aligned"));
  2980. anychanged = FALSE;
  2981. retry:
  2982. oldpde = newpde = *pde;
  2983. if ((prot & VM_PROT_WRITE) == 0) {
  2984. if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) ==
  2985. (PG_MANAGED | PG_M | PG_RW)) {
  2986. m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
  2987. for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
  2988. vm_page_dirty(mt);
  2989. }
  2990. newpde &= ~(PG_RW | PG_M);
  2991. }
  2992. #if defined(PAE) || defined(PAE_TABLES)
  2993. if ((prot & VM_PROT_EXECUTE) == 0)
  2994. newpde |= pg_nx;
  2995. #endif
  2996. if (newpde != oldpde) {
  2997. /*
  2998. * As an optimization to future operations on this PDE, clear
  2999. * PG_PROMOTED. The impending invalidation will remove any
  3000. * lingering 4KB page mappings from the TLB.
  3001. */
  3002. if (!pde_cmpset(pde, oldpde, newpde & ~PG_PROMOTED))
  3003. goto retry;
  3004. if ((oldpde & PG_G) != 0)
  3005. pmap_invalidate_pde_page(kernel_pmap, sva, oldpde);
  3006. else
  3007. anychanged = TRUE;
  3008. }
  3009. return (anychanged);
  3010. }
  3011. /*
  3012. * Set the physical protection on the
  3013. * specified range of this map as requested.
  3014. */
  3015. void
  3016. pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
  3017. {
  3018. vm_offset_t pdnxt;
  3019. pd_entry_t ptpaddr;
  3020. pt_entry_t *pte;
  3021. boolean_t anychanged, pv_lists_locked;
  3022. KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
  3023. if (prot == VM_PROT_NONE) {
  3024. pmap_remove(pmap, sva, eva);
  3025. return;
  3026. }
  3027. #if defined(PAE) || defined(PAE_TABLES)
  3028. if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
  3029. (VM_PROT_WRITE|VM_PROT_EXECUTE))
  3030. return;
  3031. #else
  3032. if (prot & VM_PROT_WRITE)
  3033. return;
  3034. #endif
  3035. if (pmap_is_current(pmap))
  3036. pv_lists_locked = FALSE;
  3037. else {
  3038. pv_lists_locked = TRUE;
  3039. resume:
  3040. rw_wlock(&pvh_global_lock);
  3041. sched_pin();
  3042. }
  3043. anychanged = FALSE;
  3044. PMAP_LOCK(pmap);
  3045. for (; sva < eva; sva = pdnxt) {
  3046. pt_entry_t obits, pbits;
  3047. u_int pdirindex;
  3048. pdnxt = (sva + NBPDR) & ~PDRMASK;
  3049. if (pdnxt < sva)
  3050. pdnxt = eva;
  3051. pdirindex = sva >> PDRSHIFT;
  3052. ptpaddr = pmap->pm_pdir[pdirindex];
  3053. /*
  3054. * Weed out invalid mappings. Note: we assume that the page
  3055. * directory table is always allocated, and in kernel virtual.
  3056. */
  3057. if (ptpaddr == 0)
  3058. continue;
  3059. /*
  3060. * Check for large page.
  3061. */
  3062. if ((ptpaddr & PG_PS) != 0) {
  3063. /*
  3064. * Are we protecting the entire large page? If not,
  3065. * demote the mapping and fall through.
  3066. */
  3067. if (sva + NBPDR == pdnxt && eva >= pdnxt) {
  3068. /*
  3069. * The TLB entry for a PG_G mapping is
  3070. * invalidated by pmap_protect_pde().
  3071. */
  3072. if (pmap_protect_pde(pmap,
  3073. &pmap->pm_pdir[pdirindex], sva, prot))
  3074. anychanged = TRUE;
  3075. continue;
  3076. } else {
  3077. if (!pv_lists_locked) {
  3078. pv_lists_locked = TRUE;
  3079. if (!rw_try_wlock(&pvh_global_lock)) {
  3080. if (anychanged)
  3081. pmap_invalidate_all(
  3082. pmap);
  3083. PMAP_UNLOCK(pmap);
  3084. goto resume;
  3085. }
  3086. sched_pin();
  3087. }
  3088. if (!pmap_demote_pde(pmap,
  3089. &pmap->pm_pdir[pdirindex], sva)) {
  3090. /*
  3091. * The large page mapping was
  3092. * destroyed.
  3093. */
  3094. continue;
  3095. }
  3096. }
  3097. }
  3098. if (pdnxt > eva)
  3099. pdnxt = eva;
  3100. for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
  3101. sva += PAGE_SIZE) {
  3102. vm_page_t m;
  3103. retry:
  3104. /*
  3105. * Regardless of whether a pte is 32 or 64 bits in
  3106. * size, PG_RW, PG_A, and PG_M are among the least
  3107. * significant 32 bits.
  3108. */
  3109. obits = pbits = *pte;
  3110. if ((pbits & PG_V) == 0)
  3111. continue;
  3112. if ((prot & VM_PROT_WRITE) == 0) {
  3113. if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
  3114. (PG_MANAGED | PG_M | PG_RW)) {
  3115. m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
  3116. vm_page_dirty(m);
  3117. }
  3118. pbits &= ~(PG_RW | PG_M);
  3119. }
  3120. #if defined(PAE) || defined(PAE_TABLES)
  3121. if ((prot & VM_PROT_EXECUTE) == 0)
  3122. pbits |= pg_nx;
  3123. #endif
  3124. if (pbits != obits) {
  3125. #if defined(PAE) || defined(PAE_TABLES)
  3126. if (!atomic_cmpset_64(pte, obits, pbits))
  3127. goto retry;
  3128. #else
  3129. if (!atomic_cmpset_int((u_int *)pte, obits,
  3130. pbits))
  3131. goto retry;
  3132. #endif
  3133. if (obits & PG_G)
  3134. pmap_invalidate_page(pmap, sva);
  3135. else
  3136. anychanged = TRUE;
  3137. }
  3138. }
  3139. }
  3140. if (anychanged)
  3141. pmap_invalidate_all(pmap);
  3142. if (pv_lists_locked) {
  3143. sched_unpin();
  3144. rw_wunlock(&pvh_global_lock);
  3145. }
  3146. PMAP_UNLOCK(pmap);
  3147. }
  3148. #if VM_NRESERVLEVEL > 0
  3149. /*
3150. * Tries to promote the 512 or 1024 contiguous 4KB page mappings that are
  3151. * within a single page table page (PTP) to a single 2- or 4MB page mapping.
  3152. * For promotion to occur, two conditions must be met: (1) the 4KB page
  3153. * mappings must map aligned, contiguous physical memory and (2) the 4KB page
  3154. * mappings must have identical characteristics.
  3155. *
  3156. * Managed (PG_MANAGED) mappings within the kernel address space are not
  3157. * promoted. The reason is that kernel PDEs are replicated in each pmap but
  3158. * pmap_clear_ptes() and pmap_ts_referenced() only read the PDE from the kernel
  3159. * pmap.
  3160. */
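/*
 * As a concrete illustration (non-PAE numbers): a PTP spans NBPDR = 4MB, so
 * promotion requires that PTE i in the PTP map physical address
 * base + i * PAGE_SIZE for some 4MB-aligned base, and that all 1024 PTEs
 * agree on the attribute bits in PG_PTE_PROMOTE.  Under PAE the same check
 * covers 512 PTEs and a 2MB superpage.
 */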
  3161. static void
  3162. pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
  3163. {
  3164. pd_entry_t newpde;
  3165. pt_entry_t *firstpte, oldpte, pa, *pte;
  3166. vm_offset_t oldpteva;
  3167. vm_page_t mpte;
  3168. PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  3169. /*
  3170. * Examine the first PTE in the specified PTP. Abort if this PTE is
  3171. * either invalid, unused, or does not map the first 4KB physical page
  3172. * within a 2- or 4MB page.
  3173. */
  3174. firstpte = pmap_pte_quick(pmap, trunc_4mpage(va));
  3175. setpde:
  3176. newpde = *firstpte;
  3177. if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
  3178. pmap_pde_p_failures++;
  3179. CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
  3180. " in pmap %p", va, pmap);
  3181. return;
  3182. }
  3183. if ((*firstpte & PG_MANAGED) != 0 && pmap == kernel_pmap) {
  3184. pmap_pde_p_failures++;
  3185. CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
  3186. " in pmap %p", va, pmap);
  3187. return;
  3188. }
  3189. if ((newpde & (PG_M | PG_RW)) == PG_RW) {
  3190. /*
  3191. * When PG_M is already clear, PG_RW can be cleared without
  3192. * a TLB invalidation.
  3193. */
  3194. if (!atomic_cmpset_int((u_int *)firstpte, newpde, newpde &
  3195. ~PG_RW))
  3196. goto setpde;
  3197. newpde &= ~PG_RW;
  3198. }
  3199. /*
  3200. * Examine each of the other PTEs in the specified PTP. Abort if this
  3201. * PTE maps an unexpected 4KB physical page or does not have identical
  3202. * characteristics to the first PTE.
  3203. */
  3204. pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
  3205. for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
  3206. setpte:
  3207. oldpte = *pte;
  3208. if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
  3209. pmap_pde_p_failures++;
  3210. CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
  3211. " in pmap %p", va, pmap);
  3212. return;
  3213. }
  3214. if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
  3215. /*
  3216. * When PG_M is already clear, PG_RW can be cleared
  3217. * without a TLB invalidation.
  3218. */
  3219. if (!atomic_cmpset_int((u_int *)pte, oldpte,
  3220. oldpte & ~PG_RW))
  3221. goto setpte;
  3222. oldpte &= ~PG_RW;
  3223. oldpteva = (oldpte & PG_FRAME & PDRMASK) |
  3224. (va & ~PDRMASK);
  3225. CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#x"
  3226. " in pmap %p", oldpteva, pmap);
  3227. }
  3228. if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
  3229. pmap_pde_p_failures++;
  3230. CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
  3231. " in pmap %p", va, pmap);
  3232. return;
  3233. }
  3234. pa -= PAGE_SIZE;
  3235. }
  3236. /*
  3237. * Save the page table page in its current state until the PDE
  3238. * mapping the superpage is demoted by pmap_demote_pde() or
  3239. * destroyed by pmap_remove_pde().
  3240. */
  3241. mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
  3242. KASSERT(mpte >= vm_page_array &&
  3243. mpte < &vm_page_array[vm_page_array_size],
  3244. ("pmap_promote_pde: page table page is out of range"));
  3245. KASSERT(mpte->pindex == va >> PDRSHIFT,
  3246. ("pmap_promote_pde: page table page's pindex is wrong"));
  3247. if (pmap_insert_pt_page(pmap, mpte, true)) {
  3248. pmap_pde_p_failures++;
  3249. CTR2(KTR_PMAP,
  3250. "pmap_promote_pde: failure for va %#x in pmap %p", va,
  3251. pmap);
  3252. return;
  3253. }
  3254. /*
  3255. * Promote the pv entries.
  3256. */
  3257. if ((newpde & PG_MANAGED) != 0)
  3258. pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME);
  3259. /*
  3260. * Propagate the PAT index to its proper position.
  3261. */
  3262. if ((newpde & PG_PTE_PAT) != 0)
  3263. newpde ^= PG_PDE_PAT | PG_PTE_PAT;
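/*
 * (Bit-layout note: in a 4KB PTE the PAT selector is bit 7 (PG_PTE_PAT),
 * but bit 7 of a PDE is PG_PS, so in a 2/4MB PDE the selector lives at
 * bit 12 (PG_PDE_PAT).  The XOR above clears the bit in its PTE position
 * and sets it in its PDE position in a single step.)
 */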
  3264. /*
  3265. * Map the superpage.
  3266. */
  3267. if (workaround_erratum383)
  3268. pmap_update_pde(pmap, va, pde, PG_PS | newpde);
  3269. else if (pmap == kernel_pmap)
  3270. pmap_kenter_pde(va, PG_PROMOTED | PG_PS | newpde);
  3271. else
  3272. pde_store(pde, PG_PROMOTED | PG_PS | newpde);
  3273. pmap_pde_promotions++;
  3274. CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#x"
  3275. " in pmap %p", va, pmap);
  3276. }
  3277. #endif /* VM_NRESERVLEVEL > 0 */
  3278. /*
  3279. * Insert the given physical page (p) at
  3280. * the specified virtual address (v) in the
  3281. * target physical map with the protection requested.
  3282. *
  3283. * If specified, the page will be wired down, meaning
  3284. * that the related pte can not be reclaimed.
  3285. *
  3286. * NB: This is the only routine which MAY NOT lazy-evaluate
  3287. * or lose information. That is, this routine must actually
  3288. * insert this page into the given map NOW.
  3289. */
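/*
 * A hypothetical caller (illustrative only; "wired" is the caller's own
 * variable) might invoke this routine as
 *
 *	rv = pmap_enter(pmap, va, m, prot,
 *	    prot | (wired ? PMAP_ENTER_WIRED : 0), 0);
 *
 * The low bits of "flags" carry the access type (VM_PROT_*), "psind"
 * selects a 4KB (0) or 2/4MB (1) mapping, and KERN_RESOURCE_SHORTAGE
 * typically means that PMAP_ENTER_NOSLEEP or PMAP_ENTER_NORECLAIM
 * prevented a required allocation.
 */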
  3290. int
  3291. pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
  3292. u_int flags, int8_t psind)
  3293. {
  3294. pd_entry_t *pde;
  3295. pt_entry_t *pte;
  3296. pt_entry_t newpte, origpte;
  3297. pv_entry_t pv;
  3298. vm_paddr_t opa, pa;
  3299. vm_page_t mpte, om;
  3300. int rv;
  3301. va = trunc_page(va);
  3302. KASSERT((pmap == kernel_pmap && va < VM_MAX_KERNEL_ADDRESS) ||
  3303. (pmap != kernel_pmap && va < VM_MAXUSER_ADDRESS),
  3304. ("pmap_enter: toobig k%d %#x", pmap == kernel_pmap, va));
  3305. KASSERT(va < PMAP_TRM_MIN_ADDRESS,
  3306. ("pmap_enter: invalid to pmap_enter into trampoline (va: 0x%x)",
  3307. va));
  3308. KASSERT(pmap != kernel_pmap || (m->oflags & VPO_UNMANAGED) != 0 ||
  3309. va < kmi.clean_sva || va >= kmi.clean_eva,
  3310. ("pmap_enter: managed mapping within the clean submap"));
  3311. if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
  3312. VM_OBJECT_ASSERT_LOCKED(m->object);
  3313. KASSERT((flags & PMAP_ENTER_RESERVED) == 0,
  3314. ("pmap_enter: flags %u has reserved bits set", flags));
  3315. pa = VM_PAGE_TO_PHYS(m);
  3316. newpte = (pt_entry_t)(pa | PG_A | PG_V);
  3317. if ((flags & VM_PROT_WRITE) != 0)
  3318. newpte |= PG_M;
  3319. if ((prot & VM_PROT_WRITE) != 0)
  3320. newpte |= PG_RW;
  3321. KASSERT((newpte & (PG_M | PG_RW)) != PG_M,
  3322. ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't"));
  3323. #if defined(PAE) || defined(PAE_TABLES)
  3324. if ((prot & VM_PROT_EXECUTE) == 0)
  3325. newpte |= pg_nx;
  3326. #endif
  3327. if ((flags & PMAP_ENTER_WIRED) != 0)
  3328. newpte |= PG_W;
  3329. if (pmap != kernel_pmap)
  3330. newpte |= PG_U;
  3331. newpte |= pmap_cache_bits(pmap, m->md.pat_mode, psind > 0);
  3332. if ((m->oflags & VPO_UNMANAGED) == 0)
  3333. newpte |= PG_MANAGED;
  3334. rw_wlock(&pvh_global_lock);
  3335. PMAP_LOCK(pmap);
  3336. sched_pin();
  3337. if (psind == 1) {
  3338. /* Assert the required virtual and physical alignment. */
  3339. KASSERT((va & PDRMASK) == 0, ("pmap_enter: va unaligned"));
  3340. KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
  3341. rv = pmap_enter_pde(pmap, va, newpte | PG_PS, flags, m);
  3342. goto out;
  3343. }
  3344. pde = pmap_pde(pmap, va);
  3345. if (pmap != kernel_pmap) {
  3346. /*
  3347. * va is for UVA.
  3348. * In the case that a page table page is not resident,
  3349. * we are creating it here. pmap_allocpte() handles
  3350. * demotion.
  3351. */
  3352. mpte = pmap_allocpte(pmap, va, flags);
  3353. if (mpte == NULL) {
  3354. KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0,
  3355. ("pmap_allocpte failed with sleep allowed"));
  3356. rv = KERN_RESOURCE_SHORTAGE;
  3357. goto out;
  3358. }
  3359. } else {
  3360. /*
  3361. * va is for KVA, so pmap_demote_pde() will never fail
  3362. * to install a page table page. PG_V is also
  3363. * asserted by pmap_demote_pde().
  3364. */
  3365. mpte = NULL;
  3366. KASSERT(pde != NULL && (*pde & PG_V) != 0,
  3367. ("KVA %#x invalid pde pdir %#jx", va,
  3368. (uintmax_t)pmap->pm_pdir[PTDPTDI]));
  3369. if ((*pde & PG_PS) != 0)
  3370. pmap_demote_pde(pmap, pde, va);
  3371. }
  3372. pte = pmap_pte_quick(pmap, va);
  3373. /*
  3374. * Page Directory table entry is not valid, which should not
  3375. * happen. We should have either allocated the page table
  3376. * page or demoted the existing mapping above.
  3377. */
  3378. if (pte == NULL) {
  3379. panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x",
  3380. (uintmax_t)pmap->pm_pdir[PTDPTDI], va);
  3381. }
  3382. origpte = *pte;
  3383. pv = NULL;
  3384. /*
  3385. * Is the specified virtual address already mapped?
  3386. */
  3387. if ((origpte & PG_V) != 0) {
  3388. /*
  3389. * Wiring change, just update stats. We don't worry about
  3390. * wiring PT pages as they remain resident as long as there
  3391. * are valid mappings in them. Hence, if a user page is wired,
  3392. * the PT page will be also.
  3393. */
  3394. if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0)
  3395. pmap->pm_stats.wired_count++;
  3396. else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0)
  3397. pmap->pm_stats.wired_count--;
  3398. /*
  3399. * Remove the extra PT page reference.
  3400. */
  3401. if (mpte != NULL) {
  3402. mpte->wire_count--;
  3403. KASSERT(mpte->wire_count > 0,
  3404. ("pmap_enter: missing reference to page table page,"
  3405. " va: 0x%x", va));
  3406. }
  3407. /*
  3408. * Has the physical page changed?
  3409. */
  3410. opa = origpte & PG_FRAME;
  3411. if (opa == pa) {
  3412. /*
  3413. * No, might be a protection or wiring change.
  3414. */
  3415. if ((origpte & PG_MANAGED) != 0 &&
  3416. (newpte & PG_RW) != 0)
  3417. vm_page_aflag_set(m, PGA_WRITEABLE);
  3418. if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0)
  3419. goto unchanged;
  3420. goto validate;
  3421. }
  3422. /*
  3423. * The physical page has changed. Temporarily invalidate
  3424. * the mapping. This ensures that all threads sharing the
  3425. * pmap keep a consistent view of the mapping, which is
  3426. * necessary for the correct handling of COW faults. It
  3427. * also permits reuse of the old mapping's PV entry,
  3428. * avoiding an allocation.
  3429. *
  3430. * For consistency, handle unmanaged mappings the same way.
  3431. */
  3432. origpte = pte_load_clear(pte);
  3433. KASSERT((origpte & PG_FRAME) == opa,
  3434. ("pmap_enter: unexpected pa update for %#x", va));
  3435. if ((origpte & PG_MANAGED) != 0) {
  3436. om = PHYS_TO_VM_PAGE(opa);
  3437. /*
  3438. * The pmap lock is sufficient to synchronize with
  3439. * concurrent calls to pmap_page_test_mappings() and
  3440. * pmap_ts_referenced().
  3441. */
  3442. if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
  3443. vm_page_dirty(om);
  3444. if ((origpte & PG_A) != 0)
  3445. vm_page_aflag_set(om, PGA_REFERENCED);
  3446. pv = pmap_pvh_remove(&om->md, pmap, va);
  3447. KASSERT(pv != NULL,
  3448. ("pmap_enter: no PV entry for %#x", va));
  3449. if ((newpte & PG_MANAGED) == 0)
  3450. free_pv_entry(pmap, pv);
  3451. if ((om->aflags & PGA_WRITEABLE) != 0 &&
  3452. TAILQ_EMPTY(&om->md.pv_list) &&
  3453. ((om->flags & PG_FICTITIOUS) != 0 ||
  3454. TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
  3455. vm_page_aflag_clear(om, PGA_WRITEABLE);
  3456. }
  3457. if ((origpte & PG_A) != 0)
  3458. pmap_invalidate_page(pmap, va);
  3459. origpte = 0;
  3460. } else {
  3461. /*
  3462. * Increment the counters.
  3463. */
  3464. if ((newpte & PG_W) != 0)
  3465. pmap->pm_stats.wired_count++;
  3466. pmap->pm_stats.resident_count++;
  3467. }
  3468. /*
  3469. * Enter on the PV list if part of our managed memory.
  3470. */
  3471. if ((newpte & PG_MANAGED) != 0) {
  3472. if (pv == NULL) {
  3473. pv = get_pv_entry(pmap, FALSE);
  3474. pv->pv_va = va;
  3475. }
  3476. TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
  3477. if ((newpte & PG_RW) != 0)
  3478. vm_page_aflag_set(m, PGA_WRITEABLE);
  3479. }
  3480. /*
  3481. * Update the PTE.
  3482. */
  3483. if ((origpte & PG_V) != 0) {
  3484. validate:
  3485. origpte = pte_load_store(pte, newpte);
  3486. KASSERT((origpte & PG_FRAME) == pa,
  3487. ("pmap_enter: unexpected pa update for %#x", va));
  3488. if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) ==
  3489. (PG_M | PG_RW)) {
  3490. if ((origpte & PG_MANAGED) != 0)
  3491. vm_page_dirty(m);
  3492. /*
  3493. * Although the PTE may still have PG_RW set, TLB
  3494. * invalidation may nonetheless be required because
  3495. * the PTE no longer has PG_M set.
  3496. */
  3497. }
  3498. #if defined(PAE) || defined(PAE_TABLES)
  3499. else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) {
  3500. /*
  3501. * This PTE change does not require TLB invalidation.
  3502. */
  3503. goto unchanged;
  3504. }
  3505. #endif
  3506. if ((origpte & PG_A) != 0)
  3507. pmap_invalidate_page(pmap, va);
  3508. } else
  3509. pte_store(pte, newpte);
  3510. unchanged:
  3511. #if VM_NRESERVLEVEL > 0
  3512. /*
  3513. * If both the page table page and the reservation are fully
  3514. * populated, then attempt promotion.
  3515. */
  3516. if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
  3517. pg_ps_enabled && (m->flags & PG_FICTITIOUS) == 0 &&
  3518. vm_reserv_level_iffullpop(m) == 0)
  3519. pmap_promote_pde(pmap, pde, va);
  3520. #endif
  3521. rv = KERN_SUCCESS;
  3522. out:
  3523. sched_unpin();
  3524. rw_wunlock(&pvh_global_lock);
  3525. PMAP_UNLOCK(pmap);
  3526. return (rv);
  3527. }
  3528. /*
  3529. * Tries to create a read- and/or execute-only 2 or 4 MB page mapping. Returns
  3530. * true if successful. Returns false if (1) a mapping already exists at the
  3531. * specified virtual address or (2) a PV entry cannot be allocated without
  3532. * reclaiming another PV entry.
  3533. */
  3534. static bool
  3535. pmap_enter_4mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
  3536. {
  3537. pd_entry_t newpde;
  3538. PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  3539. newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) |
  3540. PG_PS | PG_V;
  3541. if ((m->oflags & VPO_UNMANAGED) == 0)
  3542. newpde |= PG_MANAGED;
  3543. #if defined(PAE) || defined(PAE_TABLES)
  3544. if ((prot & VM_PROT_EXECUTE) == 0)
  3545. newpde |= pg_nx;
  3546. #endif
  3547. if (pmap != kernel_pmap)
  3548. newpde |= PG_U;
  3549. return (pmap_enter_pde(pmap, va, newpde, PMAP_ENTER_NOSLEEP |
  3550. PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL) ==
  3551. KERN_SUCCESS);
  3552. }
  3553. /*
  3554. * Tries to create the specified 2 or 4 MB page mapping. Returns KERN_SUCCESS
  3555. * if the mapping was created, and either KERN_FAILURE or
  3556. * KERN_RESOURCE_SHORTAGE otherwise. Returns KERN_FAILURE if
  3557. * PMAP_ENTER_NOREPLACE was specified and a mapping already exists at the
  3558. * specified virtual address. Returns KERN_RESOURCE_SHORTAGE if
  3559. * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed.
  3560. *
  3561. * The parameter "m" is only used when creating a managed, writeable mapping.
  3562. */
  3563. static int
  3564. pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags,
  3565. vm_page_t m)
  3566. {
  3567. struct spglist free;
  3568. pd_entry_t oldpde, *pde;
  3569. vm_page_t mt;
  3570. rw_assert(&pvh_global_lock, RA_WLOCKED);
  3571. KASSERT((newpde & (PG_M | PG_RW)) != PG_RW,
  3572. ("pmap_enter_pde: newpde is missing PG_M"));
  3573. PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  3574. pde = pmap_pde(pmap, va);
  3575. oldpde = *pde;
  3576. if ((oldpde & PG_V) != 0) {
  3577. if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
  3578. CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
  3579. " in pmap %p", va, pmap);
  3580. return (KERN_FAILURE);
  3581. }
  3582. /* Break the existing mapping(s). */
  3583. SLIST_INIT(&free);
  3584. if ((oldpde & PG_PS) != 0) {
  3585. /*
  3586. * If the PDE resulted from a promotion, then a
  3587. * reserved PT page could be freed.
  3588. */
  3589. (void)pmap_remove_pde(pmap, pde, va, &free);
  3590. if ((oldpde & PG_G) == 0)
  3591. pmap_invalidate_pde_page(pmap, va, oldpde);
  3592. } else {
  3593. if (pmap_remove_ptes(pmap, va, va + NBPDR, &free))
  3594. pmap_invalidate_all(pmap);
  3595. }
  3596. vm_page_free_pages_toq(&free, true);
  3597. if (pmap == kernel_pmap) {
  3598. /*
  3599. * Both pmap_remove_pde() and pmap_remove_ptes() will
  3600. * leave the kernel page table page zero filled.
  3601. */
  3602. mt = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
  3603. if (pmap_insert_pt_page(pmap, mt, false))
  3604. panic("pmap_enter_pde: trie insert failed");
  3605. } else
  3606. KASSERT(*pde == 0, ("pmap_enter_pde: non-zero pde %p",
  3607. pde));
  3608. }
  3609. if ((newpde & PG_MANAGED) != 0) {
  3610. /*
  3611. * Abort this mapping if its PV entry could not be created.
  3612. */
  3613. if (!pmap_pv_insert_pde(pmap, va, newpde, flags)) {
  3614. CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
  3615. " in pmap %p", va, pmap);
  3616. return (KERN_RESOURCE_SHORTAGE);
  3617. }
  3618. if ((newpde & PG_RW) != 0) {
  3619. for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
  3620. vm_page_aflag_set(mt, PGA_WRITEABLE);
  3621. }
  3622. }
  3623. /*
  3624. * Increment counters.
  3625. */
  3626. if ((newpde & PG_W) != 0)
  3627. pmap->pm_stats.wired_count += NBPDR / PAGE_SIZE;
  3628. pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE;
  3629. /*
  3630. * Map the superpage. (This is not a promoted mapping; there will not
  3631. * be any lingering 4KB page mappings in the TLB.)
  3632. */
  3633. pde_store(pde, newpde);
  3634. pmap_pde_mappings++;
  3635. CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
  3636. " in pmap %p", va, pmap);
  3637. return (KERN_SUCCESS);
  3638. }
  3639. /*
  3640. * Maps a sequence of resident pages belonging to the same object.
  3641. * The sequence begins with the given page m_start. This page is
  3642. * mapped at the given virtual address start. Each subsequent page is
  3643. * mapped at a virtual address that is offset from start by the same
  3644. * amount as the page is offset from m_start within the object. The
  3645. * last page in the sequence is the page with the largest offset from
  3646. * m_start that can be mapped at a virtual address less than the given
  3647. * virtual address end. Not every virtual page between start and end
  3648. * is mapped; only those for which a resident page exists with the
  3649. * corresponding offset from m_start are mapped.
  3650. */
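/*
 * For illustration: with start = 0x40000000 and m_start->pindex = 16, a
 * resident page with pindex 21 (diff = 5) is mapped at
 * 0x40000000 + ptoa(5) = 0x40005000, provided that address is below end.
 * Aligned 2/4MB runs backed by a fully populated reservation are instead
 * mapped with a single PDE via pmap_enter_4mpage() when pg_ps_enabled.
 */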
  3651. void
  3652. pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
  3653. vm_page_t m_start, vm_prot_t prot)
  3654. {
  3655. vm_offset_t va;
  3656. vm_page_t m, mpte;
  3657. vm_pindex_t diff, psize;
  3658. VM_OBJECT_ASSERT_LOCKED(m_start->object);
  3659. psize = atop(end - start);
  3660. mpte = NULL;
  3661. m = m_start;
  3662. rw_wlock(&pvh_global_lock);
  3663. PMAP_LOCK(pmap);
  3664. while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
  3665. va = start + ptoa(diff);
  3666. if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
  3667. m->psind == 1 && pg_ps_enabled &&
  3668. pmap_enter_4mpage(pmap, va, m, prot))
  3669. m = &m[NBPDR / PAGE_SIZE - 1];
  3670. else
  3671. mpte = pmap_enter_quick_locked(pmap, va, m, prot,
  3672. mpte);
  3673. m = TAILQ_NEXT(m, listq);
  3674. }
  3675. rw_wunlock(&pvh_global_lock);
  3676. PMAP_UNLOCK(pmap);
  3677. }
  3678. /*
3679. * This code makes some *MAJOR* assumptions:
3680. * 1. The current pmap and the target pmap exist.
  3681. * 2. Not wired.
  3682. * 3. Read access.
  3683. * 4. No page table pages.
  3684. * but is *MUCH* faster than pmap_enter...
  3685. */
  3686. void
  3687. pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
  3688. {
  3689. rw_wlock(&pvh_global_lock);
  3690. PMAP_LOCK(pmap);
  3691. (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL);
  3692. rw_wunlock(&pvh_global_lock);
  3693. PMAP_UNLOCK(pmap);
  3694. }
  3695. static vm_page_t
  3696. pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
  3697. vm_prot_t prot, vm_page_t mpte)
  3698. {
  3699. pt_entry_t newpte, *pte;
  3700. struct spglist free;
  3701. KASSERT(pmap != kernel_pmap || va < kmi.clean_sva ||
  3702. va >= kmi.clean_eva || (m->oflags & VPO_UNMANAGED) != 0,
  3703. ("pmap_enter_quick_locked: managed mapping within the clean submap"));
  3704. rw_assert(&pvh_global_lock, RA_WLOCKED);
  3705. PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  3706. /*
  3707. * In the case that a page table page is not
  3708. * resident, we are creating it here.
  3709. */
  3710. if (pmap != kernel_pmap) {
  3711. u_int ptepindex;
  3712. pd_entry_t ptepa;
  3713. /*
  3714. * Calculate pagetable page index
  3715. */
  3716. ptepindex = va >> PDRSHIFT;
  3717. if (mpte && (mpte->pindex == ptepindex)) {
  3718. mpte->wire_count++;
  3719. } else {
  3720. /*
  3721. * Get the page directory entry
  3722. */
  3723. ptepa = pmap->pm_pdir[ptepindex];
  3724. /*
  3725. * If the page table page is mapped, we just increment
  3726. * the hold count, and activate it.
  3727. */
  3728. if (ptepa) {
  3729. if (ptepa & PG_PS)
  3730. return (NULL);
  3731. mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
  3732. mpte->wire_count++;
  3733. } else {
  3734. mpte = _pmap_allocpte(pmap, ptepindex,
  3735. PMAP_ENTER_NOSLEEP);
  3736. if (mpte == NULL)
  3737. return (mpte);
  3738. }
  3739. }
  3740. } else {
  3741. mpte = NULL;
  3742. }
  3743. sched_pin();
  3744. pte = pmap_pte_quick(pmap, va);
  3745. if (*pte) {
  3746. if (mpte != NULL) {
  3747. mpte->wire_count--;
  3748. mpte = NULL;
  3749. }
  3750. sched_unpin();
  3751. return (mpte);
  3752. }
  3753. /*
  3754. * Enter on the PV list if part of our managed memory.
  3755. */
  3756. if ((m->oflags & VPO_UNMANAGED) == 0 &&
  3757. !pmap_try_insert_pv_entry(pmap, va, m)) {
  3758. if (mpte != NULL) {
  3759. SLIST_INIT(&free);
  3760. if (pmap_unwire_ptp(pmap, mpte, &free)) {
  3761. pmap_invalidate_page(pmap, va);
  3762. vm_page_free_pages_toq(&free, true);
  3763. }
  3764. mpte = NULL;
  3765. }
  3766. sched_unpin();
  3767. return (mpte);
  3768. }
  3769. /*
  3770. * Increment counters
  3771. */
  3772. pmap->pm_stats.resident_count++;
  3773. newpte = VM_PAGE_TO_PHYS(m) | PG_V |
  3774. pmap_cache_bits(pmap, m->md.pat_mode, 0);
  3775. if ((m->oflags & VPO_UNMANAGED) == 0)
  3776. newpte |= PG_MANAGED;
  3777. #if defined(PAE) || defined(PAE_TABLES)
  3778. if ((prot & VM_PROT_EXECUTE) == 0)
  3779. newpte |= pg_nx;
  3780. #endif
  3781. if (pmap != kernel_pmap)
  3782. newpte |= PG_U;
  3783. pte_store(pte, newpte);
  3784. sched_unpin();
  3785. return (mpte);
  3786. }
  3787. /*
  3788. * Make a temporary mapping for a physical address. This is only intended
  3789. * to be used for panic dumps.
  3790. */
  3791. void *
  3792. pmap_kenter_temporary(vm_paddr_t pa, int i)
  3793. {
  3794. vm_offset_t va;
  3795. va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
  3796. pmap_kenter(va, pa);
  3797. invlpg(va);
  3798. return ((void *)crashdumpmap);
  3799. }
  3800. /*
  3801. * This code maps large physical mmap regions into the
  3802. * processor address space. Note that some shortcuts
  3803. * are taken, but the code works.
  3804. */
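/*
 * For example, a 16MB device object (such as a framebuffer) whose first
 * page is 4MB-aligned, mapped at a 4MB-aligned "addr", can be covered by
 * just four PDEs on a non-PAE kernel, consuming neither page table pages
 * nor PV entries.  Any misalignment or discontiguity aborts this shortcut
 * and leaves the range to be populated by the normal fault path.
 */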
  3805. void
  3806. pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
  3807. vm_pindex_t pindex, vm_size_t size)
  3808. {
  3809. pd_entry_t *pde;
  3810. vm_paddr_t pa, ptepa;
  3811. vm_page_t p;
  3812. int pat_mode;
  3813. VM_OBJECT_ASSERT_WLOCKED(object);
  3814. KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
  3815. ("pmap_object_init_pt: non-device object"));
  3816. if (pg_ps_enabled &&
  3817. (addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
  3818. if (!vm_object_populate(object, pindex, pindex + atop(size)))
  3819. return;
  3820. p = vm_page_lookup(object, pindex);
  3821. KASSERT(p->valid == VM_PAGE_BITS_ALL,
  3822. ("pmap_object_init_pt: invalid page %p", p));
  3823. pat_mode = p->md.pat_mode;
  3824. /*
  3825. * Abort the mapping if the first page is not physically
  3826. * aligned to a 2/4MB page boundary.
  3827. */
  3828. ptepa = VM_PAGE_TO_PHYS(p);
  3829. if (ptepa & (NBPDR - 1))
  3830. return;
  3831. /*
  3832. * Skip the first page. Abort the mapping if the rest of
  3833. * the pages are not physically contiguous or have differing
  3834. * memory attributes.
  3835. */
  3836. p = TAILQ_NEXT(p, listq);
  3837. for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
  3838. pa += PAGE_SIZE) {
  3839. KASSERT(p->valid == VM_PAGE_BITS_ALL,
  3840. ("pmap_object_init_pt: invalid page %p", p));
  3841. if (pa != VM_PAGE_TO_PHYS(p) ||
  3842. pat_mode != p->md.pat_mode)
  3843. return;
  3844. p = TAILQ_NEXT(p, listq);
  3845. }
  3846. /*
  3847. * Map using 2/4MB pages. Since "ptepa" is 2/4M aligned and
  3848. * "size" is a multiple of 2/4M, adding the PAT setting to
  3849. * "pa" will not affect the termination of this loop.
  3850. */
  3851. PMAP_LOCK(pmap);
  3852. for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1);
  3853. pa < ptepa + size; pa += NBPDR) {
  3854. pde = pmap_pde(pmap, addr);
  3855. if (*pde == 0) {
  3856. pde_store(pde, pa | PG_PS | PG_M | PG_A |
  3857. PG_U | PG_RW | PG_V);
  3858. pmap->pm_stats.resident_count += NBPDR /
  3859. PAGE_SIZE;
  3860. pmap_pde_mappings++;
  3861. }
  3862. /* Else continue on if the PDE is already valid. */
  3863. addr += NBPDR;
  3864. }
  3865. PMAP_UNLOCK(pmap);
  3866. }
  3867. }
  3868. /*
  3869. * Clear the wired attribute from the mappings for the specified range of
  3870. * addresses in the given pmap. Every valid mapping within that range
  3871. * must have the wired attribute set. In contrast, invalid mappings
  3872. * cannot have the wired attribute set, so they are ignored.
  3873. *
  3874. * The wired attribute of the page table entry is not a hardware feature,
  3875. * so there is no need to invalidate any TLB entries.
  3876. */
  3877. void
  3878. pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
  3879. {
  3880. vm_offset_t pdnxt;
  3881. pd_entry_t *pde;
  3882. pt_entry_t *pte;
  3883. boolean_t pv_lists_locked;
  3884. if (pmap_is_current(pmap))
  3885. pv_lists_locked = FALSE;
  3886. else {
  3887. pv_lists_locked = TRUE;
  3888. resume:
  3889. rw_wlock(&pvh_global_lock);
  3890. sched_pin();
  3891. }
  3892. PMAP_LOCK(pmap);
  3893. for (; sva < eva; sva = pdnxt) {
  3894. pdnxt = (sva + NBPDR) & ~PDRMASK;
  3895. if (pdnxt < sva)
  3896. pdnxt = eva;
  3897. pde = pmap_pde(pmap, sva);
  3898. if ((*pde & PG_V) == 0)
  3899. continue;
  3900. if ((*pde & PG_PS) != 0) {
  3901. if ((*pde & PG_W) == 0)
  3902. panic("pmap_unwire: pde %#jx is missing PG_W",
  3903. (uintmax_t)*pde);
  3904. /*
  3905. * Are we unwiring the entire large page? If not,
  3906. * demote the mapping and fall through.
  3907. */
  3908. if (sva + NBPDR == pdnxt && eva >= pdnxt) {
  3909. /*
  3910. * Regardless of whether a pde (or pte) is 32
  3911. * or 64 bits in size, PG_W is among the least
  3912. * significant 32 bits.
  3913. */
  3914. atomic_clear_int((u_int *)pde, PG_W);
  3915. pmap->pm_stats.wired_count -= NBPDR /
  3916. PAGE_SIZE;
  3917. continue;
  3918. } else {
  3919. if (!pv_lists_locked) {
  3920. pv_lists_locked = TRUE;
  3921. if (!rw_try_wlock(&pvh_global_lock)) {
  3922. PMAP_UNLOCK(pmap);
  3923. /* Repeat sva. */
  3924. goto resume;
  3925. }
  3926. sched_pin();
  3927. }
  3928. if (!pmap_demote_pde(pmap, pde, sva))
  3929. panic("pmap_unwire: demotion failed");
  3930. }
  3931. }
  3932. if (pdnxt > eva)
  3933. pdnxt = eva;
  3934. for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
  3935. sva += PAGE_SIZE) {
  3936. if ((*pte & PG_V) == 0)
  3937. continue;
  3938. if ((*pte & PG_W) == 0)
  3939. panic("pmap_unwire: pte %#jx is missing PG_W",
  3940. (uintmax_t)*pte);
  3941. /*
  3942. * PG_W must be cleared atomically. Although the pmap
  3943. * lock synchronizes access to PG_W, another processor
  3944. * could be setting PG_M and/or PG_A concurrently.
  3945. *
  3946. * PG_W is among the least significant 32 bits.
  3947. */
  3948. atomic_clear_int((u_int *)pte, PG_W);
  3949. pmap->pm_stats.wired_count--;
  3950. }
  3951. }
  3952. if (pv_lists_locked) {
  3953. sched_unpin();
  3954. rw_wunlock(&pvh_global_lock);
  3955. }
  3956. PMAP_UNLOCK(pmap);
  3957. }
  3958. /*
  3959. * Copy the range specified by src_addr/len
  3960. * from the source map to the range dst_addr/len
  3961. * in the destination map.
  3962. *
  3963. * This routine is only advisory and need not do anything. Since
  3964. * current pmap is always the kernel pmap when executing in
  3965. * kernel, and we do not copy from the kernel pmap to a user
  3966. * pmap, this optimization is not usable in 4/4G full split i386
  3967. * world.
  3968. */
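/*
 * In practice this is reached at fork() time, where the child's range
 * coincides with the parent's; the dst_addr == src_addr check below
 * rejects everything else.
 */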
  3969. void
  3970. pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
  3971. vm_offset_t src_addr)
  3972. {
  3973. struct spglist free;
  3974. pt_entry_t *src_pte, *dst_pte, ptetemp;
  3975. pd_entry_t srcptepaddr;
  3976. vm_page_t dstmpte, srcmpte;
  3977. vm_offset_t addr, end_addr, pdnxt;
  3978. u_int ptepindex;
  3979. if (dst_addr != src_addr)
  3980. return;
  3981. end_addr = src_addr + len;
  3982. rw_wlock(&pvh_global_lock);
  3983. if (dst_pmap < src_pmap) {
  3984. PMAP_LOCK(dst_pmap);
  3985. PMAP_LOCK(src_pmap);
  3986. } else {
  3987. PMAP_LOCK(src_pmap);
  3988. PMAP_LOCK(dst_pmap);
  3989. }
  3990. sched_pin();
  3991. for (addr = src_addr; addr < end_addr; addr = pdnxt) {
  3992. KASSERT(addr < PMAP_TRM_MIN_ADDRESS,
  3993. ("pmap_copy: invalid to pmap_copy the trampoline"));
  3994. pdnxt = (addr + NBPDR) & ~PDRMASK;
  3995. if (pdnxt < addr)
  3996. pdnxt = end_addr;
  3997. ptepindex = addr >> PDRSHIFT;
  3998. srcptepaddr = src_pmap->pm_pdir[ptepindex];
  3999. if (srcptepaddr == 0)
  4000. continue;
  4001. if (srcptepaddr & PG_PS) {
  4002. if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr)
  4003. continue;
  4004. if (dst_pmap->pm_pdir[ptepindex] == 0 &&
  4005. ((srcptepaddr & PG_MANAGED) == 0 ||
  4006. pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr,
  4007. PMAP_ENTER_NORECLAIM))) {
  4008. dst_pmap->pm_pdir[ptepindex] = srcptepaddr &
  4009. ~PG_W;
  4010. dst_pmap->pm_stats.resident_count +=
  4011. NBPDR / PAGE_SIZE;
  4012. pmap_pde_mappings++;
  4013. }
  4014. continue;
  4015. }
  4016. srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME);
  4017. KASSERT(srcmpte->wire_count > 0,
  4018. ("pmap_copy: source page table page is unused"));
  4019. if (pdnxt > end_addr)
  4020. pdnxt = end_addr;
  4021. src_pte = pmap_pte_quick3(src_pmap, addr);
  4022. while (addr < pdnxt) {
  4023. ptetemp = *src_pte;
  4024. /*
4025. * We only virtually copy managed pages
  4026. */
  4027. if ((ptetemp & PG_MANAGED) != 0) {
  4028. dstmpte = pmap_allocpte(dst_pmap, addr,
  4029. PMAP_ENTER_NOSLEEP);
  4030. if (dstmpte == NULL)
  4031. goto out;
  4032. dst_pte = pmap_pte_quick(dst_pmap, addr);
  4033. if (*dst_pte == 0 &&
  4034. pmap_try_insert_pv_entry(dst_pmap, addr,
  4035. PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) {
  4036. /*
  4037. * Clear the wired, modified, and
  4038. * accessed (referenced) bits
  4039. * during the copy.
  4040. */
  4041. *dst_pte = ptetemp & ~(PG_W | PG_M |
  4042. PG_A);
  4043. dst_pmap->pm_stats.resident_count++;
  4044. } else {
  4045. SLIST_INIT(&free);
  4046. if (pmap_unwire_ptp(dst_pmap, dstmpte,
  4047. &free)) {
  4048. pmap_invalidate_page(dst_pmap,
  4049. addr);
  4050. vm_page_free_pages_toq(&free,
  4051. true);
  4052. }
  4053. goto out;
  4054. }
  4055. if (dstmpte->wire_count >= srcmpte->wire_count)
  4056. break;
  4057. }
  4058. addr += PAGE_SIZE;
  4059. src_pte++;
  4060. }
  4061. }
  4062. out:
  4063. sched_unpin();
  4064. rw_wunlock(&pvh_global_lock);
  4065. PMAP_UNLOCK(src_pmap);
  4066. PMAP_UNLOCK(dst_pmap);
  4067. }
  4068. /*
  4069. * Zero 1 page of virtual memory mapped from a hardware page by the caller.
  4070. */
  4071. static __inline void
  4072. pagezero(void *page)
  4073. {
  4074. #if defined(I686_CPU)
  4075. if (cpu_class == CPUCLASS_686) {
  4076. if (cpu_feature & CPUID_SSE2)
  4077. sse2_pagezero(page);
  4078. else
  4079. i686_pagezero(page);
  4080. } else
  4081. #endif
  4082. bzero(page, PAGE_SIZE);
  4083. }
  4084. /*
  4085. * Zero the specified hardware page.
  4086. */
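/*
 * (Mechanism note: each CPU owns a private kernel VA, pc_cmap_addr2, whose
 * PTE, pc_cmap_pte2, is temporarily pointed at the target page.  The
 * sched_pin() call keeps the thread on that CPU so the per-CPU mapping and
 * its lone TLB entry stay coherent, and invlcaddr() flushes any stale
 * translation before the page is touched.)
 */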
  4087. void
  4088. pmap_zero_page(vm_page_t m)
  4089. {
  4090. pt_entry_t *cmap_pte2;
  4091. struct pcpu *pc;
  4092. sched_pin();
  4093. pc = get_pcpu();
  4094. cmap_pte2 = pc->pc_cmap_pte2;
  4095. mtx_lock(&pc->pc_cmap_lock);
  4096. if (*cmap_pte2)
  4097. panic("pmap_zero_page: CMAP2 busy");
  4098. *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
  4099. pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0);
  4100. invlcaddr(pc->pc_cmap_addr2);
  4101. pagezero(pc->pc_cmap_addr2);
  4102. *cmap_pte2 = 0;
  4103. /*
  4104. * Unpin the thread before releasing the lock. Otherwise the thread
  4105. * could be rescheduled while still bound to the current CPU, only
  4106. * to unpin itself immediately upon resuming execution.
  4107. */
  4108. sched_unpin();
  4109. mtx_unlock(&pc->pc_cmap_lock);
  4110. }
  4111. /*
4112. * Zero an area within a single hardware page. off and size must not
  4113. * cover an area beyond a single hardware page.
  4114. */
  4115. void
  4116. pmap_zero_page_area(vm_page_t m, int off, int size)
  4117. {
  4118. pt_entry_t *cmap_pte2;
  4119. struct pcpu *pc;
  4120. sched_pin();
  4121. pc = get_pcpu();
  4122. cmap_pte2 = pc->pc_cmap_pte2;
  4123. mtx_lock(&pc->pc_cmap_lock);
  4124. if (*cmap_pte2)
  4125. panic("pmap_zero_page_area: CMAP2 busy");
  4126. *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
  4127. pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0);
  4128. invlcaddr(pc->pc_cmap_addr2);
  4129. if (off == 0 && size == PAGE_SIZE)
  4130. pagezero(pc->pc_cmap_addr2);
  4131. else
  4132. bzero(pc->pc_cmap_addr2 + off, size);
  4133. *cmap_pte2 = 0;
  4134. sched_unpin();
  4135. mtx_unlock(&pc->pc_cmap_lock);
  4136. }
  4137. /*
  4138. * Copy 1 specified hardware page to another.
  4139. */
  4140. void
  4141. pmap_copy_page(vm_page_t src, vm_page_t dst)
  4142. {
  4143. pt_entry_t *cmap_pte1, *cmap_pte2;
  4144. struct pcpu *pc;
  4145. sched_pin();
  4146. pc = get_pcpu();
  4147. cmap_pte1 = pc->pc_cmap_pte1;
  4148. cmap_pte2 = pc->pc_cmap_pte2;
  4149. mtx_lock(&pc->pc_cmap_lock);
  4150. if (*cmap_pte1)
  4151. panic("pmap_copy_page: CMAP1 busy");
  4152. if (*cmap_pte2)
  4153. panic("pmap_copy_page: CMAP2 busy");
  4154. *cmap_pte1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A |
  4155. pmap_cache_bits(kernel_pmap, src->md.pat_mode, 0);
  4156. invlcaddr(pc->pc_cmap_addr1);
  4157. *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M |
  4158. pmap_cache_bits(kernel_pmap, dst->md.pat_mode, 0);
  4159. invlcaddr(pc->pc_cmap_addr2);
  4160. bcopy(pc->pc_cmap_addr1, pc->pc_cmap_addr2, PAGE_SIZE);
  4161. *cmap_pte1 = 0;
  4162. *cmap_pte2 = 0;
  4163. sched_unpin();
  4164. mtx_unlock(&pc->pc_cmap_lock);
  4165. }
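/*
 * Unmapped buffer I/O is allowed: pmap_copy_pages() below moves data
 * between arbitrary physical pages through the per-CPU CMAP windows, so
 * buffers do not need kernel virtual mappings of their own.
 */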
  4166. int unmapped_buf_allowed = 1;
  4167. void
  4168. pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
  4169. vm_offset_t b_offset, int xfersize)
  4170. {
  4171. vm_page_t a_pg, b_pg;
  4172. char *a_cp, *b_cp;
  4173. vm_offset_t a_pg_offset, b_pg_offset;
  4174. pt_entry_t *cmap_pte1, *cmap_pte2;
  4175. struct pcpu *pc;
  4176. int cnt;
  4177. sched_pin();
  4178. pc = get_pcpu();
  4179. cmap_pte1 = pc->pc_cmap_pte1;
  4180. cmap_pte2 = pc->pc_cmap_pte2;
  4181. mtx_lock(&pc->pc_cmap_lock);
  4182. if (*cmap_pte1 != 0)
  4183. panic("pmap_copy_pages: CMAP1 busy");
  4184. if (*cmap_pte2 != 0)
  4185. panic("pmap_copy_pages: CMAP2 busy");
  4186. while (xfersize > 0) {
  4187. a_pg = ma[a_offset >> PAGE_SHIFT];
  4188. a_pg_offset = a_offset & PAGE_MASK;
  4189. cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
  4190. b_pg = mb[b_offset >> PAGE_SHIFT];
  4191. b_pg_offset = b_offset & PAGE_MASK;
  4192. cnt = min(cnt, PAGE_SIZE - b_pg_offset);
  4193. *cmap_pte1 = PG_V | VM_PAGE_TO_PHYS(a_pg) | PG_A |
  4194. pmap_cache_bits(kernel_pmap, a_pg->md.pat_mode, 0);
  4195. invlcaddr(pc->pc_cmap_addr1);
  4196. *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(b_pg) | PG_A |
  4197. PG_M | pmap_cache_bits(kernel_pmap, b_pg->md.pat_mode, 0);
  4198. invlcaddr(pc->pc_cmap_addr2);
  4199. a_cp = pc->pc_cmap_addr1 + a_pg_offset;
  4200. b_cp = pc->pc_cmap_addr2 + b_pg_offset;
  4201. bcopy(a_cp, b_cp, cnt);
  4202. a_offset += cnt;
  4203. b_offset += cnt;
  4204. xfersize -= cnt;
  4205. }
  4206. *cmap_pte1 = 0;
  4207. *cmap_pte2 = 0;
  4208. sched_unpin();
  4209. mtx_unlock(&pc->pc_cmap_lock);
  4210. }
  4211. /*
  4212. * Returns true if the pmap's pv is one of the first
  4213. * 16 pvs linked to from this page. This count may
  4214. * be changed upwards or downwards in the future; it
  4215. * is only necessary that true be returned for a small
  4216. * subset of pmaps for proper page aging.
  4217. */
  4218. boolean_t
  4219. pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
  4220. {
  4221. struct md_page *pvh;
  4222. pv_entry_t pv;
  4223. int loops = 0;
  4224. boolean_t rv;
  4225. KASSERT((m->oflags & VPO_UNMANAGED) == 0,
  4226. ("pmap_page_exists_quick: page %p is not managed", m));
  4227. rv = FALSE;
  4228. rw_wlock(&pvh_global_lock);
  4229. TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
  4230. if (PV_PMAP(pv) == pmap) {
  4231. rv = TRUE;
  4232. break;
  4233. }
  4234. loops++;
  4235. if (loops >= 16)
  4236. break;
  4237. }
  4238. if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
  4239. pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
  4240. TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
  4241. if (PV_PMAP(pv) == pmap) {
  4242. rv = TRUE;
  4243. break;
  4244. }
  4245. loops++;
  4246. if (loops >= 16)
  4247. break;
  4248. }
  4249. }
  4250. rw_wunlock(&pvh_global_lock);
  4251. return (rv);
  4252. }
  4253. /*
  4254. * pmap_page_wired_mappings:
  4255. *
  4256. * Return the number of managed mappings to the given physical page
  4257. * that are wired.
  4258. */
  4259. int
  4260. pmap_page_wired_mappings(vm_page_t m)
  4261. {
  4262. int count;
  4263. count = 0;
  4264. if ((m->oflags & VPO_UNMANAGED) != 0)
  4265. return (count);
  4266. rw_wlock(&pvh_global_lock);
  4267. count = pmap_pvh_wired_mappings(&m->md, count);
  4268. if ((m->flags & PG_FICTITIOUS) == 0) {
  4269. count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)),
  4270. count);
  4271. }
  4272. rw_wunlock(&pvh_global_lock);
  4273. return (count);
  4274. }
  4275. /*
  4276. * pmap_pvh_wired_mappings:
  4277. *
  4278. * Return the updated number "count" of managed mappings that are wired.
  4279. */
  4280. static int
  4281. pmap_pvh_wired_mappings(struct md_page *pvh, int count)
  4282. {
  4283. pmap_t pmap;
  4284. pt_entry_t *pte;
  4285. pv_entry_t pv;
  4286. rw_assert(&pvh_global_lock, RA_WLOCKED);
  4287. sched_pin();
  4288. TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
  4289. pmap = PV_PMAP(pv);
  4290. PMAP_LOCK(pmap);
  4291. pte = pmap_pte_quick(pmap, pv->pv_va);
  4292. if ((*pte & PG_W) != 0)
  4293. count++;
  4294. PMAP_UNLOCK(pmap);
  4295. }
  4296. sched_unpin();
  4297. return (count);
  4298. }
  4299. /*
  4300. * Returns TRUE if the given page is mapped individually or as part of
  4301. * a 4mpage. Otherwise, returns FALSE.
  4302. */
  4303. boolean_t
  4304. pmap_page_is_mapped(vm_page_t m)
  4305. {
  4306. boolean_t rv;
  4307. if ((m->oflags & VPO_UNMANAGED) != 0)
  4308. return (FALSE);
  4309. rw_wlock(&pvh_global_lock);
  4310. rv = !TAILQ_EMPTY(&m->md.pv_list) ||
  4311. ((m->flags & PG_FICTITIOUS) == 0 &&
  4312. !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
  4313. rw_wunlock(&pvh_global_lock);
  4314. return (rv);
  4315. }
  4316. /*
4317. * Remove all pages from the specified address space;
4318. * this aids process exit speeds. Also, this code
  4319. * is special cased for current process only, but
  4320. * can have the more generic (and slightly slower)
  4321. * mode enabled. This is much faster than pmap_remove
  4322. * in the case of running down an entire address space.
  4323. */
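/*
 * (Walk note: each pv_chunk records its PV slots in the pc_map[] bitmap,
 * where a set bit means the slot is free.  Thus ~pc_map[field] &
 * pc_freemask[field] below yields the allocated slots within one 32-bit
 * word, and bsfl() peels them off one index at a time.)
 */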
  4324. void
  4325. pmap_remove_pages(pmap_t pmap)
  4326. {
  4327. pt_entry_t *pte, tpte;
  4328. vm_page_t m, mpte, mt;
  4329. pv_entry_t pv;
  4330. struct md_page *pvh;
  4331. struct pv_chunk *pc, *npc;
  4332. struct spglist free;
  4333. int field, idx;
  4334. int32_t bit;
  4335. uint32_t inuse, bitmask;
  4336. int allfree;
  4337. if (pmap != PCPU_GET(curpmap)) {
  4338. printf("warning: pmap_remove_pages called with non-current pmap\n");
  4339. return;
  4340. }
  4341. SLIST_INIT(&free);
  4342. rw_wlock(&pvh_global_lock);
  4343. PMAP_LOCK(pmap);
  4344. sched_pin();
  4345. TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
  4346. KASSERT(pc->pc_pmap == pmap, ("Wrong pmap %p %p", pmap,
  4347. pc->pc_pmap));
  4348. allfree = 1;
  4349. for (field = 0; field < _NPCM; field++) {
  4350. inuse = ~pc->pc_map[field] & pc_freemask[field];
  4351. while (inuse != 0) {
  4352. bit = bsfl(inuse);
  4353. bitmask = 1UL << bit;
  4354. idx = field * 32 + bit;
  4355. pv = &pc->pc_pventry[idx];
  4356. inuse &= ~bitmask;
  4357. pte = pmap_pde(pmap, pv->pv_va);
  4358. tpte = *pte;
  4359. if ((tpte & PG_PS) == 0) {
  4360. pte = pmap_pte_quick(pmap, pv->pv_va);
  4361. tpte = *pte & ~PG_PTE_PAT;
  4362. }
  4363. if (tpte == 0) {
  4364. printf(
  4365. "TPTE at %p IS ZERO @ VA %08x\n",
  4366. pte, pv->pv_va);
  4367. panic("bad pte");
  4368. }
  4369. /*
  4370. * We cannot remove wired pages from a process' mapping at this time
  4371. */
  4372. if (tpte & PG_W) {
  4373. allfree = 0;
  4374. continue;
  4375. }
  4376. m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
  4377. KASSERT(m->phys_addr == (tpte & PG_FRAME),
  4378. ("vm_page_t %p phys_addr mismatch %016jx %016jx",
  4379. m, (uintmax_t)m->phys_addr,
  4380. (uintmax_t)tpte));
  4381. KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
  4382. m < &vm_page_array[vm_page_array_size],
  4383. ("pmap_remove_pages: bad tpte %#jx",
  4384. (uintmax_t)tpte));
  4385. pte_clear(pte);
  4386. /*
  4387. * Update the vm_page_t clean/reference bits.
  4388. */
  4389. if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
  4390. if ((tpte & PG_PS) != 0) {
  4391. for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
  4392. vm_page_dirty(mt);
  4393. } else
  4394. vm_page_dirty(m);
  4395. }
  4396. /* Mark free */
  4397. PV_STAT(pv_entry_frees++);
  4398. PV_STAT(pv_entry_spare++);
  4399. pv_entry_count--;
  4400. pc->pc_map[field] |= bitmask;
  4401. if ((tpte & PG_PS) != 0) {
  4402. pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
  4403. pvh = pa_to_pvh(tpte & PG_PS_FRAME);
  4404. TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
  4405. if (TAILQ_EMPTY(&pvh->pv_list)) {
  4406. for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
  4407. if (TAILQ_EMPTY(&mt->md.pv_list))
  4408. vm_page_aflag_clear(mt, PGA_WRITEABLE);
  4409. }
  4410. mpte = pmap_remove_pt_page(pmap, pv->pv_va);
  4411. if (mpte != NULL) {
  4412. KASSERT(mpte->valid == VM_PAGE_BITS_ALL,
  4413. ("pmap_remove_pages: pte page not promoted"));
  4414. pmap->pm_stats.resident_count--;
  4415. KASSERT(mpte->wire_count == NPTEPG,
  4416. ("pmap_remove_pages: pte page wire count error"));
  4417. mpte->wire_count = 0;
  4418. pmap_add_delayed_free_list(mpte, &free, FALSE);
  4419. }
  4420. } else {
  4421. pmap->pm_stats.resident_count--;
  4422. TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
  4423. if (TAILQ_EMPTY(&m->md.pv_list) &&
  4424. (m->flags & PG_FICTITIOUS) == 0) {
  4425. pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
  4426. if (TAILQ_EMPTY(&pvh->pv_list))
  4427. vm_page_aflag_clear(m, PGA_WRITEABLE);
  4428. }
  4429. pmap_unuse_pt(pmap, pv->pv_va, &free);
  4430. }
  4431. }
  4432. }
  4433. if (allfree) {
  4434. TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
  4435. free_pv_chunk(pc);
  4436. }
  4437. }
  4438. sched_unpin();
  4439. pmap_invalidate_all(pmap);
  4440. rw_wunlock(&pvh_global_lock);
  4441. PMAP_UNLOCK(pmap);
  4442. vm_page_free_pages_toq(&free, true);
  4443. }
  4444. /*
  4445. * pmap_is_modified:
  4446. *
  4447. * Return whether or not the specified physical page was modified
  4448. * in any physical maps.
  4449. */
  4450. boolean_t
  4451. pmap_is_modified(vm_page_t m)
  4452. {
  4453. boolean_t rv;
  4454. KASSERT((m->oflags & VPO_UNMANAGED) == 0,
  4455. ("pmap_is_modified: page %p is not managed", m));
  4456. /*
  4457. * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
  4458. * concurrently set while the object is locked. Thus, if PGA_WRITEABLE
  4459. * is clear, no PTEs can have PG_M set.
  4460. */
  4461. VM_OBJECT_ASSERT_WLOCKED(m->object);
  4462. if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
  4463. return (FALSE);
  4464. rw_wlock(&pvh_global_lock);
  4465. rv = pmap_is_modified_pvh(&m->md) ||
  4466. ((m->flags & PG_FICTITIOUS) == 0 &&
  4467. pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
  4468. rw_wunlock(&pvh_global_lock);
  4469. return (rv);
  4470. }
  4471. /*
  4472. * Returns TRUE if any of the given mappings were used to modify
  4473. * physical memory. Otherwise, returns FALSE. Both page and 2mpage
  4474. * mappings are supported.
  4475. */
  4476. static boolean_t
  4477. pmap_is_modified_pvh(struct md_page *pvh)
  4478. {
  4479. pv_entry_t pv;
  4480. pt_entry_t *pte;
  4481. pmap_t pmap;
  4482. boolean_t rv;
  4483. rw_assert(&pvh_global_lock, RA_WLOCKED);
  4484. rv = FALSE;
  4485. sched_pin();
  4486. TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
  4487. pmap = PV_PMAP(pv);
  4488. PMAP_LOCK(pmap);
  4489. pte = pmap_pte_quick(pmap, pv->pv_va);
  4490. rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW);
  4491. PMAP_UNLOCK(pmap);
  4492. if (rv)
  4493. break;
  4494. }
  4495. sched_unpin();
  4496. return (rv);
  4497. }
  4498. /*
  4499. * pmap_is_prefaultable:
  4500. *
4501. * Return whether or not the specified virtual address is eligible
  4502. * for prefault.
  4503. */
  4504. boolean_t
  4505. pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
  4506. {
  4507. pd_entry_t pde;
  4508. boolean_t rv;
  4509. rv = FALSE;
  4510. PMAP_LOCK(pmap);
  4511. pde = *pmap_pde(pmap, addr);
  4512. if (pde != 0 && (pde & PG_PS) == 0)
  4513. rv = pmap_pte_ufast(pmap, addr, pde) == 0;
  4514. PMAP_UNLOCK(pmap);
  4515. return (rv);
  4516. }
  4517. /*
  4518. * pmap_is_referenced:
  4519. *
  4520. * Return whether or not the specified physical page was referenced
  4521. * in any physical maps.
  4522. */
  4523. boolean_t
  4524. pmap_is_referenced(vm_page_t m)
  4525. {
  4526. boolean_t rv;
  4527. KASSERT((m->oflags & VPO_UNMANAGED) == 0,
  4528. ("pmap_is_referenced: page %p is not managed", m));
  4529. rw_wlock(&pvh_global_lock);
  4530. rv = pmap_is_referenced_pvh(&m->md) ||
  4531. ((m->flags & PG_FICTITIOUS) == 0 &&
  4532. pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
  4533. rw_wunlock(&pvh_global_lock);
  4534. return (rv);
  4535. }
  4536. /*
  4537. * Returns TRUE if any of the given mappings were referenced and FALSE
  4538. * otherwise. Both page and 4mpage mappings are supported.
  4539. */
  4540. static boolean_t
  4541. pmap_is_referenced_pvh(struct md_page *pvh)
  4542. {
  4543. pv_entry_t pv;
  4544. pt_entry_t *pte;
  4545. pmap_t pmap;
  4546. boolean_t rv;
  4547. rw_assert(&pvh_global_lock, RA_WLOCKED);
  4548. rv = FALSE;
  4549. sched_pin();
  4550. TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
  4551. pmap = PV_PMAP(pv);
  4552. PMAP_LOCK(pmap);
  4553. pte = pmap_pte_quick(pmap, pv->pv_va);
  4554. rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V);
  4555. PMAP_UNLOCK(pmap);
  4556. if (rv)
  4557. break;
  4558. }
  4559. sched_unpin();
  4560. return (rv);
  4561. }
  4562. /*
  4563. * Clear the write and modified bits in each of the given page's mappings.
  4564. */
  4565. void
  4566. pmap_remove_write(vm_page_t m)
  4567. {
  4568. struct md_page *pvh;
  4569. pv_entry_t next_pv, pv;
  4570. pmap_t pmap;
  4571. pd_entry_t *pde;
  4572. pt_entry_t oldpte, *pte;
  4573. vm_offset_t va;
  4574. KASSERT((m->oflags & VPO_UNMANAGED) == 0,
  4575. ("pmap_remove_write: page %p is not managed", m));
  4576. /*
  4577. * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
  4578. * set by another thread while the object is locked. Thus,
  4579. * if PGA_WRITEABLE is clear, no page table entries need updating.
  4580. */
  4581. VM_OBJECT_ASSERT_WLOCKED(m->object);
  4582. if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
  4583. return;
  4584. rw_wlock(&pvh_global_lock);
  4585. sched_pin();
  4586. if ((m->flags & PG_FICTITIOUS) != 0)
  4587. goto small_mappings;
  4588. pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
  4589. TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
  4590. va = pv->pv_va;
  4591. pmap = PV_PMAP(pv);
  4592. PMAP_LOCK(pmap);
  4593. pde = pmap_pde(pmap, va);
  4594. if ((*pde & PG_RW) != 0)
  4595. (void)pmap_demote_pde(pmap, pde, va);
  4596. PMAP_UNLOCK(pmap);
  4597. }
  4598. small_mappings:
  4599. TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
  4600. pmap = PV_PMAP(pv);
  4601. PMAP_LOCK(pmap);
  4602. pde = pmap_pde(pmap, pv->pv_va);
4603. KASSERT((*pde & PG_PS) == 0, ("pmap_remove_write: found"
4604. " a 4mpage in page %p's pv list", m));
  4605. pte = pmap_pte_quick(pmap, pv->pv_va);
  4606. retry:
  4607. oldpte = *pte;
  4608. if ((oldpte & PG_RW) != 0) {
  4609. /*
  4610. * Regardless of whether a pte is 32 or 64 bits
  4611. * in size, PG_RW and PG_M are among the least
  4612. * significant 32 bits.
  4613. */
  4614. if (!atomic_cmpset_int((u_int *)pte, oldpte,
  4615. oldpte & ~(PG_RW | PG_M)))
  4616. goto retry;
  4617. if ((oldpte & PG_M) != 0)
  4618. vm_page_dirty(m);
  4619. pmap_invalidate_page(pmap, pv->pv_va);
  4620. }
  4621. PMAP_UNLOCK(pmap);
  4622. }
  4623. vm_page_aflag_clear(m, PGA_WRITEABLE);
  4624. sched_unpin();
  4625. rw_wunlock(&pvh_global_lock);
  4626. }
  4627. /*
  4628. * pmap_ts_referenced:
  4629. *
  4630. * Return a count of reference bits for a page, clearing those bits.
  4631. * It is not necessary for every reference bit to be cleared, but it
  4632. * is necessary that 0 only be returned when there are truly no
  4633. * reference bits set.
  4634. *
  4635. * As an optimization, update the page's dirty field if a modified bit is
  4636. * found while counting reference bits. This opportunistic update can be
  4637. * performed at low cost and can eliminate the need for some future calls
  4638. * to pmap_is_modified(). However, since this function stops after
  4639. * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
  4640. * dirty pages. Those dirty pages will only be detected by a future call
  4641. * to pmap_is_modified().
  4642. */
  4643. int
  4644. pmap_ts_referenced(vm_page_t m)
  4645. {
  4646. struct md_page *pvh;
  4647. pv_entry_t pv, pvf;
  4648. pmap_t pmap;
  4649. pd_entry_t *pde;
  4650. pt_entry_t *pte;
  4651. vm_paddr_t pa;
  4652. int rtval = 0;
  4653. KASSERT((m->oflags & VPO_UNMANAGED) == 0,
  4654. ("pmap_ts_referenced: page %p is not managed", m));
  4655. pa = VM_PAGE_TO_PHYS(m);
  4656. pvh = pa_to_pvh(pa);
  4657. rw_wlock(&pvh_global_lock);
  4658. sched_pin();
  4659. if ((m->flags & PG_FICTITIOUS) != 0 ||
  4660. (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
  4661. goto small_mappings;
  4662. pv = pvf;
  4663. do {
  4664. pmap = PV_PMAP(pv);
  4665. PMAP_LOCK(pmap);
  4666. pde = pmap_pde(pmap, pv->pv_va);
  4667. if ((*pde & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
  4668. /*
  4669. * Although "*pde" is mapping a 2/4MB page, because
  4670. * this function is called at a 4KB page granularity,
  4671. * we only update the 4KB page under test.
  4672. */
  4673. vm_page_dirty(m);
  4674. }
  4675. if ((*pde & PG_A) != 0) {
  4676. /*
  4677. * Since this reference bit is shared by either 1024
  4678. * or 512 4KB pages, it should not be cleared every
  4679. * time it is tested. Apply a simple "hash" function
  4680. * on the physical page number, the virtual superpage
  4681. * number, and the pmap address to select one 4KB page
  4682. * out of the 1024 or 512 on which testing the
  4683. * reference bit will result in clearing that bit.
  4684. * This function is designed to avoid the selection of
  4685. * the same 4KB page for every 2- or 4MB page mapping.
  4686. *
  4687. * On demotion, a mapping that hasn't been referenced
  4688. * is simply destroyed. To avoid the possibility of a
  4689. * subsequent page fault on a demoted wired mapping,
  4690. * always leave its reference bit set. Moreover,
  4691. * since the superpage is wired, the current state of
  4692. * its reference bit won't affect page replacement.
  4693. */
  4694. if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^
  4695. (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
  4696. (*pde & PG_W) == 0) {
  4697. atomic_clear_int((u_int *)pde, PG_A);
  4698. pmap_invalidate_page(pmap, pv->pv_va);
  4699. }
  4700. rtval++;
  4701. }
  4702. PMAP_UNLOCK(pmap);
  4703. /* Rotate the PV list if it has more than one entry. */
  4704. if (TAILQ_NEXT(pv, pv_next) != NULL) {
  4705. TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
  4706. TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
  4707. }
  4708. if (rtval >= PMAP_TS_REFERENCED_MAX)
  4709. goto out;
  4710. } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
  4711. small_mappings:
  4712. if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
  4713. goto out;
  4714. pv = pvf;
  4715. do {
  4716. pmap = PV_PMAP(pv);
  4717. PMAP_LOCK(pmap);
  4718. pde = pmap_pde(pmap, pv->pv_va);
  4719. KASSERT((*pde & PG_PS) == 0,
  4720. ("pmap_ts_referenced: found a 4mpage in page %p's pv list",
  4721. m));
  4722. pte = pmap_pte_quick(pmap, pv->pv_va);
  4723. if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
  4724. vm_page_dirty(m);
  4725. if ((*pte & PG_A) != 0) {
  4726. atomic_clear_int((u_int *)pte, PG_A);
  4727. pmap_invalidate_page(pmap, pv->pv_va);
  4728. rtval++;
  4729. }
  4730. PMAP_UNLOCK(pmap);
  4731. /* Rotate the PV list if it has more than one entry. */
  4732. if (TAILQ_NEXT(pv, pv_next) != NULL) {
  4733. TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
  4734. TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
  4735. }
  4736. } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && rtval <
  4737. PMAP_TS_REFERENCED_MAX);
  4738. out:
  4739. sched_unpin();
  4740. rw_wunlock(&pvh_global_lock);
  4741. return (rtval);
  4742. }
  4743. /*
  4744. * Apply the given advice to the specified range of addresses within the
  4745. * given pmap. Depending on the advice, clear the referenced and/or
  4746. * modified flags in each mapping and set the mapped page's dirty field.
  4747. */
  4748. void
  4749. pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
  4750. {
  4751. pd_entry_t oldpde, *pde;
  4752. pt_entry_t *pte;
  4753. vm_offset_t va, pdnxt;
  4754. vm_page_t m;
  4755. bool anychanged, pv_lists_locked;
  4756. if (advice != MADV_DONTNEED && advice != MADV_FREE)
  4757. return;
  4758. if (pmap_is_current(pmap))
  4759. pv_lists_locked = false;
  4760. else {
  4761. pv_lists_locked = true;
  4762. resume:
  4763. rw_wlock(&pvh_global_lock);
  4764. sched_pin();
  4765. }
  4766. anychanged = false;
  4767. PMAP_LOCK(pmap);
  4768. for (; sva < eva; sva = pdnxt) {
  4769. pdnxt = (sva + NBPDR) & ~PDRMASK;
  4770. if (pdnxt < sva)
  4771. pdnxt = eva;
  4772. pde = pmap_pde(pmap, sva);
  4773. oldpde = *pde;
  4774. if ((oldpde & PG_V) == 0)
  4775. continue;
  4776. else if ((oldpde & PG_PS) != 0) {
  4777. if ((oldpde & PG_MANAGED) == 0)
  4778. continue;
  4779. if (!pv_lists_locked) {
  4780. pv_lists_locked = true;
  4781. if (!rw_try_wlock(&pvh_global_lock)) {
  4782. if (anychanged)
  4783. pmap_invalidate_all(pmap);
  4784. PMAP_UNLOCK(pmap);
  4785. goto resume;
  4786. }
  4787. sched_pin();
  4788. }
  4789. if (!pmap_demote_pde(pmap, pde, sva)) {
  4790. /*
  4791. * The large page mapping was destroyed.
  4792. */
  4793. continue;
  4794. }
  4795. /*
  4796. * Unless the page mappings are wired, remove the
  4797. * mapping to a single page so that a subsequent
  4798. * access may repromote. Choosing the last page
  4799. * within the address range [sva, min(pdnxt, eva))
  4800. * generally results in more repromotions. Since the
  4801. * underlying page table page is fully populated, this
  4802. * removal never frees a page table page.
  4803. */
  4804. if ((oldpde & PG_W) == 0) {
  4805. va = eva;
  4806. if (va > pdnxt)
  4807. va = pdnxt;
  4808. va -= PAGE_SIZE;
  4809. KASSERT(va >= sva,
  4810. ("pmap_advise: no address gap"));
  4811. pte = pmap_pte_quick(pmap, va);
  4812. KASSERT((*pte & PG_V) != 0,
  4813. ("pmap_advise: invalid PTE"));
  4814. pmap_remove_pte(pmap, pte, va, NULL);
  4815. anychanged = true;
  4816. }
  4817. }
  4818. if (pdnxt > eva)
  4819. pdnxt = eva;
  4820. va = pdnxt;
  4821. for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
  4822. sva += PAGE_SIZE) {
  4823. if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V))
  4824. goto maybe_invlrng;
  4825. else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
  4826. if (advice == MADV_DONTNEED) {
  4827. /*
  4828. * Future calls to pmap_is_modified()
  4829. * can be avoided by making the page
  4830. * dirty now.
  4831. */
  4832. m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
  4833. vm_page_dirty(m);
  4834. }
  4835. atomic_clear_int((u_int *)pte, PG_M | PG_A);
  4836. } else if ((*pte & PG_A) != 0)
  4837. atomic_clear_int((u_int *)pte, PG_A);
  4838. else
  4839. goto maybe_invlrng;
  4840. if ((*pte & PG_G) != 0) {
  4841. if (va == pdnxt)
  4842. va = sva;
  4843. } else
  4844. anychanged = true;
  4845. continue;
  4846. maybe_invlrng:
  4847. if (va != pdnxt) {
  4848. pmap_invalidate_range(pmap, va, sva);
  4849. va = pdnxt;
  4850. }
  4851. }
  4852. if (va != pdnxt)
  4853. pmap_invalidate_range(pmap, va, sva);
  4854. }
  4855. if (anychanged)
  4856. pmap_invalidate_all(pmap);
  4857. if (pv_lists_locked) {
  4858. sched_unpin();
  4859. rw_wunlock(&pvh_global_lock);
  4860. }
  4861. PMAP_UNLOCK(pmap);
  4862. }
  4863. /*
  4864. * Clear the modify bits on the specified physical page.
  4865. */
  4866. void
  4867. pmap_clear_modify(vm_page_t m)
  4868. {
  4869. struct md_page *pvh;
  4870. pv_entry_t next_pv, pv;
  4871. pmap_t pmap;
  4872. pd_entry_t oldpde, *pde;
  4873. pt_entry_t *pte;
  4874. vm_offset_t va;
  4875. KASSERT((m->oflags & VPO_UNMANAGED) == 0,
  4876. ("pmap_clear_modify: page %p is not managed", m));
  4877. VM_OBJECT_ASSERT_WLOCKED(m->object);
  4878. KASSERT(!vm_page_xbusied(m),
  4879. ("pmap_clear_modify: page %p is exclusive busied", m));
  4880. /*
  4881. * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
  4882. * If the object containing the page is locked and the page is not
  4883. * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
  4884. */
  4885. if ((m->aflags & PGA_WRITEABLE) == 0)
  4886. return;
  4887. rw_wlock(&pvh_global_lock);
  4888. sched_pin();
  4889. if ((m->flags & PG_FICTITIOUS) != 0)
  4890. goto small_mappings;
  4891. pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
  4892. TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
  4893. va = pv->pv_va;
  4894. pmap = PV_PMAP(pv);
  4895. PMAP_LOCK(pmap);
  4896. pde = pmap_pde(pmap, va);
  4897. oldpde = *pde;
  4898. /* If oldpde has PG_RW set, then it also has PG_M set. */
  4899. if ((oldpde & PG_RW) != 0 &&
  4900. pmap_demote_pde(pmap, pde, va) &&
  4901. (oldpde & PG_W) == 0) {
  4902. /*
  4903. * Write protect the mapping to a single page so that
  4904. * a subsequent write access may repromote.
  4905. */
  4906. va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME);
  4907. pte = pmap_pte_quick(pmap, va);
  4908. /*
  4909. * Regardless of whether a pte is 32 or 64 bits
  4910. * in size, PG_RW and PG_M are among the least
  4911. * significant 32 bits.
  4912. */
  4913. atomic_clear_int((u_int *)pte, PG_M | PG_RW);
  4914. vm_page_dirty(m);
  4915. pmap_invalidate_page(pmap, va);
  4916. }
  4917. PMAP_UNLOCK(pmap);
  4918. }
  4919. small_mappings:
  4920. TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
  4921. pmap = PV_PMAP(pv);
  4922. PMAP_LOCK(pmap);
  4923. pde = pmap_pde(pmap, pv->pv_va);
  4924. KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
  4925. " a 4mpage in page %p's pv list", m));
  4926. pte = pmap_pte_quick(pmap, pv->pv_va);
  4927. if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
  4928. /*
  4929. * Regardless of whether a pte is 32 or 64 bits
  4930. * in size, PG_M is among the least significant
  4931. * 32 bits.
  4932. */
  4933. atomic_clear_int((u_int *)pte, PG_M);
  4934. pmap_invalidate_page(pmap, pv->pv_va);
  4935. }
  4936. PMAP_UNLOCK(pmap);
  4937. }
  4938. sched_unpin();
  4939. rw_wunlock(&pvh_global_lock);
  4940. }
  4941. /*
  4942. * Miscellaneous support routines follow
  4943. */
  4944. /* Adjust the cache mode for a 4KB page mapped via a PTE. */
  4945. static __inline void
  4946. pmap_pte_attr(pt_entry_t *pte, int cache_bits)
  4947. {
  4948. u_int opte, npte;
  4949. /*
  4950. * The cache mode bits are all in the low 32-bits of the
  4951. * PTE, so we can just spin on updating the low 32-bits.
  4952. */
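/*
* Even with PAE, where a pte is 64 bits wide, PG_NC_PWT (bit 3),
* PG_NC_PCD (bit 4) and PG_PTE_PAT (bit 7) all reside in the low
* word, so a 32-bit compare-and-set is sufficient. If the hardware
* concurrently sets PG_A or PG_M, the cmpset fails and the loop
* retries with the updated value, preserving those bits.
*/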
  4953. do {
  4954. opte = *(u_int *)pte;
  4955. npte = opte & ~PG_PTE_CACHE;
  4956. npte |= cache_bits;
  4957. } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
  4958. }
  4959. /* Adjust the cache mode for a 2/4MB page mapped via a PDE. */
  4960. static __inline void
  4961. pmap_pde_attr(pd_entry_t *pde, int cache_bits)
  4962. {
  4963. u_int opde, npde;
  4964. /*
  4965. * The cache mode bits are all in the low 32-bits of the
  4966. * PDE, so we can just spin on updating the low 32-bits.
  4967. */
  4968. do {
  4969. opde = *(u_int *)pde;
  4970. npde = opde & ~PG_PDE_CACHE;
  4971. npde |= cache_bits;
  4972. } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
  4973. }
  4974. /*
  4975. * Map a set of physical memory pages into the kernel virtual
  4976. * address space. Return a pointer to where they are mapped. This
  4977. * routine is intended to be used for mapping device memory,
  4978. * NOT real memory.
  4979. */
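/*
* Hypothetical usage sketch (not taken from an in-tree driver): a
* driver could map its register BAR with
*
* regs = pmap_mapdev(rman_get_start(res), rman_get_size(res));
*
* and release it with pmap_unmapdev((vm_offset_t)regs,
* rman_get_size(res)). Most drivers instead reach this code
* indirectly through the bus_space(9) interfaces.
*/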
  4980. static void *
  4981. pmap_mapdev_internal(vm_paddr_t pa, vm_size_t size, int mode, int flags)
  4982. {
  4983. struct pmap_preinit_mapping *ppim;
  4984. vm_offset_t va, offset;
  4985. vm_page_t m;
  4986. vm_size_t tmpsize;
  4987. int i;
  4988. offset = pa & PAGE_MASK;
  4989. size = round_page(offset + size);
  4990. pa = pa & PG_FRAME;
  4991. if (pa < PMAP_MAP_LOW && pa + size <= PMAP_MAP_LOW) {
  4992. va = pa + PMAP_MAP_LOW;
  4993. if ((flags & MAPDEV_SETATTR) == 0)
  4994. return ((void *)(va + offset));
  4995. } else if (!pmap_initialized) {
  4996. va = 0;
  4997. for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
  4998. ppim = pmap_preinit_mapping + i;
  4999. if (ppim->va == 0) {
  5000. ppim->pa = pa;
  5001. ppim->sz = size;
  5002. ppim->mode = mode;
  5003. ppim->va = virtual_avail;
  5004. virtual_avail += size;
  5005. va = ppim->va;
  5006. break;
  5007. }
  5008. }
  5009. if (va == 0)
  5010. panic("%s: too many preinit mappings", __func__);
  5011. } else {
  5012. /*
  5013. * If we have a preinit mapping, re-use it.
  5014. */
  5015. for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
  5016. ppim = pmap_preinit_mapping + i;
  5017. if (ppim->pa == pa && ppim->sz == size &&
  5018. (ppim->mode == mode ||
  5019. (flags & MAPDEV_SETATTR) == 0))
  5020. return ((void *)(ppim->va + offset));
  5021. }
  5022. va = kva_alloc(size);
  5023. if (va == 0)
  5024. panic("%s: Couldn't allocate KVA", __func__);
  5025. }
  5026. for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) {
  5027. if ((flags & MAPDEV_SETATTR) == 0 && pmap_initialized) {
  5028. m = PHYS_TO_VM_PAGE(pa);
  5029. if (m != NULL && VM_PAGE_TO_PHYS(m) == pa) {
  5030. pmap_kenter_attr(va + tmpsize, pa + tmpsize,
  5031. m->md.pat_mode);
  5032. continue;
  5033. }
  5034. }
  5035. pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
  5036. }
  5037. pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
  5038. pmap_invalidate_cache_range(va, va + size);
  5039. return ((void *)(va + offset));
  5040. }
  5041. void *
  5042. pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
  5043. {
  5044. return (pmap_mapdev_internal(pa, size, mode, MAPDEV_SETATTR));
  5045. }
  5046. void *
  5047. pmap_mapdev(vm_paddr_t pa, vm_size_t size)
  5048. {
  5049. return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
  5050. }
  5051. void *
  5052. pmap_mapbios(vm_paddr_t pa, vm_size_t size)
  5053. {
  5054. return (pmap_mapdev_internal(pa, size, PAT_WRITE_BACK, 0));
  5055. }
  5056. void
  5057. pmap_unmapdev(vm_offset_t va, vm_size_t size)
  5058. {
  5059. struct pmap_preinit_mapping *ppim;
  5060. vm_offset_t offset;
  5061. int i;
  5062. if (va >= PMAP_MAP_LOW && va <= KERNBASE && va + size <= KERNBASE)
  5063. return;
  5064. offset = va & PAGE_MASK;
  5065. size = round_page(offset + size);
  5066. va = trunc_page(va);
  5067. for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
  5068. ppim = pmap_preinit_mapping + i;
  5069. if (ppim->va == va && ppim->sz == size) {
  5070. if (pmap_initialized)
  5071. return;
  5072. ppim->pa = 0;
  5073. ppim->va = 0;
  5074. ppim->sz = 0;
  5075. ppim->mode = 0;
  5076. if (va + size == virtual_avail)
  5077. virtual_avail = va;
  5078. return;
  5079. }
  5080. }
  5081. if (pmap_initialized)
  5082. kva_free(va, size);
  5083. }
  5084. /*
  5085. * Sets the memory attribute for the specified page.
  5086. */
  5087. void
  5088. pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
  5089. {
  5090. m->md.pat_mode = ma;
  5091. if ((m->flags & PG_FICTITIOUS) != 0)
  5092. return;
  5093. /*
  5094. * If "m" is a normal page, flush it from the cache.
  5095. * See pmap_invalidate_cache_range().
  5096. *
  5097. * First, try to find an existing mapping of the page by an sf
  5098. * buffer. sf_buf_invalidate_cache() modifies the mapping and
  5099. * flushes the cache.
  5100. */
  5101. if (sf_buf_invalidate_cache(m))
  5102. return;
  5103. /*
  5104. * If the page is not mapped by an sf buffer, but the CPU does not
  5105. * support self-snoop, map the page transiently and perform the
  5106. * invalidation. In the worst case, the whole cache is flushed by
  5107. * pmap_invalidate_cache_range().
  5108. */
  5109. if ((cpu_feature & CPUID_SS) == 0)
  5110. pmap_flush_page(m);
  5111. }
  5112. static void
  5113. pmap_flush_page(vm_page_t m)
  5114. {
  5115. pt_entry_t *cmap_pte2;
  5116. struct pcpu *pc;
  5117. vm_offset_t sva, eva;
  5118. bool useclflushopt;
  5119. useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0;
  5120. if (useclflushopt || (cpu_feature & CPUID_CLFSH) != 0) {
  5121. sched_pin();
  5122. pc = get_pcpu();
  5123. cmap_pte2 = pc->pc_cmap_pte2;
  5124. mtx_lock(&pc->pc_cmap_lock);
  5125. if (*cmap_pte2)
  5126. panic("pmap_flush_page: CMAP2 busy");
  5127. *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) |
  5128. PG_A | PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode,
  5129. 0);
  5130. invlcaddr(pc->pc_cmap_addr2);
  5131. sva = (vm_offset_t)pc->pc_cmap_addr2;
  5132. eva = sva + PAGE_SIZE;
  5133. /*
  5134. * Use mfence or sfence despite the ordering implied by
  5135. * mtx_{un,}lock() because clflush on non-Intel CPUs
  5136. * and clflushopt are not guaranteed to be ordered by
  5137. * any other instruction.
  5138. */
  5139. if (useclflushopt)
  5140. sfence();
  5141. else if (cpu_vendor_id != CPU_VENDOR_INTEL)
  5142. mfence();
  5143. for (; sva < eva; sva += cpu_clflush_line_size) {
  5144. if (useclflushopt)
  5145. clflushopt(sva);
  5146. else
  5147. clflush(sva);
  5148. }
  5149. if (useclflushopt)
  5150. sfence();
  5151. else if (cpu_vendor_id != CPU_VENDOR_INTEL)
  5152. mfence();
  5153. *cmap_pte2 = 0;
  5154. sched_unpin();
  5155. mtx_unlock(&pc->pc_cmap_lock);
  5156. } else
  5157. pmap_invalidate_cache();
  5158. }
  5159. /*
  5160. * Changes the specified virtual address range's memory type to that given by
  5161. * the parameter "mode". The specified virtual address range must be
  5162. * completely contained within the kernel map.
  5163. *
  5164. * Returns zero if the change completed successfully, and either EINVAL or
  5165. * ENOMEM if the change failed. Specifically, EINVAL is returned if some part
  5166. * of the virtual address range was not mapped, and ENOMEM is returned if
  5167. * there was insufficient memory available to complete the change.
  5168. */
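/*
* For instance, a graphics driver that has mapped a frame buffer with
* pmap_mapdev() might request write-combining with
*
* (void)pmap_change_attr(fb_va, fb_size, PAT_WRITE_COMBINING);
*
* where "fb_va" and "fb_size" are placeholders rather than names used
* elsewhere in this file.
*/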
  5169. int
  5170. pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
  5171. {
  5172. vm_offset_t base, offset, tmpva;
  5173. pd_entry_t *pde;
  5174. pt_entry_t *pte;
  5175. int cache_bits_pte, cache_bits_pde;
  5176. boolean_t changed;
  5177. base = trunc_page(va);
  5178. offset = va & PAGE_MASK;
  5179. size = round_page(offset + size);
  5180. /*
  5181. * Only supported on kernel virtual addresses above the recursive map.
  5182. */
  5183. if (base < VM_MIN_KERNEL_ADDRESS)
  5184. return (EINVAL);
  5185. cache_bits_pde = pmap_cache_bits(kernel_pmap, mode, 1);
  5186. cache_bits_pte = pmap_cache_bits(kernel_pmap, mode, 0);
  5187. changed = FALSE;
  5188. /*
  5189. * Pages that aren't mapped aren't supported. Also break down
  5190. * 2/4MB pages into 4KB pages if required.
  5191. */
  5192. PMAP_LOCK(kernel_pmap);
  5193. for (tmpva = base; tmpva < base + size; ) {
  5194. pde = pmap_pde(kernel_pmap, tmpva);
  5195. if (*pde == 0) {
  5196. PMAP_UNLOCK(kernel_pmap);
  5197. return (EINVAL);
  5198. }
  5199. if (*pde & PG_PS) {
  5200. /*
  5201. * If the current 2/4MB page already has
  5202. * the required memory type, then we need not
  5203. * demote this page. Just increment tmpva to
  5204. * the next 2/4MB page frame.
  5205. */
  5206. if ((*pde & PG_PDE_CACHE) == cache_bits_pde) {
  5207. tmpva = trunc_4mpage(tmpva) + NBPDR;
  5208. continue;
  5209. }
  5210. /*
  5211. * If the current offset aligns with a 2/4MB
  5212. * page frame and there is at least 2/4MB left
  5213. * within the range, then we need not break
  5214. * down this page into 4KB pages.
  5215. */
  5216. if ((tmpva & PDRMASK) == 0 &&
  5217. tmpva + PDRMASK < base + size) {
  5218. tmpva += NBPDR;
  5219. continue;
  5220. }
  5221. if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) {
  5222. PMAP_UNLOCK(kernel_pmap);
  5223. return (ENOMEM);
  5224. }
  5225. }
  5226. pte = vtopte(tmpva);
  5227. if (*pte == 0) {
  5228. PMAP_UNLOCK(kernel_pmap);
  5229. return (EINVAL);
  5230. }
  5231. tmpva += PAGE_SIZE;
  5232. }
  5233. PMAP_UNLOCK(kernel_pmap);
  5234. /*
  5235. * Ok, all the pages exist, so run through them updating their
  5236. * cache mode if required.
  5237. */
  5238. for (tmpva = base; tmpva < base + size; ) {
  5239. pde = pmap_pde(kernel_pmap, tmpva);
  5240. if (*pde & PG_PS) {
  5241. if ((*pde & PG_PDE_CACHE) != cache_bits_pde) {
  5242. pmap_pde_attr(pde, cache_bits_pde);
  5243. changed = TRUE;
  5244. }
  5245. tmpva = trunc_4mpage(tmpva) + NBPDR;
  5246. } else {
  5247. pte = vtopte(tmpva);
  5248. if ((*pte & PG_PTE_CACHE) != cache_bits_pte) {
  5249. pmap_pte_attr(pte, cache_bits_pte);
  5250. changed = TRUE;
  5251. }
  5252. tmpva += PAGE_SIZE;
  5253. }
  5254. }
  5255. /*
  5256. * Flush the CPU caches to make sure that no data remains cached
  5257. * under the old, now-incorrect, memory attributes.
  5258. */
  5259. if (changed) {
  5260. pmap_invalidate_range(kernel_pmap, base, tmpva);
  5261. pmap_invalidate_cache_range(base, tmpva);
  5262. }
  5263. return (0);
  5264. }
  5265. /*
  5266. * Perform the pmap work for mincore(2).
  5267. */
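/*
* The returned MINCORE_* flags describe the mapping at "addr": a
* valid 2/4MB mapping reports MINCORE_SUPER, and the referenced and
* modified state is taken directly from PG_A and PG_M/PG_RW. For a
* managed page, the function may retry after vm_page_pa_tryrelock()
* so that "*locked_pa" names a page whose identity is stable.
*/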
  5268. int
  5269. pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
  5270. {
  5271. pd_entry_t pde;
  5272. pt_entry_t pte;
  5273. vm_paddr_t pa;
  5274. int val;
  5275. PMAP_LOCK(pmap);
  5276. retry:
  5277. pde = *pmap_pde(pmap, addr);
  5278. if (pde != 0) {
  5279. if ((pde & PG_PS) != 0) {
  5280. pte = pde;
  5281. /* Compute the physical address of the 4KB page. */
  5282. pa = ((pde & PG_PS_FRAME) | (addr & PDRMASK)) &
  5283. PG_FRAME;
  5284. val = MINCORE_SUPER;
  5285. } else {
  5286. pte = pmap_pte_ufast(pmap, addr, pde);
  5287. pa = pte & PG_FRAME;
  5288. val = 0;
  5289. }
  5290. } else {
  5291. pte = 0;
  5292. pa = 0;
  5293. val = 0;
  5294. }
  5295. if ((pte & PG_V) != 0) {
  5296. val |= MINCORE_INCORE;
  5297. if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
  5298. val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
  5299. if ((pte & PG_A) != 0)
  5300. val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
  5301. }
  5302. if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
  5303. (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
  5304. (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
  5305. /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
  5306. if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
  5307. goto retry;
  5308. } else
  5309. PA_UNLOCK_COND(*locked_pa);
  5310. PMAP_UNLOCK(pmap);
  5311. return (val);
  5312. }
  5313. void
  5314. pmap_activate(struct thread *td)
  5315. {
  5316. pmap_t pmap, oldpmap;
  5317. u_int cpuid;
  5318. u_int32_t cr3;
  5319. critical_enter();
  5320. pmap = vmspace_pmap(td->td_proc->p_vmspace);
  5321. oldpmap = PCPU_GET(curpmap);
  5322. cpuid = PCPU_GET(cpuid);
  5323. #if defined(SMP)
  5324. CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
  5325. CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
  5326. #else
  5327. CPU_CLR(cpuid, &oldpmap->pm_active);
  5328. CPU_SET(cpuid, &pmap->pm_active);
  5329. #endif
  5330. #if defined(PAE) || defined(PAE_TABLES)
  5331. cr3 = vtophys(pmap->pm_pdpt);
  5332. #else
  5333. cr3 = vtophys(pmap->pm_pdir);
  5334. #endif
  5335. /*
  5337. * pmap_activate is for the current thread on the current CPU.
  5337. */
  5338. td->td_pcb->pcb_cr3 = cr3;
  5339. PCPU_SET(curpmap, pmap);
  5340. critical_exit();
  5341. }
  5342. void
  5343. pmap_activate_boot(pmap_t pmap)
  5344. {
  5345. u_int cpuid;
  5346. cpuid = PCPU_GET(cpuid);
  5347. #if defined(SMP)
  5348. CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
  5349. #else
  5350. CPU_SET(cpuid, &pmap->pm_active);
  5351. #endif
  5352. PCPU_SET(curpmap, pmap);
  5353. }
  5354. void
  5355. pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
  5356. {
  5357. }
  5358. /*
  5359. * Increase the starting virtual address of the given mapping if a
  5360. * different alignment might result in more superpage mappings.
  5361. */
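/*
* For illustration, assume 4MB superpages (NBPDR == 0x400000,
* PDRMASK == 0x3fffff) and an 8MB mapping at object offset 0:
* superpage_offset is 0, so a proposed *addr of 0x20301000 is rounded
* up to 0x20400000, which allows the mapping to be promoted to
* superpages.
*/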
  5362. void
  5363. pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
  5364. vm_offset_t *addr, vm_size_t size)
  5365. {
  5366. vm_offset_t superpage_offset;
  5367. if (size < NBPDR)
  5368. return;
  5369. if (object != NULL && (object->flags & OBJ_COLORED) != 0)
  5370. offset += ptoa(object->pg_color);
  5371. superpage_offset = offset & PDRMASK;
  5372. if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
  5373. (*addr & PDRMASK) == superpage_offset)
  5374. return;
  5375. if ((*addr & PDRMASK) < superpage_offset)
  5376. *addr = (*addr & ~PDRMASK) + superpage_offset;
  5377. else
  5378. *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
  5379. }
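/*
* Temporarily map a single physical page at this CPU's private "quick"
* KVA slot. The mapping is only valid until the matching
* pmap_quick_remove_page() call, and the caller runs inside a critical
* section for that entire interval, so it must not sleep and may only
* touch this one page through the returned address.
*/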
  5380. vm_offset_t
  5381. pmap_quick_enter_page(vm_page_t m)
  5382. {
  5383. vm_offset_t qaddr;
  5384. pt_entry_t *pte;
  5385. critical_enter();
  5386. qaddr = PCPU_GET(qmap_addr);
  5387. pte = vtopte(qaddr);
  5388. KASSERT(*pte == 0, ("pmap_quick_enter_page: PTE busy"));
  5389. *pte = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
  5390. pmap_cache_bits(kernel_pmap, pmap_page_get_memattr(m), 0);
  5391. invlpg(qaddr);
  5392. return (qaddr);
  5393. }
  5394. void
  5395. pmap_quick_remove_page(vm_offset_t addr)
  5396. {
  5397. vm_offset_t qaddr;
  5398. pt_entry_t *pte;
  5399. qaddr = PCPU_GET(qmap_addr);
  5400. pte = vtopte(qaddr);
  5401. KASSERT(*pte != 0, ("pmap_quick_remove_page: PTE not in use"));
  5402. KASSERT(addr == qaddr, ("pmap_quick_remove_page: invalid address"));
  5403. *pte = 0;
  5404. critical_exit();
  5405. }
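/*
* Allocator for the i386 trampoline region. Imports carve wired pages
* directly into the PMAP_TRM_MIN_ADDRESS..PMAP_TRM_MAX_ADDRESS window
* (reached through PTmap and the PTD[TRPTDI] slot installed below),
* separated by a "trm_guard" gap that is tunable via machdep.trm_guard.
* The region is intended for objects, such as the trampoline code and
* stacks, that must stay mapped while switching between the user and
* kernel page tables.
*/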
  5406. static vmem_t *pmap_trm_arena;
  5407. static vmem_addr_t pmap_trm_arena_last = PMAP_TRM_MIN_ADDRESS;
  5408. static int trm_guard = PAGE_SIZE;
  5409. static int
  5410. pmap_trm_import(void *unused __unused, vmem_size_t size, int flags,
  5411. vmem_addr_t *addrp)
  5412. {
  5413. vm_page_t m;
  5414. vmem_addr_t af, addr, prev_addr;
  5415. pt_entry_t *trm_pte;
  5416. prev_addr = atomic_load_long(&pmap_trm_arena_last);
  5417. size = round_page(size) + trm_guard;
  5418. for (;;) {
  5419. if (prev_addr + size < prev_addr || prev_addr + size < size ||
  5420. prev_addr + size > PMAP_TRM_MAX_ADDRESS)
  5421. return (ENOMEM);
  5422. addr = prev_addr + size;
  5423. if (atomic_fcmpset_int(&pmap_trm_arena_last, &prev_addr, addr))
  5424. break;
  5425. }
  5426. prev_addr += trm_guard;
  5427. trm_pte = PTmap + atop(prev_addr);
  5428. for (af = prev_addr; af < addr; af += PAGE_SIZE) {
  5429. m = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_NOBUSY |
  5430. VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_WAITOK);
  5431. pte_store(&trm_pte[atop(af - prev_addr)], VM_PAGE_TO_PHYS(m) |
  5432. PG_M | PG_A | PG_RW | PG_V | pgeflag |
  5433. pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE));
  5434. }
  5435. *addrp = prev_addr;
  5436. return (0);
  5437. }
  5438. static void
  5439. pmap_init_trm(void)
  5440. {
  5441. vm_page_t pd_m;
  5442. TUNABLE_INT_FETCH("machdep.trm_guard", &trm_guard);
  5443. if ((trm_guard & PAGE_MASK) != 0)
  5444. trm_guard = 0;
  5445. pmap_trm_arena = vmem_create("i386trampoline", 0, 0, 1, 0, M_WAITOK);
  5446. vmem_set_import(pmap_trm_arena, pmap_trm_import, NULL, NULL, PAGE_SIZE);
  5447. pd_m = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_NOBUSY |
  5448. VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_WAITOK | VM_ALLOC_ZERO);
  5449. if ((pd_m->flags & PG_ZERO) == 0)
  5450. pmap_zero_page(pd_m);
  5451. PTD[TRPTDI] = VM_PAGE_TO_PHYS(pd_m) | PG_M | PG_A | PG_RW | PG_V |
  5452. pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, TRUE);
  5453. }
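/*
* A minimal usage sketch ("p" is a placeholder, not a name used in
* this file):
*
* void *p = pmap_trm_alloc(PAGE_SIZE, M_WAITOK | M_ZERO);
* ...
* pmap_trm_free(p, PAGE_SIZE);
*
* Both calls round the size up to a multiple of 4 bytes, so the size
* passed to pmap_trm_free() must match the one used for allocation.
*/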
  5454. void *
  5455. pmap_trm_alloc(size_t size, int flags)
  5456. {
  5457. vmem_addr_t res;
  5458. int error;
  5459. MPASS((flags & ~(M_WAITOK | M_NOWAIT | M_ZERO)) == 0);
  5460. error = vmem_xalloc(pmap_trm_arena, roundup2(size, 4), sizeof(int),
  5461. 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, flags | M_FIRSTFIT, &res);
  5462. if (error != 0)
  5463. return (NULL);
  5464. if ((flags & M_ZERO) != 0)
  5465. bzero((void *)res, size);
  5466. return ((void *)res);
  5467. }
  5468. void
  5469. pmap_trm_free(void *addr, size_t size)
  5470. {
  5471. vmem_free(pmap_trm_arena, (uintptr_t)addr, roundup2(size, 4));
  5472. }
  5473. #if defined(PMAP_DEBUG)
  5474. int pmap_pid_dump(int pid)
  5475. {
  5476. pmap_t pmap;
  5477. struct proc *p;
  5478. int npte = 0;
  5479. int index;
  5480. sx_slock(&allproc_lock);
  5481. FOREACH_PROC_IN_SYSTEM(p) {
  5482. if (p->p_pid != pid)
  5483. continue;
  5484. if (p->p_vmspace) {
  5485. int i,j;
  5486. index = 0;
  5487. pmap = vmspace_pmap(p->p_vmspace);
  5488. for (i = 0; i < NPDEPTD; i++) {
  5489. pd_entry_t *pde;
  5490. pt_entry_t *pte;
  5491. vm_offset_t base = i << PDRSHIFT;
  5492. pde = &pmap->pm_pdir[i];
  5493. if (pde && pmap_pde_v(pde)) {
  5494. for (j = 0; j < NPTEPG; j++) {
  5495. vm_offset_t va = base + (j << PAGE_SHIFT);
  5496. if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) {
  5497. if (index) {
  5498. index = 0;
  5499. printf("\n");
  5500. }
  5501. sx_sunlock(&allproc_lock);
  5502. return (npte);
  5503. }
  5504. pte = pmap_pte(pmap, va);
  5505. if (pte && pmap_pte_v(pte)) {
  5506. pt_entry_t pa;
  5507. vm_page_t m;
  5508. pa = *pte;
  5509. m = PHYS_TO_VM_PAGE(pa & PG_FRAME);
  5510. printf("va: 0x%x, pt: 0x%jx, h: %d, w: %d, f: 0x%x",
  5511. va, (uintmax_t)pa, m->hold_count, m->wire_count, m->flags);
  5512. npte++;
  5513. index++;
  5514. if (index >= 2) {
  5515. index = 0;
  5516. printf("\n");
  5517. } else {
  5518. printf(" ");
  5519. }
  5520. }
  5521. }
  5522. }
  5523. }
  5524. }
  5525. }
  5526. sx_sunlock(&allproc_lock);
  5527. return (npte);
  5528. }
  5529. #endif