HardenedBSD src tree https://hardenedbsd.org/
  1. /*-
  2. * SPDX-License-Identifier: BSD-4-Clause
  3. *
  4. * Copyright (c) 1991 Regents of the University of California.
  5. * All rights reserved.
  6. * Copyright (c) 1994 John S. Dyson
  7. * All rights reserved.
  8. * Copyright (c) 1994 David Greenman
  9. * All rights reserved.
  10. * Copyright (c) 2003 Peter Wemm
  11. * All rights reserved.
  12. * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
  13. * All rights reserved.
  14. *
  15. * This code is derived from software contributed to Berkeley by
  16. * the Systems Programming Group of the University of Utah Computer
  17. * Science Department and William Jolitz of UUNET Technologies Inc.
  18. *
  19. * Redistribution and use in source and binary forms, with or without
  20. * modification, are permitted provided that the following conditions
  21. * are met:
  22. * 1. Redistributions of source code must retain the above copyright
  23. * notice, this list of conditions and the following disclaimer.
  24. * 2. Redistributions in binary form must reproduce the above copyright
  25. * notice, this list of conditions and the following disclaimer in the
  26. * documentation and/or other materials provided with the distribution.
  27. * 3. All advertising materials mentioning features or use of this software
  28. * must display the following acknowledgement:
  29. * This product includes software developed by the University of
  30. * California, Berkeley and its contributors.
  31. * 4. Neither the name of the University nor the names of its contributors
  32. * may be used to endorse or promote products derived from this software
  33. * without specific prior written permission.
  34. *
  35. * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  36. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  37. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  38. * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  39. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  40. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  41. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  42. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  43. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  44. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  45. * SUCH DAMAGE.
  46. *
  47. * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91
  48. */
  49. /*-
  50. * Copyright (c) 2003 Networks Associates Technology, Inc.
  51. * Copyright (c) 2014-2019 The FreeBSD Foundation
  52. * All rights reserved.
  53. *
  54. * This software was developed for the FreeBSD Project by Jake Burkholder,
  55. * Safeport Network Services, and Network Associates Laboratories, the
  56. * Security Research Division of Network Associates, Inc. under
  57. * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
  58. * CHATS research program.
  59. *
  60. * Portions of this software were developed by
  61. * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
  62. * the FreeBSD Foundation.
  63. *
  64. * Redistribution and use in source and binary forms, with or without
  65. * modification, are permitted provided that the following conditions
  66. * are met:
  67. * 1. Redistributions of source code must retain the above copyright
  68. * notice, this list of conditions and the following disclaimer.
  69. * 2. Redistributions in binary form must reproduce the above copyright
  70. * notice, this list of conditions and the following disclaimer in the
  71. * documentation and/or other materials provided with the distribution.
  72. *
  73. * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  74. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  75. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  76. * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  77. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  78. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  79. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  80. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  81. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  82. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  83. * SUCH DAMAGE.
  84. */
  85. #define AMD64_NPT_AWARE
  86. #include <sys/cdefs.h>
  87. __FBSDID("$FreeBSD$");
  88. /*
  89. * Manages physical address maps.
  90. *
  91. * Since the information managed by this module is
  92. * also stored by the logical address mapping module,
  93. * this module may throw away valid virtual-to-physical
  94. * mappings at almost any time. However, invalidations
  95. * of virtual-to-physical mappings must be done as
  96. * requested.
  97. *
  98. * In order to cope with hardware architectures which
  99. * make virtual-to-physical map invalidates expensive,
  100. * this module may delay invalidation or reduced-protection
  101. * operations until such time as they are actually
  102. * necessary. This module is given full information as
  103. * to which processors are currently using which maps,
  104. * and to when physical maps must be made correct.
  105. */
  106. #include "opt_ddb.h"
  107. #include "opt_pax.h"
  108. #include "opt_pmap.h"
  109. #include "opt_vm.h"
  110. #include <sys/param.h>
  111. #include <sys/bitstring.h>
  112. #include <sys/bus.h>
  113. #include <sys/systm.h>
  114. #include <sys/kernel.h>
  115. #include <sys/ktr.h>
  116. #include <sys/lock.h>
  117. #include <sys/malloc.h>
  118. #include <sys/mman.h>
  119. #include <sys/mutex.h>
  120. #include <sys/proc.h>
  121. #include <sys/rangeset.h>
  122. #include <sys/rwlock.h>
  123. #include <sys/sbuf.h>
  124. #include <sys/sx.h>
  125. #include <sys/turnstile.h>
  126. #include <sys/vmem.h>
  127. #include <sys/vmmeter.h>
  128. #include <sys/sched.h>
  129. #include <sys/sysctl.h>
  130. #include <sys/smp.h>
  131. #ifdef DDB
  132. #include <sys/kdb.h>
  133. #include <ddb/ddb.h>
  134. #endif
  135. #include <vm/vm.h>
  136. #include <vm/vm_param.h>
  137. #include <vm/vm_kern.h>
  138. #include <vm/vm_page.h>
  139. #include <vm/vm_map.h>
  140. #include <vm/vm_object.h>
  141. #include <vm/vm_extern.h>
  142. #include <vm/vm_pageout.h>
  143. #include <vm/vm_pager.h>
  144. #include <vm/vm_phys.h>
  145. #include <vm/vm_radix.h>
  146. #include <vm/vm_reserv.h>
  147. #include <vm/uma.h>
  148. #include <machine/intr_machdep.h>
  149. #include <x86/apicvar.h>
  150. #include <x86/ifunc.h>
  151. #include <machine/cpu.h>
  152. #include <machine/cputypes.h>
  153. #include <machine/md_var.h>
  154. #include <machine/pcb.h>
  155. #include <machine/specialreg.h>
  156. #ifdef SMP
  157. #include <machine/smp.h>
  158. #endif
  159. #include <machine/sysarch.h>
  160. #include <machine/tss.h>
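/*
 * The helpers below return the page-table bit masks appropriate for a given
 * pmap type.  Native x86 and AMD RVI (NPT) page tables use the usual
 * X86_PG_* bits.  Intel EPT encodes these attributes differently, and when
 * the hardware lacks EPT A/D bit support (pmap_emulate_ad_bits()), the
 * accessed/modified state is tracked with the read/write permission bits
 * together with the EPT_PG_EMUL_* software bits.
 */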
  161. static __inline boolean_t
  162. pmap_type_guest(pmap_t pmap)
  163. {
  164. return ((pmap->pm_type == PT_EPT) || (pmap->pm_type == PT_RVI));
  165. }
  166. static __inline boolean_t
  167. pmap_emulate_ad_bits(pmap_t pmap)
  168. {
  169. return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0);
  170. }
  171. static __inline pt_entry_t
  172. pmap_valid_bit(pmap_t pmap)
  173. {
  174. pt_entry_t mask;
  175. switch (pmap->pm_type) {
  176. case PT_X86:
  177. case PT_RVI:
  178. mask = X86_PG_V;
  179. break;
  180. case PT_EPT:
  181. if (pmap_emulate_ad_bits(pmap))
  182. mask = EPT_PG_EMUL_V;
  183. else
  184. mask = EPT_PG_READ;
  185. break;
  186. default:
  187. panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type);
  188. }
  189. return (mask);
  190. }
  191. static __inline pt_entry_t
  192. pmap_rw_bit(pmap_t pmap)
  193. {
  194. pt_entry_t mask;
  195. switch (pmap->pm_type) {
  196. case PT_X86:
  197. case PT_RVI:
  198. mask = X86_PG_RW;
  199. break;
  200. case PT_EPT:
  201. if (pmap_emulate_ad_bits(pmap))
  202. mask = EPT_PG_EMUL_RW;
  203. else
  204. mask = EPT_PG_WRITE;
  205. break;
  206. default:
  207. panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type);
  208. }
  209. return (mask);
  210. }
  211. static pt_entry_t pg_g;
  212. static __inline pt_entry_t
  213. pmap_global_bit(pmap_t pmap)
  214. {
  215. pt_entry_t mask;
  216. switch (pmap->pm_type) {
  217. case PT_X86:
  218. mask = pg_g;
  219. break;
  220. case PT_RVI:
  221. case PT_EPT:
  222. mask = 0;
  223. break;
  224. default:
  225. panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type);
  226. }
  227. return (mask);
  228. }
  229. static __inline pt_entry_t
  230. pmap_accessed_bit(pmap_t pmap)
  231. {
  232. pt_entry_t mask;
  233. switch (pmap->pm_type) {
  234. case PT_X86:
  235. case PT_RVI:
  236. mask = X86_PG_A;
  237. break;
  238. case PT_EPT:
  239. if (pmap_emulate_ad_bits(pmap))
  240. mask = EPT_PG_READ;
  241. else
  242. mask = EPT_PG_A;
  243. break;
  244. default:
  245. panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type);
  246. }
  247. return (mask);
  248. }
  249. static __inline pt_entry_t
  250. pmap_modified_bit(pmap_t pmap)
  251. {
  252. pt_entry_t mask;
  253. switch (pmap->pm_type) {
  254. case PT_X86:
  255. case PT_RVI:
  256. mask = X86_PG_M;
  257. break;
  258. case PT_EPT:
  259. if (pmap_emulate_ad_bits(pmap))
  260. mask = EPT_PG_WRITE;
  261. else
  262. mask = EPT_PG_M;
  263. break;
  264. default:
  265. panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type);
  266. }
  267. return (mask);
  268. }
  269. static __inline pt_entry_t
  270. pmap_pku_mask_bit(pmap_t pmap)
  271. {
  272. return (pmap->pm_type == PT_X86 ? X86_PG_PKU_MASK : 0);
  273. }
  274. #if !defined(DIAGNOSTIC)
  275. #ifdef __GNUC_GNU_INLINE__
  276. #define PMAP_INLINE __attribute__((__gnu_inline__)) inline
  277. #else
  278. #define PMAP_INLINE extern inline
  279. #endif
  280. #else
  281. #define PMAP_INLINE
  282. #endif
  283. #ifdef PV_STATS
  284. #define PV_STAT(x) do { x ; } while (0)
  285. #else
  286. #define PV_STAT(x) do { } while (0)
  287. #endif
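/*
 * The physical-to-virtual (PV) metadata below is striped by 2MB frame:
 * pa_index() converts a physical address to its 2MB-frame index,
 * pa_to_pvh() returns the matching pv_table entry, and
 * PHYS_TO_PV_LIST_LOCK() picks one of NPV_LIST_LOCKS rwlocks by that index.
 * The CHANGE/RELEASE macros keep callers holding at most one PV list lock
 * at a time while walking mappings.
 */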
  288. #define pa_index(pa) ((pa) >> PDRSHIFT)
  289. #define pa_to_pvh(pa) (&pv_table[pa_index(pa)])
  290. #define NPV_LIST_LOCKS MAXCPU
  291. #define PHYS_TO_PV_LIST_LOCK(pa) \
  292. (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])
  293. #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \
  294. struct rwlock **_lockp = (lockp); \
  295. struct rwlock *_new_lock; \
  296. \
  297. _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \
  298. if (_new_lock != *_lockp) { \
  299. if (*_lockp != NULL) \
  300. rw_wunlock(*_lockp); \
  301. *_lockp = _new_lock; \
  302. rw_wlock(*_lockp); \
  303. } \
  304. } while (0)
  305. #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \
  306. CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
  307. #define RELEASE_PV_LIST_LOCK(lockp) do { \
  308. struct rwlock **_lockp = (lockp); \
  309. \
  310. if (*_lockp != NULL) { \
  311. rw_wunlock(*_lockp); \
  312. *_lockp = NULL; \
  313. } \
  314. } while (0)
  315. #define VM_PAGE_TO_PV_LIST_LOCK(m) \
  316. PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
  317. struct pmap kernel_pmap_store;
  318. vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */
  319. vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */
  320. int nkpt;
  321. SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0,
  322. "Number of kernel page table pages allocated on bootup");
  323. static int ndmpdp;
  324. vm_paddr_t dmaplimit;
  325. vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
  326. pt_entry_t pg_nx;
  327. static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
  328. /* Unused, kept for ABI stability on the stable branch. */
  329. static int pat_works = 1;
  330. SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1,
  331. "Is page attribute table fully functional?");
  332. static int pg_ps_enabled = 1;
  333. SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
  334. &pg_ps_enabled, 0, "Are large page mappings enabled?");
  335. #define PAT_INDEX_SIZE 8
  336. static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */
  337. static u_int64_t KPTphys; /* phys addr of kernel level 1 */
  338. static u_int64_t KPDphys; /* phys addr of kernel level 2 */
  339. u_int64_t KPDPphys; /* phys addr of kernel level 3 */
  340. u_int64_t KPML4phys; /* phys addr of kernel level 4 */
  341. static u_int64_t DMPDphys; /* phys addr of direct mapped level 2 */
  342. static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */
  343. static int ndmpdpphys; /* number of DMPDPphys pages */
  344. static vm_paddr_t KERNend; /* phys addr of end of bootstrap data */
  345. /*
  346. * pmap_mapdev() support before pmap initialization (i.e., the console)
  347. */
  348. #define PMAP_PREINIT_MAPPING_COUNT 8
  349. static struct pmap_preinit_mapping {
  350. vm_paddr_t pa;
  351. vm_offset_t va;
  352. vm_size_t sz;
  353. int mode;
  354. } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
  355. static int pmap_initialized;
  356. /*
  357. * Data for the pv entry allocation mechanism.
  358. * Updates to pv_invl_gen are protected by the pv_list_locks[]
  359. * elements, but reads are not.
  360. */
  361. static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
  362. static struct mtx __exclusive_cache_line pv_chunks_mutex;
  363. static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS];
  364. static u_long pv_invl_gen[NPV_LIST_LOCKS];
  365. static struct md_page *pv_table;
  366. static struct md_page pv_dummy;
  367. /*
  368. * All those kernel PT submaps that BSD is so fond of
  369. */
  370. pt_entry_t *CMAP1 = NULL;
  371. caddr_t CADDR1 = 0;
  372. static vm_offset_t qframe = 0;
  373. static struct mtx qframe_mtx;
  374. static int pmap_flags = PMAP_PDE_SUPERPAGE; /* flags for x86 pmaps */
  375. static vmem_t *large_vmem;
  376. static u_int lm_ents;
  377. #define PMAP_ADDRESS_IN_LARGEMAP(va) ((va) >= LARGEMAP_MIN_ADDRESS && \
  378. (va) < LARGEMAP_MIN_ADDRESS + NBPML4 * (u_long)lm_ents)
  379. int pmap_pcid_enabled = 1;
  380. SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
  381. &pmap_pcid_enabled, 0, "Is TLB Context ID enabled ?");
  382. int invpcid_works = 0;
  383. SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0,
  384. "Is the invpcid instruction available ?");
  385. #ifdef PAX
  386. /* The related part of code is in x86/identcpu.c - see pti_get_default() */
  387. int __read_frequently pti = 1;
  388. #else
  389. int __read_frequently pti = 0;
  390. #endif
  391. SYSCTL_INT(_vm_pmap, OID_AUTO, pti, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
  392. &pti, 0,
  393. "Page Table Isolation enabled");
  394. static vm_object_t pti_obj;
  395. static pml4_entry_t *pti_pml4;
  396. static vm_pindex_t pti_pg_idx;
  397. static bool pti_finalized;
  398. struct pmap_pkru_range {
  399. struct rs_el pkru_rs_el;
  400. u_int pkru_keyidx;
  401. int pkru_flags;
  402. };
  403. static uma_zone_t pmap_pkru_ranges_zone;
  404. static bool pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
  405. static pt_entry_t pmap_pkru_get(pmap_t pmap, vm_offset_t va);
  406. static void pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
  407. static void *pkru_dup_range(void *ctx, void *data);
  408. static void pkru_free_range(void *ctx, void *node);
  409. static int pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap);
  410. static int pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
  411. static void pmap_pkru_deassign_all(pmap_t pmap);
  412. static int
  413. pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS)
  414. {
  415. int i;
  416. uint64_t res;
  417. res = 0;
  418. CPU_FOREACH(i) {
  419. res += cpuid_to_pcpu[i]->pc_pm_save_cnt;
  420. }
  421. return (sysctl_handle_64(oidp, &res, 0, req));
  422. }
  423. SYSCTL_PROC(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLTYPE_U64 | CTLFLAG_RD |
  424. CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU",
  425. "Count of saved TLB context on switch");
  426. static LIST_HEAD(, pmap_invl_gen) pmap_invl_gen_tracker =
  427. LIST_HEAD_INITIALIZER(&pmap_invl_gen_tracker);
  428. static struct mtx invl_gen_mtx;
  429. /* Fake lock object to satisfy turnstiles interface. */
  430. static struct lock_object invl_gen_ts = {
  431. .lo_name = "invlts",
  432. };
  433. static struct pmap_invl_gen pmap_invl_gen_head = {
  434. .gen = 1,
  435. .next = NULL,
  436. };
  437. static u_long pmap_invl_gen = 1;
  438. static int pmap_invl_waiters;
  439. static struct callout pmap_invl_callout;
  440. static bool pmap_invl_callout_inited;
  441. #define PMAP_ASSERT_NOT_IN_DI() \
  442. KASSERT(pmap_not_in_di(), ("DI already started"))
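/*
 * Two implementations of the delayed invalidation (DI) machinery exist: a
 * simple one serialized by invl_gen_mtx and a lockless one built on
 * cmpxchg16b.  pmap_di_locked() selects the locked variant when the CPU
 * lacks CMPXCHG16B or when the vm.pmap.di_locked tunable is set; the choice
 * is resolved once at boot through the ifunc definitions further below.
 */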
  443. static bool
  444. pmap_di_locked(void)
  445. {
  446. int tun;
  447. if ((cpu_feature2 & CPUID2_CX16) == 0)
  448. return (true);
  449. tun = 0;
  450. TUNABLE_INT_FETCH("vm.pmap.di_locked", &tun);
  451. return (tun != 0);
  452. }
  453. static int
  454. sysctl_pmap_di_locked(SYSCTL_HANDLER_ARGS)
  455. {
  456. int locked;
  457. locked = pmap_di_locked();
  458. return (sysctl_handle_int(oidp, &locked, 0, req));
  459. }
  460. SYSCTL_PROC(_vm_pmap, OID_AUTO, di_locked, CTLTYPE_INT | CTLFLAG_RDTUN |
  461. CTLFLAG_MPSAFE, 0, 0, sysctl_pmap_di_locked, "",
  462. "Locked delayed invalidation");
  463. static bool pmap_not_in_di_l(void);
  464. static bool pmap_not_in_di_u(void);
  465. DEFINE_IFUNC(, bool, pmap_not_in_di, (void), static)
  466. {
  467. return (pmap_di_locked() ? pmap_not_in_di_l : pmap_not_in_di_u);
  468. }
  469. static bool
  470. pmap_not_in_di_l(void)
  471. {
  472. struct pmap_invl_gen *invl_gen;
  473. invl_gen = &curthread->td_md.md_invl_gen;
  474. return (invl_gen->gen == 0);
  475. }
  476. static void
  477. pmap_thread_init_invl_gen_l(struct thread *td)
  478. {
  479. struct pmap_invl_gen *invl_gen;
  480. invl_gen = &td->td_md.md_invl_gen;
  481. invl_gen->gen = 0;
  482. }
  483. static void
  484. pmap_delayed_invl_wait_block(u_long *m_gen, u_long *invl_gen)
  485. {
  486. struct turnstile *ts;
  487. ts = turnstile_trywait(&invl_gen_ts);
  488. if (*m_gen > atomic_load_long(invl_gen))
  489. turnstile_wait(ts, NULL, TS_SHARED_QUEUE);
  490. else
  491. turnstile_cancel(ts);
  492. }
  493. static void
  494. pmap_delayed_invl_finish_unblock(u_long new_gen)
  495. {
  496. struct turnstile *ts;
  497. turnstile_chain_lock(&invl_gen_ts);
  498. ts = turnstile_lookup(&invl_gen_ts);
  499. if (new_gen != 0)
  500. pmap_invl_gen = new_gen;
  501. if (ts != NULL) {
  502. turnstile_broadcast(ts, TS_SHARED_QUEUE);
  503. turnstile_unpend(ts);
  504. }
  505. turnstile_chain_unlock(&invl_gen_ts);
  506. }
  507. /*
  508. * Start a new Delayed Invalidation (DI) block of code, executed by
  509. * the current thread. Within a DI block, the current thread may
  510. * destroy both the page table and PV list entries for a mapping and
  511. * then release the corresponding PV list lock before ensuring that
  512. * the mapping is flushed from the TLBs of any processors with the
  513. * pmap active.
  514. */
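/*
 * Illustrative sketch of a DI block, modeled on pmap_remove() (not a
 * verbatim excerpt from this file):
 *
 *	pmap_delayed_invl_start();
 *	... remove PTEs and PV entries, calling pmap_delayed_invl_page(m)
 *	    for each managed page touched ...
 *	pmap_invalidate_all(pmap);	(or a ranged invalidation)
 *	pmap_delayed_invl_finish();
 */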
  515. static void
  516. pmap_delayed_invl_start_l(void)
  517. {
  518. struct pmap_invl_gen *invl_gen;
  519. u_long currgen;
  520. invl_gen = &curthread->td_md.md_invl_gen;
  521. PMAP_ASSERT_NOT_IN_DI();
  522. mtx_lock(&invl_gen_mtx);
  523. if (LIST_EMPTY(&pmap_invl_gen_tracker))
  524. currgen = pmap_invl_gen;
  525. else
  526. currgen = LIST_FIRST(&pmap_invl_gen_tracker)->gen;
  527. invl_gen->gen = currgen + 1;
  528. LIST_INSERT_HEAD(&pmap_invl_gen_tracker, invl_gen, link);
  529. mtx_unlock(&invl_gen_mtx);
  530. }
  531. /*
  532. * Finish the DI block, previously started by the current thread. All
  533. * required TLB flushes for the pages marked by
  534. * pmap_delayed_invl_page() must be finished before this function is
  535. * called.
  536. *
  537. * This function works by bumping the global DI generation number to
  538. * the generation number of the current thread's DI, unless there is a
  539. * pending DI that started earlier. In the latter case, bumping the
  540. * global DI generation number would incorrectly signal that the
  541. * earlier DI had finished. Instead, this function bumps the earlier
  542. * DI's generation number to match the generation number of the
  543. * current thread's DI.
  544. */
  545. static void
  546. pmap_delayed_invl_finish_l(void)
  547. {
  548. struct pmap_invl_gen *invl_gen, *next;
  549. invl_gen = &curthread->td_md.md_invl_gen;
  550. KASSERT(invl_gen->gen != 0, ("missed invl_start"));
  551. mtx_lock(&invl_gen_mtx);
  552. next = LIST_NEXT(invl_gen, link);
  553. if (next == NULL)
  554. pmap_delayed_invl_finish_unblock(invl_gen->gen);
  555. else
  556. next->gen = invl_gen->gen;
  557. LIST_REMOVE(invl_gen, link);
  558. mtx_unlock(&invl_gen_mtx);
  559. invl_gen->gen = 0;
  560. }
  561. static bool
  562. pmap_not_in_di_u(void)
  563. {
  564. struct pmap_invl_gen *invl_gen;
  565. invl_gen = &curthread->td_md.md_invl_gen;
  566. return (((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) != 0);
  567. }
  568. static void
  569. pmap_thread_init_invl_gen_u(struct thread *td)
  570. {
  571. struct pmap_invl_gen *invl_gen;
  572. invl_gen = &td->td_md.md_invl_gen;
  573. invl_gen->gen = 0;
  574. invl_gen->next = (void *)PMAP_INVL_GEN_NEXT_INVALID;
  575. }
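/*
 * Lockless DI list nodes are {gen, next} pairs read and updated as a single
 * 16-byte unit with cmpxchg16b.  The low bit of the next pointer
 * (PMAP_INVL_GEN_NEXT_INVALID) marks a node that is being inserted or
 * removed.  pmap_di_load_invl() performs an atomic 16-byte read (a
 * cmpxchg16b that expects zero) and fails if that bit is set in the loaded
 * next pointer; pmap_di_store_invl() is the corresponding atomic
 * compare-and-swap of the whole pair.
 */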
  576. static bool
  577. pmap_di_load_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *out)
  578. {
  579. uint64_t new_high, new_low, old_high, old_low;
  580. char res;
  581. old_low = new_low = 0;
  582. old_high = new_high = (uintptr_t)0;
  583. __asm volatile("lock;cmpxchg16b\t%1;sete\t%0"
  584. : "=r" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high)
  585. : "b"(new_low), "c" (new_high)
  586. : "memory", "cc");
  587. if (res == 0) {
  588. if ((old_high & PMAP_INVL_GEN_NEXT_INVALID) != 0)
  589. return (false);
  590. out->gen = old_low;
  591. out->next = (void *)old_high;
  592. } else {
  593. out->gen = new_low;
  594. out->next = (void *)new_high;
  595. }
  596. return (true);
  597. }
  598. static bool
  599. pmap_di_store_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *old_val,
  600. struct pmap_invl_gen *new_val)
  601. {
  602. uint64_t new_high, new_low, old_high, old_low;
  603. char res;
  604. new_low = new_val->gen;
  605. new_high = (uintptr_t)new_val->next;
  606. old_low = old_val->gen;
  607. old_high = (uintptr_t)old_val->next;
  608. __asm volatile("lock;cmpxchg16b\t%1;sete\t%0"
  609. : "=r" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high)
  610. : "b"(new_low), "c" (new_high)
  611. : "memory", "cc");
  612. return (res);
  613. }
  614. #ifdef PV_STATS
  615. static long invl_start_restart;
  616. SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_start_restart, CTLFLAG_RD,
  617. &invl_start_restart, 0,
  618. "");
  619. static long invl_finish_restart;
  620. SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_finish_restart, CTLFLAG_RD,
  621. &invl_finish_restart, 0,
  622. "");
  623. static int invl_max_qlen;
  624. SYSCTL_INT(_vm_pmap, OID_AUTO, invl_max_qlen, CTLFLAG_RD,
  625. &invl_max_qlen, 0,
  626. "");
  627. #endif
  628. static struct lock_delay_config __read_frequently di_delay;
  629. LOCK_DELAY_SYSINIT_DEFAULT(di_delay);
  630. static void
  631. pmap_delayed_invl_start_u(void)
  632. {
  633. struct pmap_invl_gen *invl_gen, *p, prev, new_prev;
  634. struct thread *td;
  635. struct lock_delay_arg lda;
  636. uintptr_t prevl;
  637. u_char pri;
  638. #ifdef PV_STATS
  639. int i, ii;
  640. #endif
  641. td = curthread;
  642. invl_gen = &td->td_md.md_invl_gen;
  643. PMAP_ASSERT_NOT_IN_DI();
  644. lock_delay_arg_init(&lda, &di_delay);
  645. invl_gen->saved_pri = 0;
  646. pri = td->td_base_pri;
  647. if (pri > PVM) {
  648. thread_lock(td);
  649. pri = td->td_base_pri;
  650. if (pri > PVM) {
  651. invl_gen->saved_pri = pri;
  652. sched_prio(td, PVM);
  653. }
  654. thread_unlock(td);
  655. }
  656. again:
  657. PV_STAT(i = 0);
  658. for (p = &pmap_invl_gen_head;; p = prev.next) {
  659. PV_STAT(i++);
  660. prevl = atomic_load_ptr(&p->next);
  661. if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) {
  662. PV_STAT(atomic_add_long(&invl_start_restart, 1));
  663. lock_delay(&lda);
  664. goto again;
  665. }
  666. if (prevl == 0)
  667. break;
  668. prev.next = (void *)prevl;
  669. }
  670. #ifdef PV_STATS
  671. if ((ii = invl_max_qlen) < i)
  672. atomic_cmpset_int(&invl_max_qlen, ii, i);
  673. #endif
  674. if (!pmap_di_load_invl(p, &prev) || prev.next != NULL) {
  675. PV_STAT(atomic_add_long(&invl_start_restart, 1));
  676. lock_delay(&lda);
  677. goto again;
  678. }
  679. new_prev.gen = prev.gen;
  680. new_prev.next = invl_gen;
  681. invl_gen->gen = prev.gen + 1;
  682. /* Formal fence between store to invl->gen and updating *p. */
  683. atomic_thread_fence_rel();
  684. /*
  685. * After inserting an invl_gen element with invalid bit set,
  686. * this thread blocks any other thread trying to enter the
  687. * delayed invalidation block. Do not allow ourselves to be removed
  688. * from the CPU, because that causes starvation for other threads.
  689. */
  690. critical_enter();
  691. /*
  692. * ABA for *p is not possible there, since p->gen can only
  693. * increase. So if the *p thread finished its di, then
  694. * started a new one and got inserted into the list at the
  695. * same place, its gen will appear greater than the previously
  696. * read gen.
  697. */
  698. if (!pmap_di_store_invl(p, &prev, &new_prev)) {
  699. critical_exit();
  700. PV_STAT(atomic_add_long(&invl_start_restart, 1));
  701. lock_delay(&lda);
  702. goto again;
  703. }
  704. /*
  705. * Here we clear PMAP_INVL_GEN_NEXT_INVALID in
  706. * invl_gen->next, allowing other threads to iterate past us.
  707. * pmap_di_store_invl() provides a fence between the generation
  708. * write and the update of next.
  709. */
  710. invl_gen->next = NULL;
  711. critical_exit();
  712. }
  713. static bool
  714. pmap_delayed_invl_finish_u_crit(struct pmap_invl_gen *invl_gen,
  715. struct pmap_invl_gen *p)
  716. {
  717. struct pmap_invl_gen prev, new_prev;
  718. u_long mygen;
  719. /*
  720. * Load invl_gen->gen after setting invl_gen->next
  721. * PMAP_INVL_GEN_NEXT_INVALID. This prevents larger
  722. * generations to propagate to our invl_gen->gen. Lock prefix
  723. * in atomic_set_ptr() worked as seq_cst fence.
  724. */
  725. mygen = atomic_load_long(&invl_gen->gen);
  726. if (!pmap_di_load_invl(p, &prev) || prev.next != invl_gen)
  727. return (false);
  728. KASSERT(prev.gen < mygen,
  729. ("invalid di gen sequence %lu %lu", prev.gen, mygen));
  730. new_prev.gen = mygen;
  731. new_prev.next = (void *)((uintptr_t)invl_gen->next &
  732. ~PMAP_INVL_GEN_NEXT_INVALID);
  733. /* Formal fence between load of prev and storing update to it. */
  734. atomic_thread_fence_rel();
  735. return (pmap_di_store_invl(p, &prev, &new_prev));
  736. }
  737. static void
  738. pmap_delayed_invl_finish_u(void)
  739. {
  740. struct pmap_invl_gen *invl_gen, *p;
  741. struct thread *td;
  742. struct lock_delay_arg lda;
  743. uintptr_t prevl;
  744. td = curthread;
  745. invl_gen = &td->td_md.md_invl_gen;
  746. KASSERT(invl_gen->gen != 0, ("missed invl_start: gen 0"));
  747. KASSERT(((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) == 0,
  748. ("missed invl_start: INVALID"));
  749. lock_delay_arg_init(&lda, &di_delay);
  750. again:
  751. for (p = &pmap_invl_gen_head; p != NULL; p = (void *)prevl) {
  752. prevl = atomic_load_ptr(&p->next);
  753. if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) {
  754. PV_STAT(atomic_add_long(&invl_finish_restart, 1));
  755. lock_delay(&lda);
  756. goto again;
  757. }
  758. if ((void *)prevl == invl_gen)
  759. break;
  760. }
  761. /*
  762. * It is legitimate not to find ourselves on the list if a
  763. * thread before us finished its DI and started it again.
  764. */
  765. if (__predict_false(p == NULL)) {
  766. PV_STAT(atomic_add_long(&invl_finish_restart, 1));
  767. lock_delay(&lda);
  768. goto again;
  769. }
  770. critical_enter();
  771. atomic_set_ptr((uintptr_t *)&invl_gen->next,
  772. PMAP_INVL_GEN_NEXT_INVALID);
  773. if (!pmap_delayed_invl_finish_u_crit(invl_gen, p)) {
  774. atomic_clear_ptr((uintptr_t *)&invl_gen->next,
  775. PMAP_INVL_GEN_NEXT_INVALID);
  776. critical_exit();
  777. PV_STAT(atomic_add_long(&invl_finish_restart, 1));
  778. lock_delay(&lda);
  779. goto again;
  780. }
  781. critical_exit();
  782. if (atomic_load_int(&pmap_invl_waiters) > 0)
  783. pmap_delayed_invl_finish_unblock(0);
  784. if (invl_gen->saved_pri != 0) {
  785. thread_lock(td);
  786. sched_prio(td, invl_gen->saved_pri);
  787. thread_unlock(td);
  788. }
  789. }
  790. #ifdef DDB
  791. DB_SHOW_COMMAND(di_queue, pmap_di_queue)
  792. {
  793. struct pmap_invl_gen *p, *pn;
  794. struct thread *td;
  795. uintptr_t nextl;
  796. bool first;
  797. for (p = &pmap_invl_gen_head, first = true; p != NULL; p = pn,
  798. first = false) {
  799. nextl = atomic_load_ptr(&p->next);
  800. pn = (void *)(nextl & ~PMAP_INVL_GEN_NEXT_INVALID);
  801. td = first ? NULL : __containerof(p, struct thread,
  802. td_md.md_invl_gen);
  803. db_printf("gen %lu inv %d td %p tid %d\n", p->gen,
  804. (nextl & PMAP_INVL_GEN_NEXT_INVALID) != 0, td,
  805. td != NULL ? td->td_tid : -1);
  806. }
  807. }
  808. #endif
  809. #ifdef PV_STATS
  810. static long invl_wait;
  811. SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_wait, CTLFLAG_RD, &invl_wait, 0,
  812. "Number of times DI invalidation blocked pmap_remove_all/write");
  813. static long invl_wait_slow;
  814. SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_wait_slow, CTLFLAG_RD, &invl_wait_slow, 0,
  815. "Number of slow invalidation waits for lockless DI");
  816. #endif
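/*
 * Return a pointer to the DI generation counter associated with page m.
 * The counters are striped the same way as the PV list locks, so a single
 * counter may be shared by several pages.
 */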
  817. static u_long *
  818. pmap_delayed_invl_genp(vm_page_t m)
  819. {
  820. return (&pv_invl_gen[pa_index(VM_PAGE_TO_PHYS(m)) % NPV_LIST_LOCKS]);
  821. }
  822. static void
  823. pmap_delayed_invl_callout_func(void *arg __unused)
  824. {
  825. if (atomic_load_int(&pmap_invl_waiters) == 0)
  826. return;
  827. pmap_delayed_invl_finish_unblock(0);
  828. }
  829. static void
  830. pmap_delayed_invl_callout_init(void *arg __unused)
  831. {
  832. if (pmap_di_locked())
  833. return;
  834. callout_init(&pmap_invl_callout, 1);
  835. pmap_invl_callout_inited = true;
  836. }
  837. SYSINIT(pmap_di_callout, SI_SUB_CPU + 1, SI_ORDER_ANY,
  838. pmap_delayed_invl_callout_init, NULL);
  839. /*
  840. * Ensure that all currently executing DI blocks, that need to flush
  841. * TLB for the given page m, actually flushed the TLB at the time the
  842. * function returned. If the page m has an empty PV list and we call
  843. * pmap_delayed_invl_wait(), upon its return we know that no CPU has a
  844. * valid mapping for the page m in either its page table or TLB.
  845. *
  846. * This function works by blocking until the global DI generation
  847. * number catches up with the generation number associated with the
  848. * given page m and its PV list. Since this function's callers
  849. * typically own an object lock and sometimes own a page lock, it
  850. * cannot sleep. Instead, it blocks on a turnstile to relinquish the
  851. * processor.
  852. */
  853. static void
  854. pmap_delayed_invl_wait_l(vm_page_t m)
  855. {
  856. u_long *m_gen;
  857. #ifdef PV_STATS
  858. bool accounted = false;
  859. #endif
  860. m_gen = pmap_delayed_invl_genp(m);
  861. while (*m_gen > pmap_invl_gen) {
  862. #ifdef PV_STATS
  863. if (!accounted) {
  864. atomic_add_long(&invl_wait, 1);
  865. accounted = true;
  866. }
  867. #endif
  868. pmap_delayed_invl_wait_block(m_gen, &pmap_invl_gen);
  869. }
  870. }
  871. static void
  872. pmap_delayed_invl_wait_u(vm_page_t m)
  873. {
  874. u_long *m_gen;
  875. struct lock_delay_arg lda;
  876. bool fast;
  877. fast = true;
  878. m_gen = pmap_delayed_invl_genp(m);
  879. lock_delay_arg_init(&lda, &di_delay);
  880. while (*m_gen > atomic_load_long(&pmap_invl_gen_head.gen)) {
  881. if (fast || !pmap_invl_callout_inited) {
  882. PV_STAT(atomic_add_long(&invl_wait, 1));
  883. lock_delay(&lda);
  884. fast = false;
  885. } else {
  886. /*
  887. * The page's invalidation generation number
  888. * is still below the current thread's number.
  889. * Prepare to block so that we do not waste
  890. * CPU cycles or worse, suffer livelock.
  891. *
  892. * Since it is impossible to block without
  893. * racing with pmap_delayed_invl_finish_u(),
  894. * prepare for the race by incrementing
  895. * pmap_invl_waiters and arming a 1-tick
  896. * callout which will unblock us if we lose
  897. * the race.
  898. */
  899. atomic_add_int(&pmap_invl_waiters, 1);
  900. /*
  901. * Re-check the current thread's invalidation
  902. * generation after incrementing
  903. * pmap_invl_waiters, so that there is no race
  904. * with pmap_delayed_invl_finish_u() setting
  905. * the page generation and checking
  906. * pmap_invl_waiters. The only race allowed
  907. * is for a missed unblock, which is handled
  908. * by the callout.
  909. */
  910. if (*m_gen >
  911. atomic_load_long(&pmap_invl_gen_head.gen)) {
  912. callout_reset(&pmap_invl_callout, 1,
  913. pmap_delayed_invl_callout_func, NULL);
  914. PV_STAT(atomic_add_long(&invl_wait_slow, 1));
  915. pmap_delayed_invl_wait_block(m_gen,
  916. &pmap_invl_gen_head.gen);
  917. }
  918. atomic_add_int(&pmap_invl_waiters, -1);
  919. }
  920. }
  921. }
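/*
 * The remaining DI entry points are ifuncs: each resolver below runs once
 * at boot and binds the symbol to either the locked (_l) or lockless (_u)
 * implementation, as chosen by pmap_di_locked().
 */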
  922. DEFINE_IFUNC(, void, pmap_thread_init_invl_gen, (struct thread *), static)
  923. {
  924. return (pmap_di_locked() ? pmap_thread_init_invl_gen_l :
  925. pmap_thread_init_invl_gen_u);
  926. }
  927. DEFINE_IFUNC(static, void, pmap_delayed_invl_start, (void), static)
  928. {
  929. return (pmap_di_locked() ? pmap_delayed_invl_start_l :
  930. pmap_delayed_invl_start_u);
  931. }
  932. DEFINE_IFUNC(static, void, pmap_delayed_invl_finish, (void), static)
  933. {
  934. return (pmap_di_locked() ? pmap_delayed_invl_finish_l :
  935. pmap_delayed_invl_finish_u);
  936. }
  937. DEFINE_IFUNC(static, void, pmap_delayed_invl_wait, (vm_page_t), static)
  938. {
  939. return (pmap_di_locked() ? pmap_delayed_invl_wait_l :
  940. pmap_delayed_invl_wait_u);
  941. }
  942. /*
  943. * Mark the page m's PV list as participating in the current thread's
  944. * DI block. Any threads concurrently using m's PV list to remove or
  945. * restrict all mappings to m will wait for the current thread's DI
  946. * block to complete before proceeding.
  947. *
  948. * The function works by setting the DI generation number for m's PV
  949. * list to at least the DI generation number of the current thread.
  950. * This forces a caller of pmap_delayed_invl_wait() to block until
  951. * current thread calls pmap_delayed_invl_finish().
  952. */
  953. static void
  954. pmap_delayed_invl_page(vm_page_t m)
  955. {
  956. u_long gen, *m_gen;
  957. rw_assert(VM_PAGE_TO_PV_LIST_LOCK(m), RA_WLOCKED);
  958. gen = curthread->td_md.md_invl_gen.gen;
  959. if (gen == 0)
  960. return;
  961. m_gen = pmap_delayed_invl_genp(m);
  962. if (*m_gen < gen)
  963. *m_gen = gen;
  964. }
  965. /*
  966. * Crashdump maps.
  967. */
  968. static caddr_t crashdumpmap;
  969. /*
  970. * Internal flags for pmap_enter()'s helper functions.
  971. */
  972. #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */
  973. #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */
  974. /*
  975. * Internal flags for pmap_mapdev_internal() and
  976. * pmap_change_props_locked().
  977. */
  978. #define MAPDEV_FLUSHCACHE 0x00000001 /* Flush cache after mapping. */
  979. #define MAPDEV_SETATTR 0x00000002 /* Modify existing attrs. */
  980. #define MAPDEV_ASSERTVALID 0x00000004 /* Assert mapping validity. */
  981. static void free_pv_chunk(struct pv_chunk *pc);
  982. static void free_pv_entry(pmap_t pmap, pv_entry_t pv);
  983. static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
  984. static int popcnt_pc_map_pq(uint64_t *map);
  985. static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
  986. static void reserve_pv_entries(pmap_t pmap, int needed,
  987. struct rwlock **lockp);
  988. static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
  989. struct rwlock **lockp);
  990. static bool pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde,
  991. u_int flags, struct rwlock **lockp);
  992. #if VM_NRESERVLEVEL > 0
  993. static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
  994. struct rwlock **lockp);
  995. #endif
  996. static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
  997. static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
  998. vm_offset_t va);
  999. static int pmap_change_props_locked(vm_offset_t va, vm_size_t size,
  1000. vm_prot_t prot, int mode, int flags);
  1001. static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
  1002. static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde,
  1003. vm_offset_t va, struct rwlock **lockp);
  1004. static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe,
  1005. vm_offset_t va);
  1006. static bool pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m,
  1007. vm_prot_t prot, struct rwlock **lockp);
  1008. static int pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde,
  1009. u_int flags, vm_page_t m, struct rwlock **lockp);
  1010. static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
  1011. vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
  1012. static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
  1013. static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted);
  1014. static void pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva,
  1015. vm_offset_t eva);
  1016. static void pmap_invalidate_cache_range_all(vm_offset_t sva,
  1017. vm_offset_t eva);
  1018. static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va,
  1019. pd_entry_t pde);
  1020. static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
  1021. static vm_page_t pmap_large_map_getptp_unlocked(void);
  1022. static vm_paddr_t pmap_large_map_kextract(vm_offset_t va);
  1023. #if VM_NRESERVLEVEL > 0
  1024. static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
  1025. struct rwlock **lockp);
  1026. #endif
  1027. static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
  1028. vm_prot_t prot);
  1029. static void pmap_pte_props(pt_entry_t *pte, u_long bits, u_long mask);
  1030. static void pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva,
  1031. bool exec);
  1032. static pdp_entry_t *pmap_pti_pdpe(vm_offset_t va);
  1033. static pd_entry_t *pmap_pti_pde(vm_offset_t va);
  1034. static void pmap_pti_wire_pte(void *pte);
  1035. static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
  1036. struct spglist *free, struct rwlock **lockp);
  1037. static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
  1038. pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
  1039. static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
  1040. static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
  1041. struct spglist *free);
  1042. static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
  1043. pd_entry_t *pde, struct spglist *free,
  1044. struct rwlock **lockp);
  1045. static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
  1046. vm_page_t m, struct rwlock **lockp);
  1047. static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
  1048. pd_entry_t newpde);
  1049. static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde);
  1050. static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
  1051. struct rwlock **lockp);
  1052. static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va,
  1053. struct rwlock **lockp);
  1054. static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
  1055. struct rwlock **lockp);
  1056. static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
  1057. struct spglist *free);
  1058. static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
  1059. /********************/
  1060. /* Inline functions */
  1061. /********************/
  1062. /* Return a non-clipped PD index for a given VA */
  1063. static __inline vm_pindex_t
  1064. pmap_pde_pindex(vm_offset_t va)
  1065. {
  1066. return (va >> PDRSHIFT);
  1067. }
  1068. /* Return a pointer to the PML4 slot that corresponds to a VA */
  1069. static __inline pml4_entry_t *
  1070. pmap_pml4e(pmap_t pmap, vm_offset_t va)
  1071. {
  1072. return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
  1073. }
  1074. /* Return a pointer to the PDP slot that corresponds to a VA */
  1075. static __inline pdp_entry_t *
  1076. pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
  1077. {
  1078. pdp_entry_t *pdpe;
  1079. pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
  1080. return (&pdpe[pmap_pdpe_index(va)]);
  1081. }
  1082. /* Return a pointer to the PDP slot that corresponds to a VA */
  1083. static __inline pdp_entry_t *
  1084. pmap_pdpe(pmap_t pmap, vm_offset_t va)
  1085. {
  1086. pml4_entry_t *pml4e;
  1087. pt_entry_t PG_V;
  1088. PG_V = pmap_valid_bit(pmap);
  1089. pml4e = pmap_pml4e(pmap, va);
  1090. if ((*pml4e & PG_V) == 0)
  1091. return (NULL);
  1092. return (pmap_pml4e_to_pdpe(pml4e, va));
  1093. }
  1094. /* Return a pointer to the PD slot that corresponds to a VA */
  1095. static __inline pd_entry_t *
  1096. pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
  1097. {
  1098. pd_entry_t *pde;
  1099. KASSERT((*pdpe & PG_PS) == 0,
  1100. ("%s: pdpe %#lx is a leaf", __func__, *pdpe));
  1101. pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
  1102. return (&pde[pmap_pde_index(va)]);
  1103. }
  1104. /* Return a pointer to the PD slot that corresponds to a VA */
  1105. static __inline pd_entry_t *
  1106. pmap_pde(pmap_t pmap, vm_offset_t va)
  1107. {
  1108. pdp_entry_t *pdpe;
  1109. pt_entry_t PG_V;
  1110. PG_V = pmap_valid_bit(pmap);
  1111. pdpe = pmap_pdpe(pmap, va);
  1112. if (pdpe == NULL || (*pdpe & PG_V) == 0)
  1113. return (NULL);
  1114. return (pmap_pdpe_to_pde(pdpe, va));
  1115. }
  1116. /* Return a pointer to the PT slot that corresponds to a VA */
  1117. static __inline pt_entry_t *
  1118. pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
  1119. {
  1120. pt_entry_t *pte;
  1121. KASSERT((*pde & PG_PS) == 0,
  1122. ("%s: pde %#lx is a leaf", __func__, *pde));
  1123. pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
  1124. return (&pte[pmap_pte_index(va)]);
  1125. }
  1126. /* Return a pointer to the PT slot that corresponds to a VA */
  1127. static __inline pt_entry_t *
  1128. pmap_pte(pmap_t pmap, vm_offset_t va)
  1129. {
  1130. pd_entry_t *pde;
  1131. pt_entry_t PG_V;
  1132. PG_V = pmap_valid_bit(pmap);
  1133. pde = pmap_pde(pmap, va);
  1134. if (pde == NULL || (*pde & PG_V) == 0)
  1135. return (NULL);
  1136. if ((*pde & PG_PS) != 0) /* compat with i386 pmap_pte() */
  1137. return ((pt_entry_t *)pde);
  1138. return (pmap_pde_to_pte(pde, va));
  1139. }
  1140. static __inline void
  1141. pmap_resident_count_inc(pmap_t pmap, int count)
  1142. {
  1143. PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  1144. pmap->pm_stats.resident_count += count;
  1145. }
  1146. static __inline void
  1147. pmap_resident_count_dec(pmap_t pmap, int count)
  1148. {
  1149. PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  1150. KASSERT(pmap->pm_stats.resident_count >= count,
  1151. ("pmap %p resident count underflow %ld %d", pmap,
  1152. pmap->pm_stats.resident_count, count));
  1153. pmap->pm_stats.resident_count -= count;
  1154. }
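/*
 * vtopte() and vtopde() locate the PTE/PDE for a kernel virtual address by
 * indexing into the recursive page-table mapping (PTmap/PDmap) that
 * create_pagetables() installs in the PML4PML4I slot; the KASSERTs below
 * enforce that they are used only on kernel addresses.
 */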
  1155. PMAP_INLINE pt_entry_t *
  1156. vtopte(vm_offset_t va)
  1157. {
  1158. u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
  1159. KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va));
  1160. return (PTmap + ((va >> PAGE_SHIFT) & mask));
  1161. }
  1162. static __inline pd_entry_t *
  1163. vtopde(vm_offset_t va)
  1164. {
  1165. u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
  1166. KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va));
  1167. return (PDmap + ((va >> PDRSHIFT) & mask));
  1168. }
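/*
 * Bootstrap page allocator: hand out n zeroed, page-aligned physical pages
 * starting at *firstaddr and advance the cursor.  The returned value is a
 * physical address; it can be cast to a pointer for bzero() only because
 * the loader-provided page tables still map low physical memory at this
 * point in bootstrap.
 */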
  1169. static u_int64_t
  1170. allocpages(vm_paddr_t *firstaddr, int n)
  1171. {
  1172. u_int64_t ret;
  1173. ret = *firstaddr;
  1174. bzero((void *)ret, n * PAGE_SIZE);
  1175. *firstaddr += n * PAGE_SIZE;
  1176. return (ret);
  1177. }
  1178. CTASSERT(powerof2(NDMPML4E));
  1179. /* number of kernel PDP slots */
  1180. #define NKPDPE(ptpgs) howmany(ptpgs, NPDEPG)
  1181. static void
  1182. nkpt_init(vm_paddr_t addr)
  1183. {
  1184. int pt_pages;
  1185. #ifdef NKPT
  1186. pt_pages = NKPT;
  1187. #else
  1188. pt_pages = howmany(addr, 1 << PDRSHIFT);
  1189. pt_pages += NKPDPE(pt_pages);
  1190. /*
  1191. * Add some slop beyond the bare minimum required for bootstrapping
  1192. * the kernel.
  1193. *
  1194. * This is quite important when allocating KVA for kernel modules.
  1195. * The modules are required to be linked in the negative 2GB of
  1196. * the address space. If we run out of KVA in this region then
  1197. * pmap_growkernel() will need to allocate page table pages to map
  1198. * the entire 512GB of KVA space which is an unnecessary tax on
  1199. * physical memory.
  1200. *
  1201. * Secondly, device memory mapped as part of setting up the low-
  1202. * level console(s) is taken from KVA, starting at virtual_avail.
  1203. * This is because cninit() is called after pmap_bootstrap() but
  1204. * before vm_init() and pmap_init(). 20MB for a frame buffer is
  1205. * not uncommon.
  1206. */
  1207. pt_pages += 32; /* 64MB additional slop. */
  1208. #endif
  1209. nkpt = pt_pages;
  1210. }
  1211. /*
  1212. * Returns the proper write/execute permission for a physical page that is
  1213. * part of the initial boot allocations.
  1214. *
  1215. * If the page has kernel text, it is marked as read-only. If the page has
  1216. * kernel read-only data, it is marked as read-only/not-executable. If the
  1217. * page has only read-write data, it is marked as read-write/not-executable.
  1218. * If the page is below/above the kernel range, it is marked as read-write.
  1219. *
  1220. * This function operates on 2M pages, since we map the kernel space that
  1221. * way.
  1222. *
  1223. * Note that this doesn't currently provide any protection for modules.
  1224. */
  1225. static inline pt_entry_t
  1226. bootaddr_rwx(vm_paddr_t pa)
  1227. {
  1228. /*
  1229. * Everything in the same 2M page as the start of the kernel
  1230. * should be static. On the other hand, things in the same 2M
  1231. * page as the end of the kernel could be read-write/executable,
  1232. * as the kernel image is not guaranteed to end on a 2M boundary.
  1233. */
  1234. if (pa < trunc_2mpage(btext - KERNBASE) ||
  1235. pa >= trunc_2mpage(_end - KERNBASE))
  1236. return (X86_PG_RW);
  1237. /*
  1238. * The linker should ensure that the read-only and read-write
  1239. * portions don't share the same 2M page, so this shouldn't
  1240. * impact read-only data. However, in any case, any page with
  1241. * read-write data needs to be read-write.
  1242. */
  1243. if (pa >= trunc_2mpage(brwsection - KERNBASE))
  1244. return (X86_PG_RW | pg_nx);
  1245. /*
  1246. * Mark any 2M page containing kernel text as read-only. Mark
  1247. * other pages with read-only data as read-only and not executable.
  1248. * (It is likely a small portion of the read-only data section will
  1249. * be marked as read-only, but executable. This should be acceptable
  1250. * since the read-only protection will keep the data from changing.)
  1251. * Note that fixups to the .text section will still work until we
  1252. * set CR0.WP.
  1253. */
  1254. if (pa < round_2mpage(etext - KERNBASE))
  1255. return (0);
  1256. return (pg_nx);
  1257. }
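/*
 * Build the initial kernel page tables: the kernel PML4 (including the
 * recursive PML4PML4I slot backing PTmap/PDmap), the page tables that map
 * the kernel image and bootstrap data with 2MB pages and bootaddr_rwx()
 * permissions, and the direct map, which uses 1GB pages where the CPU
 * supports them and 2MB pages otherwise.  The kernel's own physical pages
 * are mapped in the direct map with 2MB pages so that tighter read-only or
 * no-execute permissions can be applied to them.
 */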
  1258. static void
  1259. create_pagetables(vm_paddr_t *firstaddr)
  1260. {
  1261. int i, j, ndm1g, nkpdpe, nkdmpde;
  1262. pd_entry_t *pd_p;
  1263. pdp_entry_t *pdp_p;
  1264. pml4_entry_t *p4_p;
  1265. uint64_t DMPDkernphys;
  1266. /* Allocate page table pages for the direct map */
  1267. ndmpdp = howmany(ptoa(Maxmem), NBPDP);
  1268. if (ndmpdp < 4) /* Minimum 4GB of dirmap */
  1269. ndmpdp = 4;
  1270. ndmpdpphys = howmany(ndmpdp, NPDPEPG);
  1271. if (ndmpdpphys > NDMPML4E) {
  1272. /*
  1273. * Each NDMPML4E allows 512 GB, so limit to that,
  1274. * and then readjust ndmpdp and ndmpdpphys.
  1275. */
  1276. printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512);
  1277. Maxmem = atop(NDMPML4E * NBPML4);
  1278. ndmpdpphys = NDMPML4E;
  1279. ndmpdp = NDMPML4E * NPDEPG;
  1280. }
  1281. DMPDPphys = allocpages(firstaddr, ndmpdpphys);
  1282. ndm1g = 0;
  1283. if ((amd_feature & AMDID_PAGE1GB) != 0) {
  1284. /*
  1285. * Calculate the number of 1G pages that will fully fit in
  1286. * Maxmem.
  1287. */
  1288. ndm1g = ptoa(Maxmem) >> PDPSHIFT;
  1289. /*
  1290. * Allocate 2M pages for the kernel. These will be used in
  1291. * place of the first one or more 1G pages from ndm1g.
  1292. */
  1293. nkdmpde = howmany((vm_offset_t)(brwsection - KERNBASE), NBPDP);
  1294. DMPDkernphys = allocpages(firstaddr, nkdmpde);
  1295. }
  1296. if (ndm1g < ndmpdp)
  1297. DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g);
  1298. dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
  1299. /* Allocate pages */
  1300. KPML4phys = allocpages(firstaddr, 1);
  1301. KPDPphys = allocpages(firstaddr, NKPML4E);
  1302. /*
  1303. * Allocate the initial number of kernel page table pages required to
  1304. * bootstrap. We defer this until after all memory-size dependent
  1305. * allocations are done (e.g. direct map), so that we don't have to
  1306. * build in too much slop in our estimate.
  1307. *
  1308. * Note that when NKPML4E > 1, we have an empty page underneath
  1309. * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed)
  1310. * pages. (pmap_enter requires a PD page to exist for each KPML4E.)
  1311. */
  1312. nkpt_init(*firstaddr);
  1313. nkpdpe = NKPDPE(nkpt);
  1314. KPTphys = allocpages(firstaddr, nkpt);
  1315. KPDphys = allocpages(firstaddr, nkpdpe);
  1316. /*
  1317. * Connect the zero-filled PT pages to their PD entries. This
  1318. * implicitly maps the PT pages at their correct locations within
  1319. * the PTmap.
  1320. */
  1321. pd_p = (pd_entry_t *)KPDphys;
  1322. for (i = 0; i < nkpt; i++)
  1323. pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V;
  1324. /*
  1325. * Map from physical address zero to the end of loader preallocated
  1326. * memory using 2MB pages. This replaces some of the PD entries
  1327. * created above.
  1328. */
  1329. for (i = 0; (i << PDRSHIFT) < KERNend; i++)
  1330. /* Preset PG_M and PG_A because demotion expects it. */
  1331. pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g |
  1332. X86_PG_M | X86_PG_A | bootaddr_rwx(i << PDRSHIFT);
  1333. /*
  1334. * Because we map the physical blocks in 2M pages, adjust firstaddr
  1335. * to record the physical blocks we've actually mapped into kernel
  1336. * virtual address space.
  1337. */
  1338. if (*firstaddr < round_2mpage(KERNend))
  1339. *firstaddr = round_2mpage(KERNend);
  1340. /* And connect up the PD to the PDP (leaving room for L4 pages) */
  1341. pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE));
  1342. for (i = 0; i < nkpdpe; i++)
  1343. pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V;
  1344. /*
  1345. * Now, set up the direct map region using 2MB and/or 1GB pages. If
  1346. * the end of physical memory is not aligned to a 1GB page boundary,
  1347. * then the residual physical memory is mapped with 2MB pages. Later,
  1348. * if pmap_mapdev{_attr}() uses the direct map for non-write-back
  1349. * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings
  1350. * that are partially used.
  1351. */
  1352. pd_p = (pd_entry_t *)DMPDphys;
  1353. for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) {
  1354. pd_p[j] = (vm_paddr_t)i << PDRSHIFT;
  1355. /* Preset PG_M and PG_A because demotion expects it. */
  1356. pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g |
  1357. X86_PG_M | X86_PG_A | pg_nx;
  1358. }
  1359. pdp_p = (pdp_entry_t *)DMPDPphys;
  1360. for (i = 0; i < ndm1g; i++) {
  1361. pdp_p[i] = (vm_paddr_t)i << PDPSHIFT;
  1362. /* Preset PG_M and PG_A because demotion expects it. */
  1363. pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g |
  1364. X86_PG_M | X86_PG_A | pg_nx;
  1365. }
  1366. for (j = 0; i < ndmpdp; i++, j++) {
  1367. pdp_p[i] = DMPDphys + ptoa(j);
  1368. pdp_p[i] |= X86_PG_RW | X86_PG_V | pg_nx;
  1369. }
  1370. /*
  1371. * Instead of using a 1G page for the memory containing the kernel,
  1372. * use 2M pages with read-only and no-execute permissions. (If using 1G
  1373. * pages, this will partially overwrite the PDPEs above.)
  1374. */
  1375. if (ndm1g) {
  1376. pd_p = (pd_entry_t *)DMPDkernphys;
  1377. for (i = 0; i < (NPDEPG * nkdmpde); i++)
  1378. pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g |
  1379. X86_PG_M | X86_PG_A | pg_nx |
  1380. bootaddr_rwx(i << PDRSHIFT);
  1381. for (i = 0; i < nkdmpde; i++)
  1382. pdp_p[i] = (DMPDkernphys + ptoa(i)) | X86_PG_RW |
  1383. X86_PG_V | pg_nx;
  1384. }
  1385. /* And recursively map PML4 to itself in order to get PTmap */
  1386. p4_p = (pml4_entry_t *)KPML4phys;
  1387. p4_p[PML4PML4I] = KPML4phys;
  1388. p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | pg_nx;
  1389. /* Connect the Direct Map slot(s) up to the PML4. */
  1390. for (i = 0; i < ndmpdpphys; i++) {
  1391. p4_p[DMPML4I + i] = DMPDPphys + ptoa(i);
  1392. p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx;
  1393. }
  1394. /* Connect the KVA slots up to the PML4 */
  1395. for (i = 0; i < NKPML4E; i++) {
  1396. p4_p[KPML4BASE + i] = KPDPphys + ptoa(i);
  1397. p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V;
  1398. }
  1399. }
  1400. /*
  1401. * Bootstrap the system enough to run with virtual memory.
  1402. *
  1403. * On amd64 this is called after mapping has already been enabled
  1404. * and just syncs the pmap module with what has already been done.
  1405. * [We can't call it easily with mapping off since the kernel is not
  1406. * mapped with PA == VA, hence we would have to relocate every address
  1407. * from the linked base (virtual) address "KERNBASE" to the actual
  1408. * (physical) address starting relative to 0]
  1409. */
  1410. void
  1411. pmap_bootstrap(vm_paddr_t *firstaddr)
  1412. {
  1413. vm_offset_t va;
  1414. pt_entry_t *pte, *pcpu_pte;
  1415. uint64_t cr4, pcpu_phys;
  1416. u_long res;
  1417. int i;
  1418. KERNend = *firstaddr;
  1419. res = atop(KERNend - (vm_paddr_t)kernphys);
  1420. if (!pti)
  1421. pg_g = X86_PG_G;
  1422. /*
  1423. * Create an initial set of page tables to run the kernel in.
  1424. */
  1425. create_pagetables(firstaddr);
  1426. pcpu_phys = allocpages(firstaddr, MAXCPU);
  1427. /*
  1428. * Add a physical memory segment (vm_phys_seg) corresponding to the
  1429. * preallocated kernel page table pages so that vm_page structures
  1430. * representing these pages will be created. The vm_page structures
  1431. * are required for promotion of the corresponding kernel virtual
  1432. * addresses to superpage mappings.
  1433. */
  1434. vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt));
  1435. /*
  1436. * Account for the virtual addresses mapped by create_pagetables().
  1437. */
  1438. virtual_avail = (vm_offset_t)KERNBASE + round_2mpage(KERNend);
  1439. virtual_end = VM_MAX_KERNEL_ADDRESS;
  1440. /*
  1441. * Enable PG_G global pages, then switch to the kernel page
  1442. * table from the bootstrap page table. After the switch, it
  1443. * is possible to enable SMEP and SMAP since PG_U bits are
  1444. * correct now.
  1445. */
  1446. cr4 = rcr4();
  1447. cr4 |= CR4_PGE;
  1448. load_cr4(cr4);
  1449. load_cr3(KPML4phys);
  1450. if (cpu_stdext_feature & CPUID_STDEXT_SMEP)
  1451. cr4 |= CR4_SMEP;
  1452. if (cpu_stdext_feature & CPUID_STDEXT_SMAP)
  1453. cr4 |= CR4_SMAP;
  1454. load_cr4(cr4);
  1455. /*
  1456. * Initialize the kernel pmap (which is statically allocated).
  1457. * Count bootstrap data as being resident in case any of this data is
  1458. * later unmapped (using pmap_remove()) and freed.
  1459. */
  1460. PMAP_LOCK_INIT(kernel_pmap);
  1461. kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys);
  1462. kernel_pmap->pm_cr3 = KPML4phys;
  1463. kernel_pmap->pm_ucr3 = PMAP_NO_CR3;
  1464. CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */
  1465. TAILQ_INIT(&kernel_pmap->pm_pvchunk);
  1466. kernel_pmap->pm_stats.resident_count = res;
  1467. kernel_pmap->pm_flags = pmap_flags;
  1468. /*
  1469. * Initialize the TLB invalidations generation number lock.
  1470. */
  1471. mtx_init(&invl_gen_mtx, "invlgn", NULL, MTX_DEF);
  1472. /*
  1473. * Reserve some special page table entries/VA space for temporary
  1474. * mapping of pages.
  1475. */
  1476. #define SYSMAP(c, p, v, n) \
  1477. v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
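/*
 * For reference, the first SYSMAP() invocation below expands to:
 *
 *	crashdumpmap = (caddr_t)va;
 *	va += ((MAXDUMPPGS) * PAGE_SIZE);
 *	CMAP1 = pte;
 *	pte += (MAXDUMPPGS);
 *
 * i.e., it carves n pages of KVA out of 'va', hands back a pointer to the
 * first PTE that maps them, and advances both cursors.
 */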
  1478. va = virtual_avail;
  1479. pte = vtopte(va);
  1480. /*
  1481. * Crashdump maps. The first page is reused as CMAP1 for the
  1482. * memory test.
  1483. */
  1484. SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS)
  1485. CADDR1 = crashdumpmap;
  1486. SYSMAP(struct pcpu *, pcpu_pte, __pcpu, MAXCPU);
  1487. virtual_avail = va;
  1488. for (i = 0; i < MAXCPU; i++) {
  1489. pcpu_pte[i] = (pcpu_phys + ptoa(i)) | X86_PG_V | X86_PG_RW |
  1490. pg_g | pg_nx | X86_PG_M | X86_PG_A;
  1491. }
  1492. STAILQ_INIT(&cpuhead);
  1493. wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]);
  1494. pcpu_init(&__pcpu[0], 0, sizeof(struct pcpu));
  1495. amd64_bsp_pcpu_init1(&__pcpu[0]);
  1496. amd64_bsp_ist_init(&__pcpu[0]);
  1497. __pcpu[0].pc_dynamic = temp_bsp_pcpu.pc_dynamic;
  1498. __pcpu[0].pc_acpi_id = temp_bsp_pcpu.pc_acpi_id;
  1499. /*
  1500. * Initialize the PAT MSR.
  1501. * pmap_init_pat() clears and sets CR4_PGE, which, as a
  1502. * side-effect, invalidates stale PG_G TLB entries that might
  1503. * have been created in our pre-boot environment.
  1504. */
  1505. pmap_init_pat();
  1506. /* Initialize TLB Context Id. */
  1507. if (pmap_pcid_enabled) {
  1508. for (i = 0; i < MAXCPU; i++) {
  1509. kernel_pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN;
  1510. kernel_pmap->pm_pcids[i].pm_gen = 1;
  1511. }
  1512. /*
  1513. * PMAP_PCID_KERN + 1 is used for initialization of
1514. * proc0 pmap. The pmap's pcid state might be used by
  1515. * EFIRT entry before first context switch, so it
  1516. * needs to be valid.
  1517. */
  1518. PCPU_SET(pcid_next, PMAP_PCID_KERN + 2);
  1519. PCPU_SET(pcid_gen, 1);
  1520. /*
  1521. * pcpu area for APs is zeroed during AP startup.
  1522. * pc_pcid_next and pc_pcid_gen are initialized by AP
  1523. * during pcpu setup.
  1524. */
  1525. load_cr4(rcr4() | CR4_PCIDE);
  1526. }
  1527. }
  1528. /*
  1529. * Setup the PAT MSR.
  1530. */
  1531. void
  1532. pmap_init_pat(void)
  1533. {
  1534. uint64_t pat_msr;
  1535. u_long cr0, cr4;
  1536. int i;
  1537. /* Bail if this CPU doesn't implement PAT. */
  1538. if ((cpu_feature & CPUID_PAT) == 0)
  1539. panic("no PAT??");
  1540. /* Set default PAT index table. */
  1541. for (i = 0; i < PAT_INDEX_SIZE; i++)
  1542. pat_index[i] = -1;
  1543. pat_index[PAT_WRITE_BACK] = 0;
  1544. pat_index[PAT_WRITE_THROUGH] = 1;
  1545. pat_index[PAT_UNCACHEABLE] = 3;
  1546. pat_index[PAT_WRITE_COMBINING] = 6;
  1547. pat_index[PAT_WRITE_PROTECTED] = 5;
  1548. pat_index[PAT_UNCACHED] = 2;
  1549. /*
  1550. * Initialize default PAT entries.
  1551. * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
  1552. * Program 5 and 6 as WP and WC.
  1553. *
  1554. * Leave 4 and 7 as WB and UC. Note that a recursive page table
  1555. * mapping for a 2M page uses a PAT value with the bit 3 set due
  1556. * to its overload with PG_PS.
  1557. */
  1558. pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
  1559. PAT_VALUE(1, PAT_WRITE_THROUGH) |
  1560. PAT_VALUE(2, PAT_UNCACHED) |
  1561. PAT_VALUE(3, PAT_UNCACHEABLE) |
  1562. PAT_VALUE(4, PAT_WRITE_BACK) |
  1563. PAT_VALUE(5, PAT_WRITE_PROTECTED) |
  1564. PAT_VALUE(6, PAT_WRITE_COMBINING) |
  1565. PAT_VALUE(7, PAT_UNCACHEABLE);
  1566. /* Disable PGE. */
  1567. cr4 = rcr4();
  1568. load_cr4(cr4 & ~CR4_PGE);
  1569. /* Disable caches (CD = 1, NW = 0). */
  1570. cr0 = rcr0();
  1571. load_cr0((cr0 & ~CR0_NW) | CR0_CD);
  1572. /* Flushes caches and TLBs. */
  1573. wbinvd();
  1574. invltlb();
  1575. /* Update PAT and index table. */
  1576. wrmsr(MSR_PAT, pat_msr);
  1577. /* Flush caches and TLBs again. */
  1578. wbinvd();
  1579. invltlb();
  1580. /* Restore caches and PGE. */
  1581. load_cr0(cr0);
  1582. load_cr4(cr4);
  1583. }
  1584. /*
  1585. * Initialize a vm_page's machine-dependent fields.
  1586. */
  1587. void
  1588. pmap_page_init(vm_page_t m)
  1589. {
  1590. TAILQ_INIT(&m->md.pv_list);
  1591. m->md.pat_mode = PAT_WRITE_BACK;
  1592. }
  1593. static int pmap_allow_2m_x_ept;
  1594. SYSCTL_INT(_vm_pmap, OID_AUTO, allow_2m_x_ept, CTLFLAG_RWTUN | CTLFLAG_NOFETCH,
  1595. &pmap_allow_2m_x_ept, 0,
  1596. "Allow executable superpage mappings in EPT");
  1597. void
  1598. pmap_allow_2m_x_ept_recalculate(void)
  1599. {
  1600. /*
  1601. * SKL002, SKL012S. Since the EPT format is only used by
  1602. * Intel CPUs, the vendor check is merely a formality.
  1603. */
  1604. if (!(cpu_vendor_id != CPU_VENDOR_INTEL ||
  1605. (cpu_ia32_arch_caps & IA32_ARCH_CAP_IF_PSCHANGE_MC_NO) != 0 ||
  1606. (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
  1607. (CPUID_TO_MODEL(cpu_id) == 0x26 || /* Atoms */
  1608. CPUID_TO_MODEL(cpu_id) == 0x27 ||
  1609. CPUID_TO_MODEL(cpu_id) == 0x35 ||
  1610. CPUID_TO_MODEL(cpu_id) == 0x36 ||
  1611. CPUID_TO_MODEL(cpu_id) == 0x37 ||
  1612. CPUID_TO_MODEL(cpu_id) == 0x86 ||
  1613. CPUID_TO_MODEL(cpu_id) == 0x1c ||
  1614. CPUID_TO_MODEL(cpu_id) == 0x4a ||
  1615. CPUID_TO_MODEL(cpu_id) == 0x4c ||
  1616. CPUID_TO_MODEL(cpu_id) == 0x4d ||
  1617. CPUID_TO_MODEL(cpu_id) == 0x5a ||
  1618. CPUID_TO_MODEL(cpu_id) == 0x5c ||
  1619. CPUID_TO_MODEL(cpu_id) == 0x5d ||
  1620. CPUID_TO_MODEL(cpu_id) == 0x5f ||
  1621. CPUID_TO_MODEL(cpu_id) == 0x6e ||
  1622. CPUID_TO_MODEL(cpu_id) == 0x7a ||
  1623. CPUID_TO_MODEL(cpu_id) == 0x57 || /* Knights */
  1624. CPUID_TO_MODEL(cpu_id) == 0x85))))
  1625. pmap_allow_2m_x_ept = 1;
  1626. TUNABLE_INT_FETCH("hw.allow_2m_x_ept", &pmap_allow_2m_x_ept);
  1627. }
  1628. static bool
  1629. pmap_allow_2m_x_page(pmap_t pmap, bool executable)
  1630. {
  1631. return (pmap->pm_type != PT_EPT || !executable ||
  1632. !pmap_allow_2m_x_ept);
  1633. }
  1634. /*
  1635. * Initialize the pmap module.
  1636. * Called by vm_init, to initialize any structures that the pmap
  1637. * system needs to map virtual memory.
  1638. */
  1639. void
  1640. pmap_init(void)
  1641. {
  1642. struct pmap_preinit_mapping *ppim;
  1643. vm_page_t m, mpte;
  1644. vm_size_t s;
  1645. int error, i, pv_npg, ret, skz63;
  1646. /* L1TF, reserve page @0 unconditionally */
  1647. vm_page_blacklist_add(0, bootverbose);
  1648. /* Detect bare-metal Skylake Server and Skylake-X. */
  1649. if (vm_guest == VM_GUEST_NO && cpu_vendor_id == CPU_VENDOR_INTEL &&
  1650. CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) == 0x55) {
  1651. /*
  1652. * Skylake-X errata SKZ63. Processor May Hang When
  1653. * Executing Code In an HLE Transaction Region between
  1654. * 40000000H and 403FFFFFH.
  1655. *
  1656. * Mark the pages in the range as preallocated. It
  1657. * seems to be impossible to distinguish between
  1658. * Skylake Server and Skylake X.
  1659. */
  1660. skz63 = 1;
  1661. TUNABLE_INT_FETCH("hw.skz63_enable", &skz63);
  1662. if (skz63 != 0) {
  1663. if (bootverbose)
  1664. printf("SKZ63: skipping 4M RAM starting "
  1665. "at physical 1G\n");
  1666. for (i = 0; i < atop(0x400000); i++) {
  1667. ret = vm_page_blacklist_add(0x40000000 +
  1668. ptoa(i), FALSE);
  1669. if (!ret && bootverbose)
  1670. printf("page at %#lx already used\n",
  1671. 0x40000000 + ptoa(i));
  1672. }
  1673. }
  1674. }
  1675. /* IFU */
  1676. pmap_allow_2m_x_ept_recalculate();
  1677. /*
  1678. * Initialize the vm page array entries for the kernel pmap's
  1679. * page table pages.
  1680. */
  1681. PMAP_LOCK(kernel_pmap);
  1682. for (i = 0; i < nkpt; i++) {
  1683. mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
  1684. KASSERT(mpte >= vm_page_array &&
  1685. mpte < &vm_page_array[vm_page_array_size],
  1686. ("pmap_init: page table page is out of range"));
  1687. mpte->pindex = pmap_pde_pindex(KERNBASE) + i;
  1688. mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
  1689. mpte->wire_count = 1;
  1690. /*
  1691. * Collect the page table pages that were replaced by a 2MB
  1692. * page in create_pagetables(). They are zero filled.
  1693. */
  1694. if (i << PDRSHIFT < KERNend &&
  1695. pmap_insert_pt_page(kernel_pmap, mpte, false))
  1696. panic("pmap_init: pmap_insert_pt_page failed");
  1697. }
  1698. PMAP_UNLOCK(kernel_pmap);
  1699. vm_wire_add(nkpt);
  1700. /*
  1701. * If the kernel is running on a virtual machine, then it must assume
  1702. * that MCA is enabled by the hypervisor. Moreover, the kernel must
  1703. * be prepared for the hypervisor changing the vendor and family that
  1704. * are reported by CPUID. Consequently, the workaround for AMD Family
  1705. * 10h Erratum 383 is enabled if the processor's feature set does not
  1706. * include at least one feature that is only supported by older Intel
  1707. * or newer AMD processors.
  1708. */
  1709. if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 &&
  1710. (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI |
  1711. CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP |
  1712. AMDID2_FMA4)) == 0)
  1713. workaround_erratum383 = 1;
  1714. /*
  1715. * Are large page mappings enabled?
  1716. */
  1717. TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
  1718. if (pg_ps_enabled) {
  1719. KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
  1720. ("pmap_init: can't assign to pagesizes[1]"));
  1721. pagesizes[1] = NBPDR;
  1722. }
  1723. /*
  1724. * Initialize the pv chunk list mutex.
  1725. */
  1726. mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
  1727. /*
  1728. * Initialize the pool of pv list locks.
  1729. */
  1730. for (i = 0; i < NPV_LIST_LOCKS; i++)
  1731. rw_init(&pv_list_locks[i], "pmap pv list");
  1732. /*
  1733. * Calculate the size of the pv head table for superpages.
  1734. */
  1735. pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR);
  1736. /*
  1737. * Allocate memory for the pv head table for superpages.
  1738. */
  1739. s = (vm_size_t)(pv_npg * sizeof(struct md_page));
  1740. s = round_page(s);
  1741. pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO);
  1742. for (i = 0; i < pv_npg; i++)
  1743. TAILQ_INIT(&pv_table[i].pv_list);
  1744. TAILQ_INIT(&pv_dummy.pv_list);
  1745. pmap_initialized = 1;
  1746. for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
  1747. ppim = pmap_preinit_mapping + i;
  1748. if (ppim->va == 0)
  1749. continue;
  1750. /* Make the direct map consistent */
  1751. if (ppim->pa < dmaplimit && ppim->pa + ppim->sz <= dmaplimit) {
  1752. (void)pmap_change_attr(PHYS_TO_DMAP(ppim->pa),
  1753. ppim->sz, ppim->mode);
  1754. }
  1755. if (!bootverbose)
  1756. continue;
  1757. printf("PPIM %u: PA=%#lx, VA=%#lx, size=%#lx, mode=%#x\n", i,
  1758. ppim->pa, ppim->va, ppim->sz, ppim->mode);
  1759. }
  1760. mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN);
  1761. error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK,
  1762. (vmem_addr_t *)&qframe);
  1763. if (error != 0)
  1764. panic("qframe allocation failed");
  1765. lm_ents = 8;
  1766. TUNABLE_INT_FETCH("vm.pmap.large_map_pml4_entries", &lm_ents);
  1767. if (lm_ents > LMEPML4I - LMSPML4I + 1)
  1768. lm_ents = LMEPML4I - LMSPML4I + 1;
  1769. if (bootverbose)
1770. printf("pmap: large map %u PML4 slots (%lu GB)\n",
  1771. lm_ents, (u_long)lm_ents * (NBPML4 / 1024 / 1024 / 1024));
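/*
 * For scale: each large-map PML4 slot spans NBPML4 bytes of KVA, i.e.
 * 512 GB, so the default lm_ents of 8 makes the message above report
 * 4096 GB.
 */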
  1772. if (lm_ents != 0) {
  1773. large_vmem = vmem_create("large", LARGEMAP_MIN_ADDRESS,
  1774. (vmem_size_t)lm_ents * NBPML4, PAGE_SIZE, 0, M_WAITOK);
  1775. if (large_vmem == NULL) {
  1776. printf("pmap: cannot create large map\n");
  1777. lm_ents = 0;
  1778. }
  1779. for (i = 0; i < lm_ents; i++) {
  1780. m = pmap_large_map_getptp_unlocked();
  1781. kernel_pmap->pm_pml4[LMSPML4I + i] = X86_PG_V |
  1782. X86_PG_RW | X86_PG_A | X86_PG_M | pg_nx |
  1783. VM_PAGE_TO_PHYS(m);
  1784. }
  1785. }
  1786. }
  1787. static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
  1788. "2MB page mapping counters");
  1789. static u_long pmap_pde_demotions;
  1790. SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
  1791. &pmap_pde_demotions, 0, "2MB page demotions");
  1792. static u_long pmap_pde_mappings;
  1793. SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
  1794. &pmap_pde_mappings, 0, "2MB page mappings");
  1795. static u_long pmap_pde_p_failures;
  1796. SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
  1797. &pmap_pde_p_failures, 0, "2MB page promotion failures");
  1798. static u_long pmap_pde_promotions;
  1799. SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
  1800. &pmap_pde_promotions, 0, "2MB page promotions");
  1801. static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD, 0,
  1802. "1GB page mapping counters");
  1803. static u_long pmap_pdpe_demotions;
  1804. SYSCTL_ULONG(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD,
  1805. &pmap_pdpe_demotions, 0, "1GB page demotions");
  1806. /***************************************************
  1807. * Low level helper routines.....
  1808. ***************************************************/
  1809. static pt_entry_t
  1810. pmap_swap_pat(pmap_t pmap, pt_entry_t entry)
  1811. {
  1812. int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT;
  1813. switch (pmap->pm_type) {
  1814. case PT_X86:
  1815. case PT_RVI:
  1816. /* Verify that both PAT bits are not set at the same time */
  1817. KASSERT((entry & x86_pat_bits) != x86_pat_bits,
  1818. ("Invalid PAT bits in entry %#lx", entry));
  1819. /* Swap the PAT bits if one of them is set */
  1820. if ((entry & x86_pat_bits) != 0)
  1821. entry ^= x86_pat_bits;
  1822. break;
  1823. case PT_EPT:
  1824. /*
  1825. * Nothing to do - the memory attributes are represented
  1826. * the same way for regular pages and superpages.
  1827. */
  1828. break;
  1829. default:
1830. panic("pmap_swap_pat: bad pm_type %d", pmap->pm_type);
  1831. }
  1832. return (entry);
  1833. }
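/*
 * Background for pmap_swap_pat(): in a 4 KB PTE the PAT selector is
 * bit 7 (X86_PG_PTE_PAT), but in a 2 MB PDE bit 7 is PG_PS, so the
 * selector lives at bit 12 (X86_PG_PDE_PAT) instead.  An entry that has
 * exactly one of the two bits set is XORed with both, which clears the
 * old position and sets the new one in a single step; the KASSERT above
 * rules out the ambiguous case of both bits being set.
 */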
  1834. boolean_t
  1835. pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode)
  1836. {
  1837. return (mode >= 0 && mode < PAT_INDEX_SIZE &&
  1838. pat_index[(int)mode] >= 0);
  1839. }
  1840. /*
  1841. * Determine the appropriate bits to set in a PTE or PDE for a specified
  1842. * caching mode.
  1843. */
  1844. int
  1845. pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde)
  1846. {
  1847. int cache_bits, pat_flag, pat_idx;
  1848. if (!pmap_is_valid_memattr(pmap, mode))
  1849. panic("Unknown caching mode %d\n", mode);
  1850. switch (pmap->pm_type) {
  1851. case PT_X86:
  1852. case PT_RVI:
  1853. /* The PAT bit is different for PTE's and PDE's. */
  1854. pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT;
  1855. /* Map the caching mode to a PAT index. */
  1856. pat_idx = pat_index[mode];
  1857. /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
  1858. cache_bits = 0;
  1859. if (pat_idx & 0x4)
  1860. cache_bits |= pat_flag;
  1861. if (pat_idx & 0x2)
  1862. cache_bits |= PG_NC_PCD;
  1863. if (pat_idx & 0x1)
  1864. cache_bits |= PG_NC_PWT;
  1865. break;
  1866. case PT_EPT:
  1867. cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode);
  1868. break;
  1869. default:
  1870. panic("unsupported pmap type %d", pmap->pm_type);
  1871. }
  1872. return (cache_bits);
  1873. }
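/*
 * Worked example for pmap_cache_bits(): with the PAT layout programmed in
 * pmap_init_pat(), PAT_WRITE_COMBINING maps to pat_index[] value 6
 * (binary 110), so a 4 KB PTE gets X86_PG_PTE_PAT | PG_NC_PCD and a
 * 2 MB PDE gets X86_PG_PDE_PAT | PG_NC_PCD; PG_NC_PWT stays clear because
 * bit 0 of the index is 0.
 */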
  1874. static int
  1875. pmap_cache_mask(pmap_t pmap, boolean_t is_pde)
  1876. {
  1877. int mask;
  1878. switch (pmap->pm_type) {
  1879. case PT_X86:
  1880. case PT_RVI:
  1881. mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE;
  1882. break;
  1883. case PT_EPT:
  1884. mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7);
  1885. break;
  1886. default:
  1887. panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type);
  1888. }
  1889. return (mask);
  1890. }
  1891. #ifndef PAX_HARDENING
  1892. static int
  1893. pmap_pat_index(pmap_t pmap, pt_entry_t pte, bool is_pde)
  1894. {
  1895. int pat_flag, pat_idx;
  1896. pat_idx = 0;
  1897. switch (pmap->pm_type) {
  1898. case PT_X86:
  1899. case PT_RVI:
  1900. /* The PAT bit is different for PTE's and PDE's. */
  1901. pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT;
  1902. if ((pte & pat_flag) != 0)
  1903. pat_idx |= 0x4;
  1904. if ((pte & PG_NC_PCD) != 0)
  1905. pat_idx |= 0x2;
  1906. if ((pte & PG_NC_PWT) != 0)
  1907. pat_idx |= 0x1;
  1908. break;
  1909. case PT_EPT:
  1910. if ((pte & EPT_PG_IGNORE_PAT) != 0)
  1911. panic("EPT PTE %#lx has no PAT memory type", pte);
  1912. pat_idx = (pte & EPT_PG_MEMORY_TYPE(0x7)) >> 3;
  1913. break;
  1914. }
  1915. /* See pmap_init_pat(). */
  1916. if (pat_idx == 4)
  1917. pat_idx = 0;
  1918. if (pat_idx == 7)
  1919. pat_idx = 3;
  1920. return (pat_idx);
  1921. }
  1922. #endif /* !PAX_HARDENING */
  1923. bool
  1924. pmap_ps_enabled(pmap_t pmap)
  1925. {
  1926. return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0);
  1927. }
  1928. static void
  1929. pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde)
  1930. {
  1931. switch (pmap->pm_type) {
  1932. case PT_X86:
  1933. break;
  1934. case PT_RVI:
  1935. case PT_EPT:
  1936. /*
  1937. * XXX
  1938. * This is a little bogus since the generation number is
  1939. * supposed to be bumped up when a region of the address
  1940. * space is invalidated in the page tables.
  1941. *
  1942. * In this case the old PDE entry is valid but yet we want
  1943. * to make sure that any mappings using the old entry are
  1944. * invalidated in the TLB.
  1945. *
  1946. * The reason this works as expected is because we rendezvous
  1947. * "all" host cpus and force any vcpu context to exit as a
  1948. * side-effect.
  1949. */
  1950. atomic_add_acq_long(&pmap->pm_eptgen, 1);
  1951. break;
  1952. default:
  1953. panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type);
  1954. }
  1955. pde_store(pde, newpde);
  1956. }
  1957. /*
  1958. * After changing the page size for the specified virtual address in the page
  1959. * table, flush the corresponding entries from the processor's TLB. Only the
  1960. * calling processor's TLB is affected.
  1961. *
  1962. * The calling thread must be pinned to a processor.
  1963. */
  1964. static void
  1965. pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde)
  1966. {
  1967. pt_entry_t PG_G;
  1968. if (pmap_type_guest(pmap))
  1969. return;
  1970. KASSERT(pmap->pm_type == PT_X86,
  1971. ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type));
  1972. PG_G = pmap_global_bit(pmap);
  1973. if ((newpde & PG_PS) == 0)
  1974. /* Demotion: flush a specific 2MB page mapping. */
  1975. invlpg(va);
  1976. else if ((newpde & PG_G) == 0)
  1977. /*
  1978. * Promotion: flush every 4KB page mapping from the TLB
  1979. * because there are too many to flush individually.
  1980. */
  1981. invltlb();
  1982. else {
  1983. /*
  1984. * Promotion: flush every 4KB page mapping from the TLB,
  1985. * including any global (PG_G) mappings.
  1986. */
  1987. invltlb_glob();
  1988. }
  1989. }
  1990. #ifdef SMP
  1991. /*
  1992. * For SMP, these functions have to use the IPI mechanism for coherence.
  1993. *
  1994. * N.B.: Before calling any of the following TLB invalidation functions,
  1995. * the calling processor must ensure that all stores updating a non-
  1996. * kernel page table are globally performed. Otherwise, another
  1997. * processor could cache an old, pre-update entry without being
  1998. * invalidated. This can happen one of two ways: (1) The pmap becomes
  1999. * active on another processor after its pm_active field is checked by
  2000. * one of the following functions but before a store updating the page
  2001. * table is globally performed. (2) The pmap becomes active on another
  2002. * processor before its pm_active field is checked but due to
2003. * speculative loads one of the following functions still reads the
  2004. * pmap as inactive on the other processor.
  2005. *
  2006. * The kernel page table is exempt because its pm_active field is
  2007. * immutable. The kernel page table is always active on every
  2008. * processor.
  2009. */
  2010. /*
  2011. * Interrupt the cpus that are executing in the guest context.
  2012. * This will force the vcpu to exit and the cached EPT mappings
  2013. * will be invalidated by the host before the next vmresume.
  2014. */
  2015. static __inline void
  2016. pmap_invalidate_ept(pmap_t pmap)
  2017. {
  2018. int ipinum;
  2019. sched_pin();
  2020. KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
  2021. ("pmap_invalidate_ept: absurd pm_active"));
  2022. /*
  2023. * The TLB mappings associated with a vcpu context are not
  2024. * flushed each time a different vcpu is chosen to execute.
  2025. *
  2026. * This is in contrast with a process's vtop mappings that
  2027. * are flushed from the TLB on each context switch.
  2028. *
  2029. * Therefore we need to do more than just a TLB shootdown on
  2030. * the active cpus in 'pmap->pm_active'. To do this we keep
  2031. * track of the number of invalidations performed on this pmap.
  2032. *
  2033. * Each vcpu keeps a cache of this counter and compares it
  2034. * just before a vmresume. If the counter is out-of-date an
  2035. * invept will be done to flush stale mappings from the TLB.
  2036. */
  2037. atomic_add_acq_long(&pmap->pm_eptgen, 1);
  2038. /*
  2039. * Force the vcpu to exit and trap back into the hypervisor.
  2040. */
  2041. ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK;
  2042. ipi_selected(pmap->pm_active, ipinum);
  2043. sched_unpin();
  2044. }
  2045. static cpuset_t
  2046. pmap_invalidate_cpu_mask(pmap_t pmap)
  2047. {
  2048. return (pmap == kernel_pmap ? all_cpus : pmap->pm_active);
  2049. }
  2050. static inline void
  2051. pmap_invalidate_page_pcid(pmap_t pmap, vm_offset_t va,
  2052. const bool invpcid_works1)
  2053. {
  2054. struct invpcid_descr d;
  2055. uint64_t kcr3, ucr3;
  2056. uint32_t pcid;
  2057. u_int cpuid, i;
  2058. cpuid = PCPU_GET(cpuid);
  2059. if (pmap == PCPU_GET(curpmap)) {
  2060. if (pmap->pm_ucr3 != PMAP_NO_CR3) {
  2061. /*
  2062. * Because pm_pcid is recalculated on a
  2063. * context switch, we must disable switching.
  2064. * Otherwise, we might use a stale value
  2065. * below.
  2066. */
  2067. critical_enter();
  2068. pcid = pmap->pm_pcids[cpuid].pm_pcid;
  2069. if (invpcid_works1) {
  2070. d.pcid = pcid | PMAP_PCID_USER_PT;
  2071. d.pad = 0;
  2072. d.addr = va;
  2073. invpcid(&d, INVPCID_ADDR);
  2074. } else {
  2075. kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
  2076. ucr3 = pmap->pm_ucr3 | pcid |
  2077. PMAP_PCID_USER_PT | CR3_PCID_SAVE;
  2078. pmap_pti_pcid_invlpg(ucr3, kcr3, va);
  2079. }
  2080. critical_exit();
  2081. }
  2082. } else
  2083. pmap->pm_pcids[cpuid].pm_gen = 0;
  2084. CPU_FOREACH(i) {
  2085. if (cpuid != i)
  2086. pmap->pm_pcids[i].pm_gen = 0;
  2087. }
  2088. /*
  2089. * The fence is between stores to pm_gen and the read of the
  2090. * pm_active mask. We need to ensure that it is impossible
  2091. * for us to miss the bit update in pm_active and
  2092. * simultaneously observe a non-zero pm_gen in
  2093. * pmap_activate_sw(), otherwise TLB update is missed.
  2094. * Without the fence, IA32 allows such an outcome. Note that
  2095. * pm_active is updated by a locked operation, which provides
  2096. * the reciprocal fence.
  2097. */
  2098. atomic_thread_fence_seq_cst();
  2099. }
  2100. static void
  2101. pmap_invalidate_page_pcid_invpcid(pmap_t pmap, vm_offset_t va)
  2102. {
  2103. pmap_invalidate_page_pcid(pmap, va, true);
  2104. }
  2105. static void
  2106. pmap_invalidate_page_pcid_noinvpcid(pmap_t pmap, vm_offset_t va)
  2107. {
  2108. pmap_invalidate_page_pcid(pmap, va, false);
  2109. }
  2110. static void
  2111. pmap_invalidate_page_nopcid(pmap_t pmap, vm_offset_t va)
  2112. {
  2113. }
  2114. DEFINE_IFUNC(static, void, pmap_invalidate_page_mode, (pmap_t, vm_offset_t),
  2115. static)
  2116. {
  2117. if (pmap_pcid_enabled)
  2118. return (invpcid_works ? pmap_invalidate_page_pcid_invpcid :
  2119. pmap_invalidate_page_pcid_noinvpcid);
  2120. return (pmap_invalidate_page_nopcid);
  2121. }
  2122. static void
  2123. pmap_invalidate_page_curcpu_cb(pmap_t pmap, vm_offset_t va,
  2124. vm_offset_t addr2 __unused)
  2125. {
  2126. if (pmap == kernel_pmap) {
  2127. invlpg(va);
  2128. } else {
  2129. if (pmap == PCPU_GET(curpmap))
  2130. invlpg(va);
  2131. pmap_invalidate_page_mode(pmap, va);
  2132. }
  2133. }
  2134. void
  2135. pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
  2136. {
  2137. if (pmap_type_guest(pmap)) {
  2138. pmap_invalidate_ept(pmap);
  2139. return;
  2140. }
  2141. KASSERT(pmap->pm_type == PT_X86,
  2142. ("pmap_invalidate_page: invalid type %d", pmap->pm_type));
  2143. smp_masked_invlpg(pmap_invalidate_cpu_mask(pmap), va, pmap,
  2144. pmap_invalidate_page_curcpu_cb);
  2145. }
  2146. /* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */
  2147. #define PMAP_INVLPG_THRESHOLD (4 * 1024 * PAGE_SIZE)
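/*
 * With 4 KB pages the threshold above works out to 16 MB; ranges at least
 * that large are handled by pmap_invalidate_all() instead (see
 * pmap_invalidate_range()).
 */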
  2148. static void
  2149. pmap_invalidate_range_pcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
  2150. const bool invpcid_works1)
  2151. {
  2152. struct invpcid_descr d;
  2153. uint64_t kcr3, ucr3;
  2154. uint32_t pcid;
  2155. u_int cpuid, i;
  2156. cpuid = PCPU_GET(cpuid);
  2157. if (pmap == PCPU_GET(curpmap)) {
  2158. if (pmap->pm_ucr3 != PMAP_NO_CR3) {
  2159. critical_enter();
  2160. pcid = pmap->pm_pcids[cpuid].pm_pcid;
  2161. if (invpcid_works1) {
  2162. d.pcid = pcid | PMAP_PCID_USER_PT;
  2163. d.pad = 0;
  2164. d.addr = sva;
  2165. for (; d.addr < eva; d.addr += PAGE_SIZE)
  2166. invpcid(&d, INVPCID_ADDR);
  2167. } else {
  2168. kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
  2169. ucr3 = pmap->pm_ucr3 | pcid |
  2170. PMAP_PCID_USER_PT | CR3_PCID_SAVE;
  2171. pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva);
  2172. }
  2173. critical_exit();
  2174. }
  2175. } else
  2176. pmap->pm_pcids[cpuid].pm_gen = 0;
  2177. CPU_FOREACH(i) {
  2178. if (cpuid != i)
  2179. pmap->pm_pcids[i].pm_gen = 0;
  2180. }
  2181. /* See the comment in pmap_invalidate_page_pcid(). */
  2182. atomic_thread_fence_seq_cst();
  2183. }
  2184. static void
  2185. pmap_invalidate_range_pcid_invpcid(pmap_t pmap, vm_offset_t sva,
  2186. vm_offset_t eva)
  2187. {
  2188. pmap_invalidate_range_pcid(pmap, sva, eva, true);
  2189. }
  2190. static void
  2191. pmap_invalidate_range_pcid_noinvpcid(pmap_t pmap, vm_offset_t sva,
  2192. vm_offset_t eva)
  2193. {
  2194. pmap_invalidate_range_pcid(pmap, sva, eva, false);
  2195. }
  2196. static void
  2197. pmap_invalidate_range_nopcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
  2198. {
  2199. }
  2200. DEFINE_IFUNC(static, void, pmap_invalidate_range_mode, (pmap_t, vm_offset_t,
  2201. vm_offset_t), static)
  2202. {
  2203. if (pmap_pcid_enabled)
  2204. return (invpcid_works ? pmap_invalidate_range_pcid_invpcid :
  2205. pmap_invalidate_range_pcid_noinvpcid);
  2206. return (pmap_invalidate_range_nopcid);
  2207. }
  2208. static void
  2209. pmap_invalidate_range_curcpu_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
  2210. {
  2211. vm_offset_t addr;
  2212. if (pmap == kernel_pmap) {
  2213. for (addr = sva; addr < eva; addr += PAGE_SIZE)
  2214. invlpg(addr);
  2215. } else {
  2216. if (pmap == PCPU_GET(curpmap)) {
  2217. for (addr = sva; addr < eva; addr += PAGE_SIZE)
  2218. invlpg(addr);
  2219. }
  2220. pmap_invalidate_range_mode(pmap, sva, eva);
  2221. }
  2222. }
  2223. void
  2224. pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
  2225. {
  2226. if (eva - sva >= PMAP_INVLPG_THRESHOLD) {
  2227. pmap_invalidate_all(pmap);
  2228. return;
  2229. }
  2230. if (pmap_type_guest(pmap)) {
  2231. pmap_invalidate_ept(pmap);
  2232. return;
  2233. }
  2234. KASSERT(pmap->pm_type == PT_X86,
  2235. ("pmap_invalidate_range: invalid type %d", pmap->pm_type));
  2236. smp_masked_invlpg_range(pmap_invalidate_cpu_mask(pmap), sva, eva, pmap,
  2237. pmap_invalidate_range_curcpu_cb);
  2238. }
  2239. static inline void
  2240. pmap_invalidate_all_pcid(pmap_t pmap, bool invpcid_works1)
  2241. {
  2242. struct invpcid_descr d;
  2243. uint64_t kcr3, ucr3;
  2244. uint32_t pcid;
  2245. u_int cpuid, i;
  2246. if (pmap == kernel_pmap) {
  2247. if (invpcid_works1) {
  2248. bzero(&d, sizeof(d));
  2249. invpcid(&d, INVPCID_CTXGLOB);
  2250. } else {
  2251. invltlb_glob();
  2252. }
  2253. } else {
  2254. cpuid = PCPU_GET(cpuid);
  2255. if (pmap == PCPU_GET(curpmap)) {
  2256. critical_enter();
  2257. pcid = pmap->pm_pcids[cpuid].pm_pcid;
  2258. if (invpcid_works1) {
  2259. d.pcid = pcid;
  2260. d.pad = 0;
  2261. d.addr = 0;
  2262. invpcid(&d, INVPCID_CTX);
  2263. if (pmap->pm_ucr3 != PMAP_NO_CR3) {
  2264. d.pcid |= PMAP_PCID_USER_PT;
  2265. invpcid(&d, INVPCID_CTX);
  2266. }
  2267. } else {
  2268. kcr3 = pmap->pm_cr3 | pcid;
  2269. ucr3 = pmap->pm_ucr3;
  2270. if (ucr3 != PMAP_NO_CR3) {
  2271. ucr3 |= pcid | PMAP_PCID_USER_PT;
  2272. pmap_pti_pcid_invalidate(ucr3, kcr3);
  2273. } else {
  2274. load_cr3(kcr3);
  2275. }
  2276. }
  2277. critical_exit();
  2278. } else
  2279. pmap->pm_pcids[cpuid].pm_gen = 0;
  2280. CPU_FOREACH(i) {
  2281. if (cpuid != i)
  2282. pmap->pm_pcids[i].pm_gen = 0;
  2283. }
  2284. }
  2285. /* See the comment in pmap_invalidate_page_pcid(). */
  2286. atomic_thread_fence_seq_cst();
  2287. }
  2288. static void
  2289. pmap_invalidate_all_pcid_invpcid(pmap_t pmap)
  2290. {
  2291. pmap_invalidate_all_pcid(pmap, true);
  2292. }
  2293. static void
  2294. pmap_invalidate_all_pcid_noinvpcid(pmap_t pmap)
  2295. {
  2296. pmap_invalidate_all_pcid(pmap, false);
  2297. }
  2298. static void
  2299. pmap_invalidate_all_nopcid(pmap_t pmap)
  2300. {
  2301. if (pmap == kernel_pmap)
  2302. invltlb_glob();
  2303. else if (pmap == PCPU_GET(curpmap))
  2304. invltlb();
  2305. }
  2306. DEFINE_IFUNC(static, void, pmap_invalidate_all_mode, (pmap_t), static)
  2307. {
  2308. if (pmap_pcid_enabled)
  2309. return (invpcid_works ? pmap_invalidate_all_pcid_invpcid :
  2310. pmap_invalidate_all_pcid_noinvpcid);
  2311. return (pmap_invalidate_all_nopcid);
  2312. }
  2313. static void
  2314. pmap_invalidate_all_curcpu_cb(pmap_t pmap, vm_offset_t addr1 __unused,
  2315. vm_offset_t addr2 __unused)
  2316. {
  2317. pmap_invalidate_all_mode(pmap);
  2318. }
  2319. void
  2320. pmap_invalidate_all(pmap_t pmap)
  2321. {
  2322. if (pmap_type_guest(pmap)) {
  2323. pmap_invalidate_ept(pmap);
  2324. return;
  2325. }
  2326. KASSERT(pmap->pm_type == PT_X86,
  2327. ("pmap_invalidate_all: invalid type %d", pmap->pm_type));
  2328. smp_masked_invltlb(pmap_invalidate_cpu_mask(pmap), pmap,
  2329. pmap_invalidate_all_curcpu_cb);
  2330. }
  2331. static void
  2332. pmap_invalidate_cache_curcpu_cb(pmap_t pmap __unused, vm_offset_t va __unused,
  2333. vm_offset_t addr2 __unused)
  2334. {
  2335. wbinvd();
  2336. }
  2337. void
  2338. pmap_invalidate_cache(void)
  2339. {
  2340. smp_cache_flush(pmap_invalidate_cache_curcpu_cb);
  2341. }
  2342. struct pde_action {
  2343. cpuset_t invalidate; /* processors that invalidate their TLB */
  2344. pmap_t pmap;
  2345. vm_offset_t va;
  2346. pd_entry_t *pde;
  2347. pd_entry_t newpde;
  2348. u_int store; /* processor that updates the PDE */
  2349. };
  2350. static void
  2351. pmap_update_pde_action(void *arg)
  2352. {
  2353. struct pde_action *act = arg;
  2354. if (act->store == PCPU_GET(cpuid))
  2355. pmap_update_pde_store(act->pmap, act->pde, act->newpde);
  2356. }
  2357. static void
  2358. pmap_update_pde_teardown(void *arg)
  2359. {
  2360. struct pde_action *act = arg;
  2361. if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
  2362. pmap_update_pde_invalidate(act->pmap, act->va, act->newpde);
  2363. }
  2364. /*
  2365. * Change the page size for the specified virtual address in a way that
  2366. * prevents any possibility of the TLB ever having two entries that map the
  2367. * same virtual address using different page sizes. This is the recommended
  2368. * workaround for Erratum 383 on AMD Family 10h processors. It prevents a
  2369. * machine check exception for a TLB state that is improperly diagnosed as a
  2370. * hardware error.
  2371. */
  2372. static void
  2373. pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
  2374. {
  2375. struct pde_action act;
  2376. cpuset_t active, other_cpus;
  2377. u_int cpuid;
  2378. sched_pin();
  2379. cpuid = PCPU_GET(cpuid);
  2380. other_cpus = all_cpus;
  2381. CPU_CLR(cpuid, &other_cpus);
  2382. if (pmap == kernel_pmap || pmap_type_guest(pmap))
  2383. active = all_cpus;
  2384. else {
  2385. active = pmap->pm_active;
  2386. }
  2387. if (CPU_OVERLAP(&active, &other_cpus)) {
  2388. act.store = cpuid;
  2389. act.invalidate = active;
  2390. act.va = va;
  2391. act.pmap = pmap;
  2392. act.pde = pde;
  2393. act.newpde = newpde;
  2394. CPU_SET(cpuid, &active);
  2395. smp_rendezvous_cpus(active,
  2396. smp_no_rendezvous_barrier, pmap_update_pde_action,
  2397. pmap_update_pde_teardown, &act);
  2398. } else {
  2399. pmap_update_pde_store(pmap, pde, newpde);
  2400. if (CPU_ISSET(cpuid, &active))
  2401. pmap_update_pde_invalidate(pmap, va, newpde);
  2402. }
  2403. sched_unpin();
  2404. }
  2405. #else /* !SMP */
  2406. /*
  2407. * Normal, non-SMP, invalidation functions.
  2408. */
  2409. void
  2410. pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
  2411. {
  2412. struct invpcid_descr d;
  2413. uint64_t kcr3, ucr3;
  2414. uint32_t pcid;
  2415. if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
  2416. pmap->pm_eptgen++;
  2417. return;
  2418. }
  2419. KASSERT(pmap->pm_type == PT_X86,
2420. ("pmap_invalidate_page: unknown type %d", pmap->pm_type));
  2421. if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) {
  2422. invlpg(va);
  2423. if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled &&
  2424. pmap->pm_ucr3 != PMAP_NO_CR3) {
  2425. critical_enter();
  2426. pcid = pmap->pm_pcids[0].pm_pcid;
  2427. if (invpcid_works) {
  2428. d.pcid = pcid | PMAP_PCID_USER_PT;
  2429. d.pad = 0;
  2430. d.addr = va;
  2431. invpcid(&d, INVPCID_ADDR);
  2432. } else {
  2433. kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
  2434. ucr3 = pmap->pm_ucr3 | pcid |
  2435. PMAP_PCID_USER_PT | CR3_PCID_SAVE;
  2436. pmap_pti_pcid_invlpg(ucr3, kcr3, va);
  2437. }
  2438. critical_exit();
  2439. }
  2440. } else if (pmap_pcid_enabled)
  2441. pmap->pm_pcids[0].pm_gen = 0;
  2442. }
  2443. void
  2444. pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
  2445. {
  2446. struct invpcid_descr d;
  2447. vm_offset_t addr;
  2448. uint64_t kcr3, ucr3;
  2449. if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
  2450. pmap->pm_eptgen++;
  2451. return;
  2452. }
  2453. KASSERT(pmap->pm_type == PT_X86,
  2454. ("pmap_invalidate_range: unknown type %d", pmap->pm_type));
  2455. if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) {
  2456. for (addr = sva; addr < eva; addr += PAGE_SIZE)
  2457. invlpg(addr);
  2458. if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled &&
  2459. pmap->pm_ucr3 != PMAP_NO_CR3) {
  2460. critical_enter();
  2461. if (invpcid_works) {
  2462. d.pcid = pmap->pm_pcids[0].pm_pcid |
  2463. PMAP_PCID_USER_PT;
  2464. d.pad = 0;
  2465. d.addr = sva;
  2466. for (; d.addr < eva; d.addr += PAGE_SIZE)
  2467. invpcid(&d, INVPCID_ADDR);
  2468. } else {
  2469. kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0].
  2470. pm_pcid | CR3_PCID_SAVE;
  2471. ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[0].
  2472. pm_pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
  2473. pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva);
  2474. }
  2475. critical_exit();
  2476. }
  2477. } else if (pmap_pcid_enabled) {
  2478. pmap->pm_pcids[0].pm_gen = 0;
  2479. }
  2480. }
  2481. void
  2482. pmap_invalidate_all(pmap_t pmap)
  2483. {
  2484. struct invpcid_descr d;
  2485. uint64_t kcr3, ucr3;
  2486. if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
  2487. pmap->pm_eptgen++;
  2488. return;
  2489. }
  2490. KASSERT(pmap->pm_type == PT_X86,
  2491. ("pmap_invalidate_all: unknown type %d", pmap->pm_type));
  2492. if (pmap == kernel_pmap) {
  2493. if (pmap_pcid_enabled && invpcid_works) {
  2494. bzero(&d, sizeof(d));
  2495. invpcid(&d, INVPCID_CTXGLOB);
  2496. } else {
  2497. invltlb_glob();
  2498. }
  2499. } else if (pmap == PCPU_GET(curpmap)) {
  2500. if (pmap_pcid_enabled) {
  2501. critical_enter();
  2502. if (invpcid_works) {
  2503. d.pcid = pmap->pm_pcids[0].pm_pcid;
  2504. d.pad = 0;
  2505. d.addr = 0;
  2506. invpcid(&d, INVPCID_CTX);
  2507. if (pmap->pm_ucr3 != PMAP_NO_CR3) {
  2508. d.pcid |= PMAP_PCID_USER_PT;
  2509. invpcid(&d, INVPCID_CTX);
  2510. }
  2511. } else {
  2512. kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0].pm_pcid;
  2513. if (pmap->pm_ucr3 != PMAP_NO_CR3) {
  2514. ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[
  2515. 0].pm_pcid | PMAP_PCID_USER_PT;
  2516. pmap_pti_pcid_invalidate(ucr3, kcr3);
  2517. } else
  2518. load_cr3(kcr3);
  2519. }
  2520. critical_exit();
  2521. } else {
  2522. invltlb();
  2523. }
  2524. } else if (pmap_pcid_enabled) {
  2525. pmap->pm_pcids[0].pm_gen = 0;
  2526. }
  2527. }
  2528. PMAP_INLINE void
  2529. pmap_invalidate_cache(void)
  2530. {
  2531. wbinvd();
  2532. }
  2533. static void
  2534. pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
  2535. {
  2536. pmap_update_pde_store(pmap, pde, newpde);
  2537. if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap))
  2538. pmap_update_pde_invalidate(pmap, va, newpde);
  2539. else
  2540. pmap->pm_pcids[0].pm_gen = 0;
  2541. }
  2542. #endif /* !SMP */
  2543. static void
  2544. pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde)
  2545. {
  2546. /*
  2547. * When the PDE has PG_PROMOTED set, the 2MB page mapping was created
  2548. * by a promotion that did not invalidate the 512 4KB page mappings
  2549. * that might exist in the TLB. Consequently, at this point, the TLB
  2550. * may hold both 4KB and 2MB page mappings for the address range [va,
  2551. * va + NBPDR). Therefore, the entire range must be invalidated here.
  2552. * In contrast, when PG_PROMOTED is clear, the TLB will not hold any
  2553. * 4KB page mappings for the address range [va, va + NBPDR), and so a
  2554. * single INVLPG suffices to invalidate the 2MB page mapping from the
  2555. * TLB.
  2556. */
  2557. if ((pde & PG_PROMOTED) != 0)
  2558. pmap_invalidate_range(pmap, va, va + NBPDR - 1);
  2559. else
  2560. pmap_invalidate_page(pmap, va);
  2561. }
  2562. DEFINE_IFUNC(, void, pmap_invalidate_cache_range,
  2563. (vm_offset_t sva, vm_offset_t eva), static)
  2564. {
  2565. if ((cpu_feature & CPUID_SS) != 0)
  2566. return (pmap_invalidate_cache_range_selfsnoop);
  2567. if ((cpu_feature & CPUID_CLFSH) != 0)
  2568. return (pmap_force_invalidate_cache_range);
  2569. return (pmap_invalidate_cache_range_all);
  2570. }
  2571. #define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024)
  2572. static void
  2573. pmap_invalidate_cache_range_check_align(vm_offset_t sva, vm_offset_t eva)
  2574. {
  2575. KASSERT((sva & PAGE_MASK) == 0,
  2576. ("pmap_invalidate_cache_range: sva not page-aligned"));
  2577. KASSERT((eva & PAGE_MASK) == 0,
  2578. ("pmap_invalidate_cache_range: eva not page-aligned"));
  2579. }
  2580. static void
  2581. pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, vm_offset_t eva)
  2582. {
  2583. pmap_invalidate_cache_range_check_align(sva, eva);
  2584. }
  2585. void
  2586. pmap_force_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
  2587. {
  2588. sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1);
  2589. /*
  2590. * XXX: Some CPUs fault, hang, or trash the local APIC
  2591. * registers if we use CLFLUSH on the local APIC range. The
  2592. * local APIC is always uncached, so we don't need to flush
  2593. * for that range anyway.
  2594. */
  2595. if (pmap_kextract(sva) == lapic_paddr)
  2596. return;
  2597. if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) {
  2598. /*
  2599. * Do per-cache line flush. Use a locked
2600. * instruction to ensure that previous stores are
  2601. * included in the write-back. The processor
  2602. * propagates flush to other processors in the cache
  2603. * coherence domain.
  2604. */
  2605. atomic_thread_fence_seq_cst();
  2606. for (; sva < eva; sva += cpu_clflush_line_size)
  2607. clflushopt(sva);
  2608. atomic_thread_fence_seq_cst();
  2609. } else {
  2610. /*
  2611. * Writes are ordered by CLFLUSH on Intel CPUs.
  2612. */
  2613. if (cpu_vendor_id != CPU_VENDOR_INTEL)
  2614. mfence();
  2615. for (; sva < eva; sva += cpu_clflush_line_size)
  2616. clflush(sva);
  2617. if (cpu_vendor_id != CPU_VENDOR_INTEL)
  2618. mfence();
  2619. }
  2620. }
  2621. static void
  2622. pmap_invalidate_cache_range_all(vm_offset_t sva, vm_offset_t eva)
  2623. {
  2624. pmap_invalidate_cache_range_check_align(sva, eva);
  2625. pmap_invalidate_cache();
  2626. }
  2627. /*
  2628. * Remove the specified set of pages from the data and instruction caches.
  2629. *
  2630. * In contrast to pmap_invalidate_cache_range(), this function does not
  2631. * rely on the CPU's self-snoop feature, because it is intended for use
  2632. * when moving pages into a different cache domain.
  2633. */
  2634. void
  2635. pmap_invalidate_cache_pages(vm_page_t *pages, int count)
  2636. {
  2637. vm_offset_t daddr, eva;
  2638. int i;
  2639. bool useclflushopt;
  2640. useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0;
  2641. if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
  2642. ((cpu_feature & CPUID_CLFSH) == 0 && !useclflushopt))
  2643. pmap_invalidate_cache();
  2644. else {
  2645. if (useclflushopt)
  2646. atomic_thread_fence_seq_cst();
  2647. else if (cpu_vendor_id != CPU_VENDOR_INTEL)
  2648. mfence();
  2649. for (i = 0; i < count; i++) {
  2650. daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i]));
  2651. eva = daddr + PAGE_SIZE;
  2652. for (; daddr < eva; daddr += cpu_clflush_line_size) {
  2653. if (useclflushopt)
  2654. clflushopt(daddr);
  2655. else
  2656. clflush(daddr);
  2657. }
  2658. }
  2659. if (useclflushopt)
  2660. atomic_thread_fence_seq_cst();
  2661. else if (cpu_vendor_id != CPU_VENDOR_INTEL)
  2662. mfence();
  2663. }
  2664. }
  2665. void
  2666. pmap_flush_cache_range(vm_offset_t sva, vm_offset_t eva)
  2667. {
  2668. pmap_invalidate_cache_range_check_align(sva, eva);
  2669. if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) == 0) {
  2670. pmap_force_invalidate_cache_range(sva, eva);
  2671. return;
  2672. }
  2673. /* See comment in pmap_force_invalidate_cache_range(). */
  2674. if (pmap_kextract(sva) == lapic_paddr)
  2675. return;
  2676. atomic_thread_fence_seq_cst();
  2677. for (; sva < eva; sva += cpu_clflush_line_size)
  2678. clwb(sva);
  2679. atomic_thread_fence_seq_cst();
  2680. }
  2681. void
  2682. pmap_flush_cache_phys_range(vm_paddr_t spa, vm_paddr_t epa, vm_memattr_t mattr)
  2683. {
  2684. pt_entry_t *pte;
  2685. vm_offset_t vaddr;
  2686. int error, pte_bits;
  2687. KASSERT((spa & PAGE_MASK) == 0,
  2688. ("pmap_flush_cache_phys_range: spa not page-aligned"));
  2689. KASSERT((epa & PAGE_MASK) == 0,
  2690. ("pmap_flush_cache_phys_range: epa not page-aligned"));
  2691. if (spa < dmaplimit) {
  2692. pmap_flush_cache_range(PHYS_TO_DMAP(spa), PHYS_TO_DMAP(MIN(
  2693. dmaplimit, epa)));
  2694. if (dmaplimit >= epa)
  2695. return;
  2696. spa = dmaplimit;
  2697. }
  2698. pte_bits = pmap_cache_bits(kernel_pmap, mattr, 0) | X86_PG_RW |
  2699. X86_PG_V;
  2700. error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK,
  2701. &vaddr);
  2702. KASSERT(error == 0, ("vmem_alloc failed: %d", error));
  2703. pte = vtopte(vaddr);
  2704. for (; spa < epa; spa += PAGE_SIZE) {
  2705. sched_pin();
  2706. pte_store(pte, spa | pte_bits);
  2707. invlpg(vaddr);
  2708. /* XXXKIB atomic inside flush_cache_range are excessive */
  2709. pmap_flush_cache_range(vaddr, vaddr + PAGE_SIZE);
  2710. sched_unpin();
  2711. }
  2712. vmem_free(kernel_arena, vaddr, PAGE_SIZE);
  2713. }
  2714. /*
  2715. * Routine: pmap_extract
  2716. * Function:
  2717. * Extract the physical page address associated
  2718. * with the given map/virtual_address pair.
  2719. */
  2720. vm_paddr_t
  2721. pmap_extract(pmap_t pmap, vm_offset_t va)
  2722. {
  2723. pdp_entry_t *pdpe;
  2724. pd_entry_t *pde;
  2725. pt_entry_t *pte, PG_V;
  2726. vm_paddr_t pa;
  2727. pa = 0;
  2728. PG_V = pmap_valid_bit(pmap);
  2729. PMAP_LOCK(pmap);
  2730. pdpe = pmap_pdpe(pmap, va);
  2731. if (pdpe != NULL && (*pdpe & PG_V) != 0) {
  2732. if ((*pdpe & PG_PS) != 0)
  2733. pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK);
  2734. else {
  2735. pde = pmap_pdpe_to_pde(pdpe, va);
  2736. if ((*pde & PG_V) != 0) {
  2737. if ((*pde & PG_PS) != 0) {
  2738. pa = (*pde & PG_PS_FRAME) |
  2739. (va & PDRMASK);
  2740. } else {
  2741. pte = pmap_pde_to_pte(pde, va);
  2742. pa = (*pte & PG_FRAME) |
  2743. (va & PAGE_MASK);
  2744. }
  2745. }
  2746. }
  2747. }
  2748. PMAP_UNLOCK(pmap);
  2749. return (pa);
  2750. }
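/*
 * Offset arithmetic used above, for reference: a 1 GB mapping keeps the
 * low 30 bits of 'va' (PDPMASK) as the offset into the superpage, a 2 MB
 * mapping keeps the low 21 bits (PDRMASK), and a 4 KB mapping keeps the
 * low 12 bits (PAGE_MASK); the high bits come from the frame stored in
 * the paging-structure entry.
 */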
  2751. /*
  2752. * Routine: pmap_extract_and_hold
  2753. * Function:
  2754. * Atomically extract and hold the physical page
  2755. * with the given pmap and virtual address pair
  2756. * if that mapping permits the given protection.
  2757. */
  2758. vm_page_t
  2759. pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
  2760. {
  2761. pd_entry_t pde, *pdep;
  2762. pt_entry_t pte, PG_RW, PG_V;
  2763. vm_paddr_t pa;
  2764. vm_page_t m;
  2765. pa = 0;
  2766. m = NULL;
  2767. PG_RW = pmap_rw_bit(pmap);
  2768. PG_V = pmap_valid_bit(pmap);
  2769. PMAP_LOCK(pmap);
  2770. retry:
  2771. pdep = pmap_pde(pmap, va);
  2772. if (pdep != NULL && (pde = *pdep)) {
  2773. if (pde & PG_PS) {
  2774. if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
  2775. if (vm_page_pa_tryrelock(pmap, (pde &
  2776. PG_PS_FRAME) | (va & PDRMASK), &pa))
  2777. goto retry;
  2778. m = PHYS_TO_VM_PAGE(pa);
  2779. }
  2780. } else {
  2781. pte = *pmap_pde_to_pte(pdep, va);
  2782. if ((pte & PG_V) &&
  2783. ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
  2784. if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME,
  2785. &pa))
  2786. goto retry;
  2787. m = PHYS_TO_VM_PAGE(pa);
  2788. }
  2789. }
  2790. if (m != NULL)
  2791. vm_page_hold(m);
  2792. }
  2793. PA_UNLOCK_COND(pa);
  2794. PMAP_UNLOCK(pmap);
  2795. return (m);
  2796. }
  2797. vm_paddr_t
  2798. pmap_kextract(vm_offset_t va)
  2799. {
  2800. pd_entry_t pde;
  2801. vm_paddr_t pa;
  2802. if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
  2803. pa = DMAP_TO_PHYS(va);
  2804. } else if (PMAP_ADDRESS_IN_LARGEMAP(va)) {
  2805. pa = pmap_large_map_kextract(va);
  2806. } else {
  2807. pde = *vtopde(va);
  2808. if (pde & PG_PS) {
  2809. pa = (pde & PG_PS_FRAME) | (va & PDRMASK);
  2810. } else {
  2811. /*
  2812. * Beware of a concurrent promotion that changes the
  2813. * PDE at this point! For example, vtopte() must not
  2814. * be used to access the PTE because it would use the
  2815. * new PDE. It is, however, safe to use the old PDE
  2816. * because the page table page is preserved by the
  2817. * promotion.
  2818. */
  2819. pa = *pmap_pde_to_pte(&pde, va);
  2820. pa = (pa & PG_FRAME) | (va & PAGE_MASK);
  2821. }
  2822. }
  2823. return (pa);
  2824. }
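/*
 * Note that for any physical address below dmaplimit the first branch
 * above makes pmap_kextract(PHYS_TO_DMAP(pa)) return pa without a page
 * table walk; only addresses outside the direct map and the large map are
 * resolved by walking the kernel page tables.
 */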
  2825. /***************************************************
  2826. * Low level mapping routines.....
  2827. ***************************************************/
  2828. /*
  2829. * Add a wired page to the kva.
  2830. * Note: not SMP coherent.
  2831. */
  2832. PMAP_INLINE void
  2833. pmap_kenter(vm_offset_t va, vm_paddr_t pa)
  2834. {
  2835. pt_entry_t *pte;
  2836. pte = vtopte(va);
  2837. pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g | pg_nx);
  2838. }
  2839. static __inline void
  2840. pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
  2841. {
  2842. pt_entry_t *pte;
  2843. int cache_bits;
  2844. pte = vtopte(va);
  2845. cache_bits = pmap_cache_bits(kernel_pmap, mode, 0);
  2846. pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g | pg_nx | cache_bits);
  2847. }
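/*
 * Hypothetical usage sketch (not part of this file; 'va' and 'fb_pa' are
 * placeholders): a driver holding a page of KVA could map one page of a
 * frame buffer write-combining with
 *
 *	pmap_kenter_attr(va, fb_pa, PAT_WRITE_COMBINING);
 *	pmap_invalidate_page(kernel_pmap, va);
 *
 * the explicit invalidation is needed because, as noted above, these
 * helpers are not SMP coherent on their own.
 */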
  2848. /*
  2849. * Remove a page from the kernel pagetables.
  2850. * Note: not SMP coherent.
  2851. */
  2852. PMAP_INLINE void
  2853. pmap_kremove(vm_offset_t va)
  2854. {
  2855. pt_entry_t *pte;
  2856. pte = vtopte(va);
  2857. pte_clear(pte);
  2858. }
  2859. /*
  2860. * Used to map a range of physical addresses into kernel
  2861. * virtual address space.
  2862. *
  2863. * The value passed in '*virt' is a suggested virtual address for
  2864. * the mapping. Architectures which can support a direct-mapped
  2865. * physical to virtual region can return the appropriate address
  2866. * within that region, leaving '*virt' unchanged. Other
  2867. * architectures should map the pages starting at '*virt' and
  2868. * update '*virt' with the first usable address after the mapped
  2869. * region.
  2870. */
  2871. vm_offset_t
  2872. pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
  2873. {
  2874. return PHYS_TO_DMAP(start);
  2875. }
  2876. /*
2877. * Add a list of wired pages to the kva. This
2878. * routine is only used for temporary
  2879. * kernel mappings that do not need to have
  2880. * page modification or references recorded.
  2881. * Note that old mappings are simply written
  2882. * over. The page *must* be wired.
  2883. * Note: SMP coherent. Uses a ranged shootdown IPI.
  2884. */
  2885. void
  2886. pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
  2887. {
  2888. pt_entry_t *endpte, oldpte, pa, *pte;
  2889. vm_page_t m;
  2890. int cache_bits;
  2891. oldpte = 0;
  2892. pte = vtopte(sva);
  2893. endpte = pte + count;
  2894. while (pte < endpte) {
  2895. m = *ma++;
  2896. cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0);
  2897. pa = VM_PAGE_TO_PHYS(m) | cache_bits;
  2898. if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) {
  2899. oldpte |= *pte;
  2900. pte_store(pte, pa | pg_g | pg_nx | X86_PG_RW | X86_PG_V);
  2901. }
  2902. pte++;
  2903. }
  2904. if (__predict_false((oldpte & X86_PG_V) != 0))
  2905. pmap_invalidate_range(kernel_pmap, sva, sva + count *
  2906. PAGE_SIZE);
  2907. }
  2908. /*
  2909. * This routine tears out page mappings from the
  2910. * kernel -- it is meant only for temporary mappings.
  2911. * Note: SMP coherent. Uses a ranged shootdown IPI.
  2912. */
  2913. void
  2914. pmap_qremove(vm_offset_t sva, int count)
  2915. {
  2916. vm_offset_t va;
  2917. va = sva;
  2918. while (count-- > 0) {
  2919. KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va));
  2920. pmap_kremove(va);
  2921. va += PAGE_SIZE;
  2922. }
  2923. pmap_invalidate_range(kernel_pmap, sva, va);
  2924. }
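/*
 * Hypothetical usage sketch (not from this file): the usual pairing is
 *
 *	sva = kva_alloc(npages * PAGE_SIZE);
 *	pmap_qenter(sva, ma, npages);
 *	... access the pages through sva ...
 *	pmap_qremove(sva, npages);
 *	kva_free(sva, npages * PAGE_SIZE);
 *
 * where 'ma' is an array of wired vm_page pointers and 'npages' its
 * length; kva_alloc()/kva_free() are just one way to obtain the KVA.
 */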
  2925. /***************************************************
  2926. * Page table page management routines.....
  2927. ***************************************************/
  2928. /*
  2929. * Schedule the specified unused page table page to be freed. Specifically,
  2930. * add the page to the specified list of pages that will be released to the
  2931. * physical memory manager after the TLB has been updated.
  2932. */
  2933. static __inline void
  2934. pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
  2935. boolean_t set_PG_ZERO)
  2936. {
  2937. if (set_PG_ZERO)
  2938. m->flags |= PG_ZERO;
  2939. else
  2940. m->flags &= ~PG_ZERO;
  2941. SLIST_INSERT_HEAD(free, m, plinks.s.ss);
  2942. }
  2943. /*
  2944. * Inserts the specified page table page into the specified pmap's collection
  2945. * of idle page table pages. Each of a pmap's page table pages is responsible
  2946. * for mapping a distinct range of virtual addresses. The pmap's collection is
  2947. * ordered by this virtual address range.
  2948. *
  2949. * If "promoted" is false, then the page table page "mpte" must be zero filled.
  2950. */
  2951. static __inline int
  2952. pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted)
  2953. {
  2954. PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  2955. mpte->valid = promoted ? VM_PAGE_BITS_ALL : 0;
  2956. return (vm_radix_insert(&pmap->pm_root, mpte));
  2957. }
  2958. /*
  2959. * Removes the page table page mapping the specified virtual address from the
  2960. * specified pmap's collection of idle page table pages, and returns it.
  2961. * Otherwise, returns NULL if there is no page table page corresponding to the
  2962. * specified virtual address.
  2963. */
  2964. static __inline vm_page_t
  2965. pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
  2966. {
  2967. PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  2968. return (vm_radix_remove(&pmap->pm_root, pmap_pde_pindex(va)));
  2969. }
  2970. /*
  2971. * Decrements a page table page's wire count, which is used to record the
  2972. * number of valid page table entries within the page. If the wire count
  2973. * drops to zero, then the page table page is unmapped. Returns TRUE if the
  2974. * page table page was unmapped and FALSE otherwise.
  2975. */
  2976. static inline boolean_t
  2977. pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
  2978. {
  2979. --m->wire_count;
  2980. if (m->wire_count == 0) {
  2981. _pmap_unwire_ptp(pmap, va, m, free);
  2982. return (TRUE);
  2983. } else
  2984. return (FALSE);
  2985. }
  2986. static void
  2987. _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
  2988. {
  2989. PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  2990. /*
  2991. * unmap the page table page
  2992. */
  2993. if (m->pindex >= (NUPDE + NUPDPE)) {
  2994. /* PDP page */
  2995. pml4_entry_t *pml4;
  2996. pml4 = pmap_pml4e(pmap, va);
  2997. *pml4 = 0;
  2998. if (pmap->pm_pml4u != NULL && va <= VM_MAXUSER_ADDRESS) {
  2999. pml4 = &pmap->pm_pml4u[pmap_pml4e_index(va)];
  3000. *pml4 = 0;
  3001. }
  3002. } else if (m->pindex >= NUPDE) {
  3003. /* PD page */
  3004. pdp_entry_t *pdp;
  3005. pdp = pmap_pdpe(pmap, va);
  3006. *pdp = 0;
  3007. } else {
  3008. /* PTE page */
  3009. pd_entry_t *pd;
  3010. pd = pmap_pde(pmap, va);
  3011. *pd = 0;
  3012. }
  3013. pmap_resident_count_dec(pmap, 1);
  3014. if (m->pindex < NUPDE) {
  3015. /* We just released a PT, unhold the matching PD */
  3016. vm_page_t pdpg;
  3017. pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME);
  3018. pmap_unwire_ptp(pmap, va, pdpg, free);
  3019. }
  3020. if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) {
  3021. /* We just released a PD, unhold the matching PDP */
  3022. vm_page_t pdppg;
  3023. pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME);
  3024. pmap_unwire_ptp(pmap, va, pdppg, free);
  3025. }
  3026. /*
  3027. * Put page on a list so that it is released after
  3028. * *ALL* TLB shootdown is done
  3029. */
  3030. pmap_add_delayed_free_list(m, free, TRUE);
  3031. }
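/*
 * Example of the cascade implemented above: when the last PTE in a page
 * table page is removed, the PT page is released here, which unwires the
 * PD page that mapped it; if that was the PD page's last entry it is
 * released in turn, unwiring its PDP page.  The PML4 page itself belongs
 * to the pmap and is only ever cleared, never freed, on this path.
 */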
  3032. /*
  3033. * After removing a page table entry, this routine is used to
  3034. * conditionally free the page, and manage the hold/wire counts.
  3035. */
  3036. static int
  3037. pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
  3038. struct spglist *free)
  3039. {
  3040. vm_page_t mpte;
  3041. if (va >= VM_MAXUSER_ADDRESS)
  3042. return (0);
  3043. KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
  3044. mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
  3045. return (pmap_unwire_ptp(pmap, va, mpte, free));
  3046. }
  3047. void
  3048. pmap_pinit0(pmap_t pmap)
  3049. {
  3050. struct proc *p;
  3051. struct thread *td;
  3052. int i;
  3053. PMAP_LOCK_INIT(pmap);
  3054. pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
  3055. pmap->pm_pml4u = NULL;
  3056. pmap->pm_cr3 = KPML4phys;
  3057. /* hack to keep pmap_pti_pcid_invalidate() alive */
  3058. pmap->pm_ucr3 = PMAP_NO_CR3;
  3059. pmap->pm_root.rt_root = 0;
  3060. CPU_ZERO(&pmap->pm_active);
  3061. TAILQ_INIT(&pmap->pm_pvchunk);
  3062. bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
  3063. pmap->pm_flags = pmap_flags;
  3064. CPU_FOREACH(i) {
  3065. pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN + 1;
  3066. pmap->pm_pcids[i].pm_gen = 1;
  3067. }
  3068. pmap_activate_boot(pmap);
  3069. td = curthread;
  3070. if (pti) {
  3071. p = td->td_proc;
  3072. PROC_LOCK(p);
  3073. p->p_amd64_md_flags |= P_MD_KPTI;
  3074. PROC_UNLOCK(p);
  3075. }
  3076. pmap_thread_init_invl_gen(td);
  3077. if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) {
  3078. pmap_pkru_ranges_zone = uma_zcreate("pkru ranges",
  3079. sizeof(struct pmap_pkru_range), NULL, NULL, NULL, NULL,
  3080. UMA_ALIGN_PTR, 0);
  3081. }
  3082. }
  3083. void
  3084. pmap_pinit_pml4(vm_page_t pml4pg)
  3085. {
  3086. pml4_entry_t *pm_pml4;
  3087. int i;
  3088. pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));
  3089. /* Wire in kernel global address entries. */
  3090. for (i = 0; i < NKPML4E; i++) {
  3091. pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) | X86_PG_RW |
  3092. X86_PG_V;
  3093. }
  3094. for (i = 0; i < ndmpdpphys; i++) {
  3095. pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) | X86_PG_RW |
  3096. X86_PG_V;
  3097. }
  3098. /* install self-referential address mapping entry(s) */
  3099. pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | X86_PG_V | X86_PG_RW |
  3100. X86_PG_A | X86_PG_M;
  3101. /* install large map entries if configured */
  3102. for (i = 0; i < lm_ents; i++)
  3103. pm_pml4[LMSPML4I + i] = kernel_pmap->pm_pml4[LMSPML4I + i];
  3104. }
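/*
 * A minimal userland sketch of the address arithmetic that the
 * self-referential PML4 entry installed above enables: with a recursive
 * slot, the PTE for any virtual address lives at a fixed, computable
 * virtual address.  SK_REC_IDX is a hypothetical slot number and 48-bit
 * canonical addresses are assumed; this is not the kernel's vtopte().
 */
#if 0
#include <stdint.h>
#include <stdio.h>

#define SK_PAGE_SHIFT	12
#define SK_NPTEPGSHIFT	9		/* 512 entries per table */
#define SK_REC_IDX	256ULL		/* hypothetical recursive PML4 slot */

static uint64_t
recursive_pte_va(uint64_t va)
{
	uint64_t base, idx;

	/* Base of the PTE window selected by the recursive slot. */
	base = SK_REC_IDX << (SK_PAGE_SHIFT + 3 * SK_NPTEPGSHIFT);
	if (SK_REC_IDX >= 256)		/* sign-extend to a canonical VA */
		base |= 0xffffULL << 48;
	/* Every 4 KB page of "va" owns one 8-byte PTE in that window. */
	idx = (va & ((1ULL << 48) - 1)) >> SK_PAGE_SHIFT;
	return (base + idx * sizeof(uint64_t));
}

int
main(void)
{

	printf("PTE for va 0x1000 would live at %#jx\n",
	    (uintmax_t)recursive_pte_va(0x1000));
	return (0);
}
#endif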
  3105. static void
  3106. pmap_pinit_pml4_pti(vm_page_t pml4pg)
  3107. {
  3108. pml4_entry_t *pm_pml4;
  3109. int i;
  3110. pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));
  3111. for (i = 0; i < NPML4EPG; i++)
  3112. pm_pml4[i] = pti_pml4[i];
  3113. }
  3114. /*
  3115. * Initialize a preallocated and zeroed pmap structure,
  3116. * such as one in a vmspace structure.
  3117. */
  3118. int
  3119. pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags)
  3120. {
  3121. vm_page_t pml4pg, pml4pgu;
  3122. vm_paddr_t pml4phys;
  3123. int i;
  3124. /*
  3125. * allocate the page directory page
  3126. */
  3127. pml4pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
  3128. VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_WAITOK);
  3129. pml4phys = VM_PAGE_TO_PHYS(pml4pg);
  3130. pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys);
  3131. CPU_FOREACH(i) {
  3132. pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE;
  3133. pmap->pm_pcids[i].pm_gen = 0;
  3134. }
  3135. pmap->pm_cr3 = PMAP_NO_CR3; /* initialize to an invalid value */
  3136. pmap->pm_ucr3 = PMAP_NO_CR3;
  3137. pmap->pm_pml4u = NULL;
  3138. pmap->pm_type = pm_type;
  3139. if ((pml4pg->flags & PG_ZERO) == 0)
  3140. pagezero(pmap->pm_pml4);
  3141. /*
  3142. * Do not install the host kernel mappings in the nested page
  3143. * tables. These mappings are meaningless in the guest physical
  3144. * address space.
  3145. * Install minimal kernel mappings in PTI case.
  3146. */
  3147. if (pm_type == PT_X86) {
  3148. pmap->pm_cr3 = pml4phys;
  3149. pmap_pinit_pml4(pml4pg);
  3150. if ((curproc->p_amd64_md_flags & P_MD_KPTI) != 0) {
  3151. pml4pgu = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
  3152. VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_WAITOK);
  3153. pmap->pm_pml4u = (pml4_entry_t *)PHYS_TO_DMAP(
  3154. VM_PAGE_TO_PHYS(pml4pgu));
  3155. pmap_pinit_pml4_pti(pml4pgu);
  3156. pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pml4pgu);
  3157. }
  3158. if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) {
  3159. rangeset_init(&pmap->pm_pkru, pkru_dup_range,
  3160. pkru_free_range, pmap, M_NOWAIT);
  3161. }
  3162. }
  3163. pmap->pm_root.rt_root = 0;
  3164. CPU_ZERO(&pmap->pm_active);
  3165. TAILQ_INIT(&pmap->pm_pvchunk);
  3166. bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
  3167. pmap->pm_flags = flags;
  3168. pmap->pm_eptgen = 0;
  3169. return (1);
  3170. }
  3171. int
  3172. pmap_pinit(pmap_t pmap)
  3173. {
  3174. return (pmap_pinit_type(pmap, PT_X86, pmap_flags));
  3175. }
  3176. /*
  3177. * This routine is called if the desired page table page does not exist.
  3178. *
  3179. * If page table page allocation fails, this routine may sleep before
  3180. * returning NULL. It sleeps only if a lock pointer was given.
  3181. *
  3182. * Note: If a page allocation fails at page table level two or three,
  3183. * one or two pages may be held during the wait, only to be released
  3184. * afterwards. This conservative approach is easily argued to avoid
  3185. * race conditions.
  3186. */
  3187. static vm_page_t
  3188. _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
  3189. {
  3190. vm_page_t m, pdppg, pdpg;
  3191. pt_entry_t PG_A, PG_M, PG_RW, PG_V;
  3192. PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  3193. PG_A = pmap_accessed_bit(pmap);
  3194. PG_M = pmap_modified_bit(pmap);
  3195. PG_V = pmap_valid_bit(pmap);
  3196. PG_RW = pmap_rw_bit(pmap);
  3197. /*
  3198. * Allocate a page table page.
  3199. */
  3200. if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
  3201. VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
  3202. if (lockp != NULL) {
  3203. RELEASE_PV_LIST_LOCK(lockp);
  3204. PMAP_UNLOCK(pmap);
  3205. PMAP_ASSERT_NOT_IN_DI();
  3206. vm_wait(NULL);
  3207. PMAP_LOCK(pmap);
  3208. }
  3209. /*
  3210. * Indicate the need to retry. While waiting, the page table
  3211. * page may have been allocated.
  3212. */
  3213. return (NULL);
  3214. }
  3215. if ((m->flags & PG_ZERO) == 0)
  3216. pmap_zero_page(m);
  3217. /*
  3218. * Map the pagetable page into the process address space, if
  3219. * it isn't already there.
  3220. */
  3221. if (ptepindex >= (NUPDE + NUPDPE)) {
  3222. pml4_entry_t *pml4, *pml4u;
  3223. vm_pindex_t pml4index;
  3224. /* Wire up a new PDPE page */
  3225. pml4index = ptepindex - (NUPDE + NUPDPE);
  3226. pml4 = &pmap->pm_pml4[pml4index];
  3227. *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
  3228. if (pmap->pm_pml4u != NULL && pml4index < NUPML4E) {
  3229. /*
  3230. * PTI: Make all user-space mappings in the
  3231. * kernel-mode page table no-execute so that
  3232. * we detect any programming errors that leave
  3233. * the kernel-mode page table active on return
  3234. * to user space.
  3235. */
  3236. if (pmap->pm_ucr3 != PMAP_NO_CR3)
  3237. *pml4 |= pg_nx;
  3238. pml4u = &pmap->pm_pml4u[pml4index];
  3239. *pml4u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V |
  3240. PG_A | PG_M;
  3241. }
  3242. } else if (ptepindex >= NUPDE) {
  3243. vm_pindex_t pml4index;
  3244. vm_pindex_t pdpindex;
  3245. pml4_entry_t *pml4;
  3246. pdp_entry_t *pdp;
  3247. /* Wire up a new PDE page */
  3248. pdpindex = ptepindex - NUPDE;
  3249. pml4index = pdpindex >> NPML4EPGSHIFT;
  3250. pml4 = &pmap->pm_pml4[pml4index];
  3251. if ((*pml4 & PG_V) == 0) {
  3252. /* Have to allocate a new pdp, recurse */
  3253. if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index,
  3254. lockp) == NULL) {
  3255. vm_page_unwire_noq(m);
  3256. vm_page_free_zero(m);
  3257. return (NULL);
  3258. }
  3259. } else {
  3260. /* Add reference to pdp page */
  3261. pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
  3262. pdppg->wire_count++;
  3263. }
  3264. pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
  3265. /* Now find the pdp page */
  3266. pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
  3267. *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
  3268. } else {
  3269. vm_pindex_t pml4index;
  3270. vm_pindex_t pdpindex;
  3271. pml4_entry_t *pml4;
  3272. pdp_entry_t *pdp;
  3273. pd_entry_t *pd;
  3274. /* Wire up a new PTE page */
  3275. pdpindex = ptepindex >> NPDPEPGSHIFT;
  3276. pml4index = pdpindex >> NPML4EPGSHIFT;
3277. /* First, find the pdp and check that it's valid. */
  3278. pml4 = &pmap->pm_pml4[pml4index];
  3279. if ((*pml4 & PG_V) == 0) {
  3280. /* Have to allocate a new pd, recurse */
  3281. if (_pmap_allocpte(pmap, NUPDE + pdpindex,
  3282. lockp) == NULL) {
  3283. vm_page_unwire_noq(m);
  3284. vm_page_free_zero(m);
  3285. return (NULL);
  3286. }
  3287. pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
  3288. pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
  3289. } else {
  3290. pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
  3291. pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
  3292. if ((*pdp & PG_V) == 0) {
  3293. /* Have to allocate a new pd, recurse */
  3294. if (_pmap_allocpte(pmap, NUPDE + pdpindex,
  3295. lockp) == NULL) {
  3296. vm_page_unwire_noq(m);
  3297. vm_page_free_zero(m);
  3298. return (NULL);
  3299. }
  3300. } else {
  3301. /* Add reference to the pd page */
  3302. pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
  3303. pdpg->wire_count++;
  3304. }
  3305. }
  3306. pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);
  3307. /* Now we know where the page directory page is */
  3308. pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)];
  3309. *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
  3310. }
  3311. pmap_resident_count_inc(pmap, 1);
  3312. return (m);
  3313. }
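/*
 * A minimal sketch of the pindex arithmetic in _pmap_allocpte() above,
 * assuming 9-bit (512-entry) tables: shifting a PT page's pindex right
 * once gives the index of its parent PD page, shifting twice gives its
 * PML4 slot, and masking with 511 selects the slot within each table.
 * Illustrative only; the kernel uses NPDPEPGSHIFT/NPML4EPGSHIFT.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

#define SK_SHIFT	9			/* 512 entries per table */
#define SK_MASK		((1ULL << SK_SHIFT) - 1)

int
main(void)
{
	uint64_t ptepindex = 0x12345;		/* arbitrary PT-page index */
	uint64_t pdpindex = ptepindex >> SK_SHIFT;
	uint64_t pml4index = pdpindex >> SK_SHIFT;

	printf("PML4 slot %ju, PDP slot %ju, PD slot %ju\n",
	    (uintmax_t)pml4index, (uintmax_t)(pdpindex & SK_MASK),
	    (uintmax_t)(ptepindex & SK_MASK));
	return (0);
}
#endif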
  3314. static vm_page_t
  3315. pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
  3316. {
  3317. vm_pindex_t pdpindex, ptepindex;
  3318. pdp_entry_t *pdpe, PG_V;
  3319. vm_page_t pdpg;
  3320. PG_V = pmap_valid_bit(pmap);
  3321. retry:
  3322. pdpe = pmap_pdpe(pmap, va);
  3323. if (pdpe != NULL && (*pdpe & PG_V) != 0) {
  3324. /* Add a reference to the pd page. */
  3325. pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME);
  3326. pdpg->wire_count++;
  3327. } else {
  3328. /* Allocate a pd page. */
  3329. ptepindex = pmap_pde_pindex(va);
  3330. pdpindex = ptepindex >> NPDPEPGSHIFT;
  3331. pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp);
  3332. if (pdpg == NULL && lockp != NULL)
  3333. goto retry;
  3334. }
  3335. return (pdpg);
  3336. }
  3337. static vm_page_t
  3338. pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
  3339. {
  3340. vm_pindex_t ptepindex;
  3341. pd_entry_t *pd, PG_V;
  3342. vm_page_t m;
  3343. PG_V = pmap_valid_bit(pmap);
  3344. /*
  3345. * Calculate pagetable page index
  3346. */
  3347. ptepindex = pmap_pde_pindex(va);
  3348. retry:
  3349. /*
  3350. * Get the page directory entry
  3351. */
  3352. pd = pmap_pde(pmap, va);
  3353. /*
  3354. * This supports switching from a 2MB page to a
  3355. * normal 4K page.
  3356. */
  3357. if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
  3358. if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) {
  3359. /*
  3360. * Invalidation of the 2MB page mapping may have caused
  3361. * the deallocation of the underlying PD page.
  3362. */
  3363. pd = NULL;
  3364. }
  3365. }
  3366. /*
  3367. * If the page table page is mapped, we just increment the
  3368. * hold count, and activate it.
  3369. */
  3370. if (pd != NULL && (*pd & PG_V) != 0) {
  3371. m = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
  3372. m->wire_count++;
  3373. } else {
  3374. /*
  3375. * Here if the pte page isn't mapped, or if it has been
  3376. * deallocated.
  3377. */
  3378. m = _pmap_allocpte(pmap, ptepindex, lockp);
  3379. if (m == NULL && lockp != NULL)
  3380. goto retry;
  3381. }
  3382. return (m);
  3383. }
  3384. /***************************************************
  3385. * Pmap allocation/deallocation routines.
  3386. ***************************************************/
  3387. /*
  3388. * Release any resources held by the given physical map.
  3389. * Called when a pmap initialized by pmap_pinit is being released.
  3390. * Should only be called if the map contains no valid mappings.
  3391. */
  3392. void
  3393. pmap_release(pmap_t pmap)
  3394. {
  3395. vm_page_t m;
  3396. int i;
  3397. KASSERT(pmap->pm_stats.resident_count == 0,
  3398. ("pmap_release: pmap resident count %ld != 0",
  3399. pmap->pm_stats.resident_count));
  3400. KASSERT(vm_radix_is_empty(&pmap->pm_root),
  3401. ("pmap_release: pmap has reserved page table page(s)"));
  3402. KASSERT(CPU_EMPTY(&pmap->pm_active),
  3403. ("releasing active pmap %p", pmap));
  3404. m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4));
  3405. for (i = 0; i < NKPML4E; i++) /* KVA */
  3406. pmap->pm_pml4[KPML4BASE + i] = 0;
  3407. for (i = 0; i < ndmpdpphys; i++)/* Direct Map */
  3408. pmap->pm_pml4[DMPML4I + i] = 0;
  3409. pmap->pm_pml4[PML4PML4I] = 0; /* Recursive Mapping */
  3410. for (i = 0; i < lm_ents; i++) /* Large Map */
  3411. pmap->pm_pml4[LMSPML4I + i] = 0;
  3412. vm_page_unwire_noq(m);
  3413. vm_page_free_zero(m);
  3414. if (pmap->pm_pml4u != NULL) {
  3415. m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4u));
  3416. vm_page_unwire_noq(m);
  3417. vm_page_free(m);
  3418. }
  3419. if (pmap->pm_type == PT_X86 &&
  3420. (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0)
  3421. rangeset_fini(&pmap->pm_pkru);
  3422. }
  3423. static int
  3424. kvm_size(SYSCTL_HANDLER_ARGS)
  3425. {
  3426. unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
  3427. return sysctl_handle_long(oidp, &ksize, 0, req);
  3428. }
  3429. SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
  3430. 0, 0, kvm_size, "LU", "Size of KVM");
  3431. static int
  3432. kvm_free(SYSCTL_HANDLER_ARGS)
  3433. {
  3434. unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
  3435. return sysctl_handle_long(oidp, &kfree, 0, req);
  3436. }
  3437. SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
  3438. 0, 0, kvm_free, "LU", "Amount of KVM free");
  3439. /*
  3440. * grow the number of kernel page table entries, if needed
  3441. */
  3442. void
  3443. pmap_growkernel(vm_offset_t addr)
  3444. {
  3445. vm_paddr_t paddr;
  3446. vm_page_t nkpg;
  3447. pd_entry_t *pde, newpdir;
  3448. pdp_entry_t *pdpe;
  3449. mtx_assert(&kernel_map->system_mtx, MA_OWNED);
  3450. /*
  3451. * Return if "addr" is within the range of kernel page table pages
  3452. * that were preallocated during pmap bootstrap. Moreover, leave
  3453. * "kernel_vm_end" and the kernel page table as they were.
  3454. *
  3455. * The correctness of this action is based on the following
  3456. * argument: vm_map_insert() allocates contiguous ranges of the
  3457. * kernel virtual address space. It calls this function if a range
  3458. * ends after "kernel_vm_end". If the kernel is mapped between
  3459. * "kernel_vm_end" and "addr", then the range cannot begin at
  3460. * "kernel_vm_end". In fact, its beginning address cannot be less
  3461. * than the kernel. Thus, there is no immediate need to allocate
  3462. * any new kernel page table pages between "kernel_vm_end" and
  3463. * "KERNBASE".
  3464. */
  3465. if (KERNBASE < addr && addr <= KERNBASE + nkpt * NBPDR)
  3466. return;
  3467. addr = roundup2(addr, NBPDR);
  3468. if (addr - 1 >= vm_map_max(kernel_map))
  3469. addr = vm_map_max(kernel_map);
  3470. while (kernel_vm_end < addr) {
  3471. pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end);
  3472. if ((*pdpe & X86_PG_V) == 0) {
  3473. /* We need a new PDP entry */
  3474. nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT,
  3475. VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
  3476. VM_ALLOC_WIRED | VM_ALLOC_ZERO);
  3477. if (nkpg == NULL)
  3478. panic("pmap_growkernel: no memory to grow kernel");
  3479. if ((nkpg->flags & PG_ZERO) == 0)
  3480. pmap_zero_page(nkpg);
  3481. paddr = VM_PAGE_TO_PHYS(nkpg);
  3482. *pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW |
  3483. X86_PG_A | X86_PG_M);
  3484. continue; /* try again */
  3485. }
  3486. pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end);
  3487. if ((*pde & X86_PG_V) != 0) {
  3488. kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
  3489. if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
  3490. kernel_vm_end = vm_map_max(kernel_map);
  3491. break;
  3492. }
  3493. continue;
  3494. }
  3495. nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end),
  3496. VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
  3497. VM_ALLOC_ZERO);
  3498. if (nkpg == NULL)
  3499. panic("pmap_growkernel: no memory to grow kernel");
  3500. if ((nkpg->flags & PG_ZERO) == 0)
  3501. pmap_zero_page(nkpg);
  3502. paddr = VM_PAGE_TO_PHYS(nkpg);
  3503. newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
  3504. pde_store(pde, newpdir);
  3505. kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
  3506. if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
  3507. kernel_vm_end = vm_map_max(kernel_map);
  3508. break;
  3509. }
  3510. }
  3511. }
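/*
 * A minimal userland sketch of the 2 MB stepping performed by
 * pmap_growkernel() above, assuming amd64's NBPDR/PDRMASK values;
 * roundup2() is reimplemented locally and the page table allocation
 * work is elided.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

#define SK_NBPDR	(2ULL << 20)		/* 2 MB */
#define SK_PDRMASK	(SK_NBPDR - 1)
#define sk_roundup2(x, y) (((x) + ((y) - 1)) & ~((y) - 1))

int
main(void)
{
	uint64_t kernel_vm_end = 0xfffffe0000a00000ULL;	/* 2 MB aligned */
	uint64_t addr = kernel_vm_end + 0x1234;		/* requested end */

	addr = sk_roundup2(addr, SK_NBPDR);
	while (kernel_vm_end < addr) {
		/* The real code allocates and installs one PDE here. */
		kernel_vm_end = (kernel_vm_end + SK_NBPDR) & ~SK_PDRMASK;
		printf("kernel_vm_end grew to %#jx\n",
		    (uintmax_t)kernel_vm_end);
	}
	return (0);
}
#endif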
  3512. /***************************************************
  3513. * page management routines.
  3514. ***************************************************/
  3515. CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
  3516. CTASSERT(_NPCM == 3);
  3517. CTASSERT(_NPCPV == 168);
  3518. static __inline struct pv_chunk *
  3519. pv_to_chunk(pv_entry_t pv)
  3520. {
  3521. return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
  3522. }
  3523. #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
  3524. #define PC_FREE0 0xfffffffffffffffful
  3525. #define PC_FREE1 0xfffffffffffffffful
  3526. #define PC_FREE2 0x000000fffffffffful
  3527. static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
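/*
 * A minimal sketch of how the PC_FREE* masks above follow from the
 * _NPCPV = 168 and _NPCM = 3 values asserted earlier: two full 64-bit
 * words plus a 40-bit tail.  Illustrative only.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	int npcpv = 168, npcm = 3, tail;
	uint64_t free2;

	tail = npcpv - (npcm - 1) * 64;		/* 40 bits left over */
	free2 = (1ULL << tail) - 1;		/* 0x000000ffffffffff */
	printf("tail bits = %d, PC_FREE2 = %#018jx\n", tail,
	    (uintmax_t)free2);
	return (0);
}
#endif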
  3528. #ifdef PV_STATS
  3529. static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
  3530. SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
  3531. "Current number of pv entry chunks");
  3532. SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
  3533. "Current number of pv entry chunks allocated");
  3534. SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
  3535. "Current number of pv entry chunks frees");
  3536. SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
  3537. "Number of times tried to get a chunk page but failed.");
  3538. static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
  3539. static int pv_entry_spare;
  3540. SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
  3541. "Current number of pv entry frees");
  3542. SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
  3543. "Current number of pv entry allocs");
  3544. SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
  3545. "Current number of pv entries");
  3546. SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
  3547. "Current number of spare pv entries");
  3548. #endif
  3549. static void
  3550. reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap, bool start_di)
  3551. {
  3552. if (pmap == NULL)
  3553. return;
  3554. pmap_invalidate_all(pmap);
  3555. if (pmap != locked_pmap)
  3556. PMAP_UNLOCK(pmap);
  3557. if (start_di)
  3558. pmap_delayed_invl_finish();
  3559. }
  3560. /*
  3561. * We are in a serious low memory condition. Resort to
  3562. * drastic measures to free some pages so we can allocate
  3563. * another pv entry chunk.
  3564. *
  3565. * Returns NULL if PV entries were reclaimed from the specified pmap.
  3566. *
  3567. * We do not, however, unmap 2mpages because subsequent accesses will
  3568. * allocate per-page pv entries until repromotion occurs, thereby
  3569. * exacerbating the shortage of free pv entries.
  3570. */
  3571. static vm_page_t
  3572. reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
  3573. {
  3574. struct pv_chunk *pc, *pc_marker, *pc_marker_end;
  3575. struct pv_chunk_header pc_marker_b, pc_marker_end_b;
  3576. struct md_page *pvh;
  3577. pd_entry_t *pde;
  3578. pmap_t next_pmap, pmap;
  3579. pt_entry_t *pte, tpte;
  3580. pt_entry_t PG_G, PG_A, PG_M, PG_RW;
  3581. pv_entry_t pv;
  3582. vm_offset_t va;
  3583. vm_page_t m, m_pc;
  3584. struct spglist free;
  3585. uint64_t inuse;
  3586. int bit, field, freed;
  3587. bool start_di;
  3588. static int active_reclaims = 0;
  3589. PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
  3590. KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
  3591. pmap = NULL;
  3592. m_pc = NULL;
  3593. PG_G = PG_A = PG_M = PG_RW = 0;
  3594. SLIST_INIT(&free);
  3595. bzero(&pc_marker_b, sizeof(pc_marker_b));
  3596. bzero(&pc_marker_end_b, sizeof(pc_marker_end_b));
  3597. pc_marker = (struct pv_chunk *)&pc_marker_b;
  3598. pc_marker_end = (struct pv_chunk *)&pc_marker_end_b;
  3599. /*
  3600. * A delayed invalidation block should already be active if
  3601. * pmap_advise() or pmap_remove() called this function by way
  3602. * of pmap_demote_pde_locked().
  3603. */
  3604. start_di = pmap_not_in_di();
  3605. mtx_lock(&pv_chunks_mutex);
  3606. active_reclaims++;
  3607. TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru);
  3608. TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru);
  3609. while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end &&
  3610. SLIST_EMPTY(&free)) {
  3611. next_pmap = pc->pc_pmap;
  3612. if (next_pmap == NULL) {
  3613. /*
  3614. * The next chunk is a marker. However, it is
  3615. * not our marker, so active_reclaims must be
  3616. * > 1. Consequently, the next_chunk code
  3617. * will not rotate the pv_chunks list.
  3618. */
  3619. goto next_chunk;
  3620. }
  3621. mtx_unlock(&pv_chunks_mutex);
  3622. /*
  3623. * A pv_chunk can only be removed from the pc_lru list
3624. * when both pv_chunks_mutex is owned and the
  3625. * corresponding pmap is locked.
  3626. */
  3627. if (pmap != next_pmap) {
  3628. reclaim_pv_chunk_leave_pmap(pmap, locked_pmap,
  3629. start_di);
  3630. pmap = next_pmap;
  3631. /* Avoid deadlock and lock recursion. */
  3632. if (pmap > locked_pmap) {
  3633. RELEASE_PV_LIST_LOCK(lockp);
  3634. PMAP_LOCK(pmap);
  3635. if (start_di)
  3636. pmap_delayed_invl_start();
  3637. mtx_lock(&pv_chunks_mutex);
  3638. continue;
  3639. } else if (pmap != locked_pmap) {
  3640. if (PMAP_TRYLOCK(pmap)) {
  3641. if (start_di)
  3642. pmap_delayed_invl_start();
  3643. mtx_lock(&pv_chunks_mutex);
  3644. continue;
  3645. } else {
  3646. pmap = NULL; /* pmap is not locked */
  3647. mtx_lock(&pv_chunks_mutex);
  3648. pc = TAILQ_NEXT(pc_marker, pc_lru);
  3649. if (pc == NULL ||
  3650. pc->pc_pmap != next_pmap)
  3651. continue;
  3652. goto next_chunk;
  3653. }
  3654. } else if (start_di)
  3655. pmap_delayed_invl_start();
  3656. PG_G = pmap_global_bit(pmap);
  3657. PG_A = pmap_accessed_bit(pmap);
  3658. PG_M = pmap_modified_bit(pmap);
  3659. PG_RW = pmap_rw_bit(pmap);
  3660. }
  3661. /*
  3662. * Destroy every non-wired, 4 KB page mapping in the chunk.
  3663. */
  3664. freed = 0;
  3665. for (field = 0; field < _NPCM; field++) {
  3666. for (inuse = ~pc->pc_map[field] & pc_freemask[field];
  3667. inuse != 0; inuse &= ~(1UL << bit)) {
  3668. bit = bsfq(inuse);
  3669. pv = &pc->pc_pventry[field * 64 + bit];
  3670. va = pv->pv_va;
  3671. pde = pmap_pde(pmap, va);
  3672. if ((*pde & PG_PS) != 0)
  3673. continue;
  3674. pte = pmap_pde_to_pte(pde, va);
  3675. if ((*pte & PG_W) != 0)
  3676. continue;
  3677. tpte = pte_load_clear(pte);
  3678. if ((tpte & PG_G) != 0)
  3679. pmap_invalidate_page(pmap, va);
  3680. m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
  3681. if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
  3682. vm_page_dirty(m);
  3683. if ((tpte & PG_A) != 0)
  3684. vm_page_aflag_set(m, PGA_REFERENCED);
  3685. CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
  3686. TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
  3687. m->md.pv_gen++;
  3688. if (TAILQ_EMPTY(&m->md.pv_list) &&
  3689. (m->flags & PG_FICTITIOUS) == 0) {
  3690. pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
  3691. if (TAILQ_EMPTY(&pvh->pv_list)) {
  3692. vm_page_aflag_clear(m,
  3693. PGA_WRITEABLE);
  3694. }
  3695. }
  3696. pmap_delayed_invl_page(m);
  3697. pc->pc_map[field] |= 1UL << bit;
  3698. pmap_unuse_pt(pmap, va, *pde, &free);
  3699. freed++;
  3700. }
  3701. }
  3702. if (freed == 0) {
  3703. mtx_lock(&pv_chunks_mutex);
  3704. goto next_chunk;
  3705. }
  3706. /* Every freed mapping is for a 4 KB page. */
  3707. pmap_resident_count_dec(pmap, freed);
  3708. PV_STAT(atomic_add_long(&pv_entry_frees, freed));
  3709. PV_STAT(atomic_add_int(&pv_entry_spare, freed));
  3710. PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
  3711. TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
  3712. if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 &&
  3713. pc->pc_map[2] == PC_FREE2) {
  3714. PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
  3715. PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
  3716. PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
  3717. /* Entire chunk is free; return it. */
  3718. m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
  3719. dump_drop_page(m_pc->phys_addr);
  3720. mtx_lock(&pv_chunks_mutex);
  3721. TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
  3722. break;
  3723. }
  3724. TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
  3725. mtx_lock(&pv_chunks_mutex);
  3726. /* One freed pv entry in locked_pmap is sufficient. */
  3727. if (pmap == locked_pmap)
  3728. break;
  3729. next_chunk:
  3730. TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
  3731. TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru);
  3732. if (active_reclaims == 1 && pmap != NULL) {
  3733. /*
  3734. * Rotate the pv chunks list so that we do not
  3735. * scan the same pv chunks that could not be
  3736. * freed (because they contained a wired
  3737. * and/or superpage mapping) on every
  3738. * invocation of reclaim_pv_chunk().
  3739. */
  3740. while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) {
  3741. MPASS(pc->pc_pmap != NULL);
  3742. TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
  3743. TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
  3744. }
  3745. }
  3746. }
  3747. TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
  3748. TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru);
  3749. active_reclaims--;
  3750. mtx_unlock(&pv_chunks_mutex);
  3751. reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, start_di);
  3752. if (m_pc == NULL && !SLIST_EMPTY(&free)) {
  3753. m_pc = SLIST_FIRST(&free);
  3754. SLIST_REMOVE_HEAD(&free, plinks.s.ss);
  3755. /* Recycle a freed page table page. */
  3756. m_pc->wire_count = 1;
  3757. }
  3758. vm_page_free_pages_toq(&free, true);
  3759. return (m_pc);
  3760. }
  3761. /*
  3762. * free the pv_entry back to the free list
  3763. */
  3764. static void
  3765. free_pv_entry(pmap_t pmap, pv_entry_t pv)
  3766. {
  3767. struct pv_chunk *pc;
  3768. int idx, field, bit;
  3769. PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  3770. PV_STAT(atomic_add_long(&pv_entry_frees, 1));
  3771. PV_STAT(atomic_add_int(&pv_entry_spare, 1));
  3772. PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
  3773. pc = pv_to_chunk(pv);
  3774. idx = pv - &pc->pc_pventry[0];
  3775. field = idx / 64;
  3776. bit = idx % 64;
  3777. pc->pc_map[field] |= 1ul << bit;
  3778. if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
  3779. pc->pc_map[2] != PC_FREE2) {
  3780. /* 98% of the time, pc is already at the head of the list. */
  3781. if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
  3782. TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
  3783. TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
  3784. }
  3785. return;
  3786. }
  3787. TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
  3788. free_pv_chunk(pc);
  3789. }
  3790. static void
  3791. free_pv_chunk(struct pv_chunk *pc)
  3792. {
  3793. vm_page_t m;
  3794. mtx_lock(&pv_chunks_mutex);
  3795. TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
  3796. mtx_unlock(&pv_chunks_mutex);
  3797. PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
  3798. PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
  3799. PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
  3800. /* entire chunk is free, return it */
  3801. m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
  3802. dump_drop_page(m->phys_addr);
  3803. vm_page_unwire_noq(m);
  3804. vm_page_free(m);
  3805. }
  3806. /*
  3807. * Returns a new PV entry, allocating a new PV chunk from the system when
  3808. * needed. If this PV chunk allocation fails and a PV list lock pointer was
  3809. * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is
  3810. * returned.
  3811. *
  3812. * The given PV list lock may be released.
  3813. */
  3814. static pv_entry_t
  3815. get_pv_entry(pmap_t pmap, struct rwlock **lockp)
  3816. {
  3817. int bit, field;
  3818. pv_entry_t pv;
  3819. struct pv_chunk *pc;
  3820. vm_page_t m;
  3821. PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  3822. PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
  3823. retry:
  3824. pc = TAILQ_FIRST(&pmap->pm_pvchunk);
  3825. if (pc != NULL) {
  3826. for (field = 0; field < _NPCM; field++) {
  3827. if (pc->pc_map[field]) {
  3828. bit = bsfq(pc->pc_map[field]);
  3829. break;
  3830. }
  3831. }
  3832. if (field < _NPCM) {
  3833. pv = &pc->pc_pventry[field * 64 + bit];
  3834. pc->pc_map[field] &= ~(1ul << bit);
  3835. /* If this was the last item, move it to tail */
  3836. if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
  3837. pc->pc_map[2] == 0) {
  3838. TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
  3839. TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
  3840. pc_list);
  3841. }
  3842. PV_STAT(atomic_add_long(&pv_entry_count, 1));
  3843. PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
  3844. return (pv);
  3845. }
  3846. }
  3847. /* No free items, allocate another chunk */
  3848. m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
  3849. VM_ALLOC_WIRED);
  3850. if (m == NULL) {
  3851. if (lockp == NULL) {
  3852. PV_STAT(pc_chunk_tryfail++);
  3853. return (NULL);
  3854. }
  3855. m = reclaim_pv_chunk(pmap, lockp);
  3856. if (m == NULL)
  3857. goto retry;
  3858. }
  3859. PV_STAT(atomic_add_int(&pc_chunk_count, 1));
  3860. PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
  3861. dump_add_page(m->phys_addr);
  3862. pc = (void *)PHYS_TO_DMAP(m->phys_addr);
  3863. pc->pc_pmap = pmap;
  3864. pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */
  3865. pc->pc_map[1] = PC_FREE1;
  3866. pc->pc_map[2] = PC_FREE2;
  3867. mtx_lock(&pv_chunks_mutex);
  3868. TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
  3869. mtx_unlock(&pv_chunks_mutex);
  3870. pv = &pc->pc_pventry[0];
  3871. TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
  3872. PV_STAT(atomic_add_long(&pv_entry_count, 1));
  3873. PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
  3874. return (pv);
  3875. }
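/*
 * A minimal userland sketch of the bitmap allocation pattern used by
 * get_pv_entry() above: find the lowest set bit across pc_map[], clear
 * it, and index the entry array by field * 64 + bit.  The GCC/Clang
 * builtin __builtin_ctzll() stands in for bsfq().
 */
#if 0
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t map[3] = { 0, 0x30, ~0ULL };	/* 1 bits mark free entries */
	int field, bit;

	for (field = 0; field < 3; field++)
		if (map[field] != 0)
			break;
	bit = __builtin_ctzll(map[field]);	/* lowest set bit */
	map[field] &= ~(1ULL << bit);		/* mark the entry in use */
	printf("allocated pv entry index %d\n", field * 64 + bit);
	return (0);
}
#endif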
  3876. /*
  3877. * Returns the number of one bits within the given PV chunk map.
  3878. *
3879. * The errata for Intel processors state that "POPCNT Instruction May
  3880. * Take Longer to Execute Than Expected". It is believed that the
  3881. * issue is the spurious dependency on the destination register.
  3882. * Provide a hint to the register rename logic that the destination
  3883. * value is overwritten, by clearing it, as suggested in the
  3884. * optimization manual. It should be cheap for unaffected processors
  3885. * as well.
  3886. *
3887. * Reference numbers for the errata are
  3888. * 4th Gen Core: HSD146
  3889. * 5th Gen Core: BDM85
  3890. * 6th Gen Core: SKL029
  3891. */
  3892. static int
  3893. popcnt_pc_map_pq(uint64_t *map)
  3894. {
  3895. u_long result, tmp;
  3896. __asm __volatile("xorl %k0,%k0;popcntq %2,%0;"
  3897. "xorl %k1,%k1;popcntq %3,%1;addl %k1,%k0;"
  3898. "xorl %k1,%k1;popcntq %4,%1;addl %k1,%k0"
  3899. : "=&r" (result), "=&r" (tmp)
  3900. : "m" (map[0]), "m" (map[1]), "m" (map[2]));
  3901. return (result);
  3902. }
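/*
 * A minimal portable stand-in for popcnt_pc_map_pq() above, using the
 * GCC/Clang builtin instead of inline assembly.  It counts the same
 * three 64-bit words; the xor-before-popcnt trick is unnecessary here
 * because the compiler schedules the builtin itself.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

static int
popcnt_pc_map_portable(const uint64_t *map)
{

	return (__builtin_popcountll(map[0]) +
	    __builtin_popcountll(map[1]) +
	    __builtin_popcountll(map[2]));
}

int
main(void)
{
	uint64_t map[3] = { ~0ULL, ~0ULL, 0x000000ffffffffffULL };

	/* A fully free chunk should report all 168 entries. */
	printf("%d free entries\n", popcnt_pc_map_portable(map));
	return (0);
}
#endif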
  3903. /*
  3904. * Ensure that the number of spare PV entries in the specified pmap meets or
  3905. * exceeds the given count, "needed".
  3906. *
  3907. * The given PV list lock may be released.
  3908. */
  3909. static void
  3910. reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
  3911. {
  3912. struct pch new_tail;
  3913. struct pv_chunk *pc;
  3914. vm_page_t m;
  3915. int avail, free;
  3916. bool reclaimed;
  3917. PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  3918. KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
  3919. /*
  3920. * Newly allocated PV chunks must be stored in a private list until
  3921. * the required number of PV chunks have been allocated. Otherwise,
3922. * reclaim_pv_chunk() could recycle one of these chunks. These chunks
3923. * must, however, be added to the pmap upon allocation.
  3924. */
  3925. TAILQ_INIT(&new_tail);
  3926. retry:
  3927. avail = 0;
  3928. TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
  3929. #ifndef __POPCNT__
  3930. if ((cpu_feature2 & CPUID2_POPCNT) == 0)
  3931. bit_count((bitstr_t *)pc->pc_map, 0,
  3932. sizeof(pc->pc_map) * NBBY, &free);
  3933. else
  3934. #endif
  3935. free = popcnt_pc_map_pq(pc->pc_map);
  3936. if (free == 0)
  3937. break;
  3938. avail += free;
  3939. if (avail >= needed)
  3940. break;
  3941. }
  3942. for (reclaimed = false; avail < needed; avail += _NPCPV) {
  3943. m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
  3944. VM_ALLOC_WIRED);
  3945. if (m == NULL) {
  3946. m = reclaim_pv_chunk(pmap, lockp);
  3947. if (m == NULL)
  3948. goto retry;
  3949. reclaimed = true;
  3950. }
  3951. PV_STAT(atomic_add_int(&pc_chunk_count, 1));
  3952. PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
  3953. dump_add_page(m->phys_addr);
  3954. pc = (void *)PHYS_TO_DMAP(m->phys_addr);
  3955. pc->pc_pmap = pmap;
  3956. pc->pc_map[0] = PC_FREE0;
  3957. pc->pc_map[1] = PC_FREE1;
  3958. pc->pc_map[2] = PC_FREE2;
  3959. TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
  3960. TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
  3961. PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
  3962. /*
  3963. * The reclaim might have freed a chunk from the current pmap.
  3964. * If that chunk contained available entries, we need to
  3965. * re-count the number of available entries.
  3966. */
  3967. if (reclaimed)
  3968. goto retry;
  3969. }
  3970. if (!TAILQ_EMPTY(&new_tail)) {
  3971. mtx_lock(&pv_chunks_mutex);
  3972. TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
  3973. mtx_unlock(&pv_chunks_mutex);
  3974. }
  3975. }
  3976. /*
  3977. * First find and then remove the pv entry for the specified pmap and virtual
  3978. * address from the specified pv list. Returns the pv entry if found and NULL
  3979. * otherwise. This operation can be performed on pv lists for either 4KB or
  3980. * 2MB page mappings.
  3981. */
  3982. static __inline pv_entry_t
  3983. pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
  3984. {
  3985. pv_entry_t pv;
  3986. TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
  3987. if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
  3988. TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
  3989. pvh->pv_gen++;
  3990. break;
  3991. }
  3992. }
  3993. return (pv);
  3994. }
  3995. /*
  3996. * After demotion from a 2MB page mapping to 512 4KB page mappings,
  3997. * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
  3998. * entries for each of the 4KB page mappings.
  3999. */
  4000. static void
  4001. pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
  4002. struct rwlock **lockp)
  4003. {
  4004. struct md_page *pvh;
  4005. struct pv_chunk *pc;
  4006. pv_entry_t pv;
  4007. vm_offset_t va_last;
  4008. vm_page_t m;
  4009. int bit, field;
  4010. PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  4011. KASSERT((pa & PDRMASK) == 0,
  4012. ("pmap_pv_demote_pde: pa is not 2mpage aligned"));
  4013. CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
  4014. /*
  4015. * Transfer the 2mpage's pv entry for this mapping to the first
  4016. * page's pv list. Once this transfer begins, the pv list lock
  4017. * must not be released until the last pv entry is reinstantiated.
  4018. */
  4019. pvh = pa_to_pvh(pa);
  4020. va = trunc_2mpage(va);
  4021. pv = pmap_pvh_remove(pvh, pmap, va);
  4022. KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
  4023. m = PHYS_TO_VM_PAGE(pa);
  4024. TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
  4025. m->md.pv_gen++;
  4026. /* Instantiate the remaining NPTEPG - 1 pv entries. */
  4027. PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1));
  4028. va_last = va + NBPDR - PAGE_SIZE;
  4029. for (;;) {
  4030. pc = TAILQ_FIRST(&pmap->pm_pvchunk);
  4031. KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
  4032. pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare"));
  4033. for (field = 0; field < _NPCM; field++) {
  4034. while (pc->pc_map[field]) {
  4035. bit = bsfq(pc->pc_map[field]);
  4036. pc->pc_map[field] &= ~(1ul << bit);
  4037. pv = &pc->pc_pventry[field * 64 + bit];
  4038. va += PAGE_SIZE;
  4039. pv->pv_va = va;
  4040. m++;
  4041. KASSERT((m->oflags & VPO_UNMANAGED) == 0,
  4042. ("pmap_pv_demote_pde: page %p is not managed", m));
  4043. TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
  4044. m->md.pv_gen++;
  4045. if (va == va_last)
  4046. goto out;
  4047. }
  4048. }
  4049. TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
  4050. TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
  4051. }
  4052. out:
  4053. if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
  4054. TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
  4055. TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
  4056. }
  4057. PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1));
  4058. PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1));
  4059. }
  4060. #if VM_NRESERVLEVEL > 0
  4061. /*
  4062. * After promotion from 512 4KB page mappings to a single 2MB page mapping,
  4063. * replace the many pv entries for the 4KB page mappings by a single pv entry
  4064. * for the 2MB page mapping.
  4065. */
  4066. static void
  4067. pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
  4068. struct rwlock **lockp)
  4069. {
  4070. struct md_page *pvh;
  4071. pv_entry_t pv;
  4072. vm_offset_t va_last;
  4073. vm_page_t m;
  4074. KASSERT((pa & PDRMASK) == 0,
  4075. ("pmap_pv_promote_pde: pa is not 2mpage aligned"));
  4076. CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
  4077. /*
  4078. * Transfer the first page's pv entry for this mapping to the 2mpage's
  4079. * pv list. Aside from avoiding the cost of a call to get_pv_entry(),
  4080. * a transfer avoids the possibility that get_pv_entry() calls
  4081. * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
  4082. * mappings that is being promoted.
  4083. */
  4084. m = PHYS_TO_VM_PAGE(pa);
  4085. va = trunc_2mpage(va);
  4086. pv = pmap_pvh_remove(&m->md, pmap, va);
  4087. KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
  4088. pvh = pa_to_pvh(pa);
  4089. TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
  4090. pvh->pv_gen++;
  4091. /* Free the remaining NPTEPG - 1 pv entries. */
  4092. va_last = va + NBPDR - PAGE_SIZE;
  4093. do {
  4094. m++;
  4095. va += PAGE_SIZE;
  4096. pmap_pvh_free(&m->md, pmap, va);
  4097. } while (va < va_last);
  4098. }
  4099. #endif /* VM_NRESERVLEVEL > 0 */
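/*
 * A minimal sketch of the iteration bounds shared by
 * pmap_pv_demote_pde() and pmap_pv_promote_pde() above: starting from
 * the 2 MB-aligned va, exactly NPTEPG - 1 = 511 additional 4 KB pages
 * are visited before va reaches va_last.  Values assume amd64's 4 KB
 * pages and 2 MB superpages.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

#define SK_PAGE_SIZE	4096ULL
#define SK_NBPDR	(2ULL << 20)

int
main(void)
{
	uint64_t va = 0x200000, va_last = va + SK_NBPDR - SK_PAGE_SIZE;
	int n = 0;

	do {
		va += SK_PAGE_SIZE;
		n++;
	} while (va < va_last);
	printf("visited %d trailing 4 KB pages\n", n);
	return (0);
}
#endif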
  4100. /*
  4101. * First find and then destroy the pv entry for the specified pmap and virtual
  4102. * address. This operation can be performed on pv lists for either 4KB or 2MB
  4103. * page mappings.
  4104. */
  4105. static void
  4106. pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
  4107. {
  4108. pv_entry_t pv;
  4109. pv = pmap_pvh_remove(pvh, pmap, va);
  4110. KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
  4111. free_pv_entry(pmap, pv);
  4112. }
  4113. /*
  4114. * Conditionally create the PV entry for a 4KB page mapping if the required
  4115. * memory can be allocated without resorting to reclamation.
  4116. */
  4117. static boolean_t
  4118. pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
  4119. struct rwlock **lockp)
  4120. {
  4121. pv_entry_t pv;
  4122. PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  4123. /* Pass NULL instead of the lock pointer to disable reclamation. */
  4124. if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
  4125. pv->pv_va = va;
  4126. CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
  4127. TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
  4128. m->md.pv_gen++;
  4129. return (TRUE);
  4130. } else
  4131. return (FALSE);
  4132. }
  4133. /*
  4134. * Create the PV entry for a 2MB page mapping. Always returns true unless the
  4135. * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns
  4136. * false if the PV entry cannot be allocated without resorting to reclamation.
  4137. */
  4138. static bool
  4139. pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags,
  4140. struct rwlock **lockp)
  4141. {
  4142. struct md_page *pvh;
  4143. pv_entry_t pv;
  4144. vm_paddr_t pa;
  4145. PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  4146. /* Pass NULL instead of the lock pointer to disable reclamation. */
  4147. if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ?
  4148. NULL : lockp)) == NULL)
  4149. return (false);
  4150. pv->pv_va = va;
  4151. pa = pde & PG_PS_FRAME;
  4152. CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
  4153. pvh = pa_to_pvh(pa);
  4154. TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
  4155. pvh->pv_gen++;
  4156. return (true);
  4157. }
  4158. /*
  4159. * Fills a page table page with mappings to consecutive physical pages.
  4160. */
  4161. static void
  4162. pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
  4163. {
  4164. pt_entry_t *pte;
  4165. for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
  4166. *pte = newpte;
  4167. newpte += PAGE_SIZE;
  4168. }
  4169. }
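/*
 * A minimal userland sketch of what pmap_fill_ptp() above produces:
 * 512 PTEs whose physical frames step by one 4 KB page, together
 * covering the same 2 MB range as the superpage being demoted.  The
 * starting value is an arbitrary example; the low flag bits ride along
 * unchanged because only the frame portion advances.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

#define SK_NPTEPG	512
#define SK_PAGE_SIZE	4096ULL

int
main(void)
{
	uint64_t ptp[SK_NPTEPG];
	uint64_t newpte = 0x40000003ULL;	/* example frame | flags */
	int i;

	for (i = 0; i < SK_NPTEPG; i++) {
		ptp[i] = newpte;
		newpte += SK_PAGE_SIZE;
	}
	printf("pte[0] = %#jx, pte[511] = %#jx\n", (uintmax_t)ptp[0],
	    (uintmax_t)ptp[SK_NPTEPG - 1]);
	return (0);
}
#endif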
  4170. /*
  4171. * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page
  4172. * mapping is invalidated.
  4173. */
  4174. static boolean_t
  4175. pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
  4176. {
  4177. struct rwlock *lock;
  4178. boolean_t rv;
  4179. lock = NULL;
  4180. rv = pmap_demote_pde_locked(pmap, pde, va, &lock);
  4181. if (lock != NULL)
  4182. rw_wunlock(lock);
  4183. return (rv);
  4184. }
  4185. static void
  4186. pmap_demote_pde_check(pt_entry_t *firstpte __unused, pt_entry_t newpte __unused)
  4187. {
  4188. #ifdef INVARIANTS
  4189. #ifdef DIAGNOSTIC
  4190. pt_entry_t *xpte, *ypte;
  4191. for (xpte = firstpte; xpte < firstpte + NPTEPG;
  4192. xpte++, newpte += PAGE_SIZE) {
  4193. if ((*xpte & PG_FRAME) != (newpte & PG_FRAME)) {
  4194. printf("pmap_demote_pde: xpte %zd and newpte map "
  4195. "different pages: found %#lx, expected %#lx\n",
  4196. xpte - firstpte, *xpte, newpte);
  4197. printf("page table dump\n");
  4198. for (ypte = firstpte; ypte < firstpte + NPTEPG; ypte++)
  4199. printf("%zd %#lx\n", ypte - firstpte, *ypte);
  4200. panic("firstpte");
  4201. }
  4202. }
  4203. #else
  4204. KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
  4205. ("pmap_demote_pde: firstpte and newpte map different physical"
  4206. " addresses"));
  4207. #endif
  4208. #endif
  4209. }
  4210. static void
  4211. pmap_demote_pde_abort(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
  4212. pd_entry_t oldpde, struct rwlock **lockp)
  4213. {
  4214. struct spglist free;
  4215. vm_offset_t sva;
  4216. SLIST_INIT(&free);
  4217. sva = trunc_2mpage(va);
  4218. pmap_remove_pde(pmap, pde, sva, &free, lockp);
  4219. if ((oldpde & pmap_global_bit(pmap)) == 0)
  4220. pmap_invalidate_pde_page(pmap, sva, oldpde);
  4221. vm_page_free_pages_toq(&free, true);
  4222. CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx in pmap %p",
  4223. va, pmap);
  4224. }
  4225. static boolean_t
  4226. pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
  4227. struct rwlock **lockp)
  4228. {
  4229. pd_entry_t newpde, oldpde;
  4230. pt_entry_t *firstpte, newpte;
  4231. pt_entry_t PG_A, PG_G, PG_M, PG_PKU_MASK, PG_RW, PG_V;
  4232. vm_paddr_t mptepa;
  4233. vm_page_t mpte;
  4234. int PG_PTE_CACHE;
  4235. bool in_kernel;
  4236. PG_A = pmap_accessed_bit(pmap);
  4237. PG_G = pmap_global_bit(pmap);
  4238. PG_M = pmap_modified_bit(pmap);
  4239. PG_RW = pmap_rw_bit(pmap);
  4240. PG_V = pmap_valid_bit(pmap);
  4241. PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
  4242. PG_PKU_MASK = pmap_pku_mask_bit(pmap);
  4243. PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  4244. in_kernel = va >= VM_MAXUSER_ADDRESS;
  4245. oldpde = *pde;
  4246. KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
  4247. ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
  4248. /*
  4249. * Invalidate the 2MB page mapping and return "failure" if the
  4250. * mapping was never accessed.
  4251. */
  4252. if ((oldpde & PG_A) == 0) {
  4253. KASSERT((oldpde & PG_W) == 0,
  4254. ("pmap_demote_pde: a wired mapping is missing PG_A"));
  4255. pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp);
  4256. return (FALSE);
  4257. }
  4258. mpte = pmap_remove_pt_page(pmap, va);
  4259. if (mpte == NULL) {
  4260. KASSERT((oldpde & PG_W) == 0,
  4261. ("pmap_demote_pde: page table page for a wired mapping"
  4262. " is missing"));
  4263. /*
  4264. * If the page table page is missing and the mapping
  4265. * is for a kernel address, the mapping must belong to
  4266. * the direct map. Page table pages are preallocated
  4267. * for every other part of the kernel address space,
  4268. * so the direct map region is the only part of the
  4269. * kernel address space that must be handled here.
  4270. */
  4271. KASSERT(!in_kernel || (va >= DMAP_MIN_ADDRESS &&
  4272. va < DMAP_MAX_ADDRESS),
  4273. ("pmap_demote_pde: No saved mpte for va %#lx", va));
  4274. /*
  4275. * If the 2MB page mapping belongs to the direct map
  4276. * region of the kernel's address space, then the page
  4277. * allocation request specifies the highest possible
  4278. * priority (VM_ALLOC_INTERRUPT). Otherwise, the
  4279. * priority is normal.
  4280. */
  4281. mpte = vm_page_alloc(NULL, pmap_pde_pindex(va),
  4282. (in_kernel ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) |
  4283. VM_ALLOC_NOOBJ | VM_ALLOC_WIRED);
  4284. /*
  4285. * If the allocation of the new page table page fails,
  4286. * invalidate the 2MB page mapping and return "failure".
  4287. */
  4288. if (mpte == NULL) {
  4289. pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp);
  4290. return (FALSE);
  4291. }
  4292. if (!in_kernel) {
  4293. mpte->wire_count = NPTEPG;
  4294. pmap_resident_count_inc(pmap, 1);
  4295. }
  4296. }
  4297. mptepa = VM_PAGE_TO_PHYS(mpte);
  4298. firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
  4299. newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
  4300. KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
  4301. ("pmap_demote_pde: oldpde is missing PG_M"));
  4302. newpte = oldpde & ~PG_PS;
  4303. newpte = pmap_swap_pat(pmap, newpte);
  4304. /*
  4305. * If the page table page is not leftover from an earlier promotion,
  4306. * initialize it.
  4307. */
  4308. if (mpte->valid == 0)
  4309. pmap_fill_ptp(firstpte, newpte);
  4310. pmap_demote_pde_check(firstpte, newpte);
  4311. /*
  4312. * If the mapping has changed attributes, update the page table
  4313. * entries.
  4314. */
  4315. if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
  4316. pmap_fill_ptp(firstpte, newpte);
  4317. /*
  4318. * The spare PV entries must be reserved prior to demoting the
  4319. * mapping, that is, prior to changing the PDE. Otherwise, the state
  4320. * of the PDE and the PV lists will be inconsistent, which can result
  4321. * in reclaim_pv_chunk() attempting to remove a PV entry from the
  4322. * wrong PV list and pmap_pv_demote_pde() failing to find the expected
  4323. * PV entry for the 2MB page mapping that is being demoted.
  4324. */
  4325. if ((oldpde & PG_MANAGED) != 0)
  4326. reserve_pv_entries(pmap, NPTEPG - 1, lockp);
  4327. /*
  4328. * Demote the mapping. This pmap is locked. The old PDE has
  4329. * PG_A set. If the old PDE has PG_RW set, it also has PG_M
  4330. * set. Thus, there is no danger of a race with another
  4331. * processor changing the setting of PG_A and/or PG_M between
  4332. * the read above and the store below.
  4333. */
  4334. if (workaround_erratum383)
  4335. pmap_update_pde(pmap, va, pde, newpde);
  4336. else
  4337. pde_store(pde, newpde);
  4338. /*
  4339. * Invalidate a stale recursive mapping of the page table page.
  4340. */
  4341. if (in_kernel)
  4342. pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
  4343. /*
  4344. * Demote the PV entry.
  4345. */
  4346. if ((oldpde & PG_MANAGED) != 0)
  4347. pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp);
  4348. atomic_add_long(&pmap_pde_demotions, 1);
  4349. CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx in pmap %p",
  4350. va, pmap);
  4351. return (TRUE);
  4352. }
  4353. /*
  4354. * pmap_remove_kernel_pde: Remove a kernel superpage mapping.
  4355. */
  4356. static void
  4357. pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
  4358. {
  4359. pd_entry_t newpde;
  4360. vm_paddr_t mptepa;
  4361. vm_page_t mpte;
  4362. KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
  4363. PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  4364. mpte = pmap_remove_pt_page(pmap, va);
  4365. if (mpte == NULL)
  4366. panic("pmap_remove_kernel_pde: Missing pt page.");
  4367. mptepa = VM_PAGE_TO_PHYS(mpte);
  4368. newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V;
  4369. /*
  4370. * If this page table page was unmapped by a promotion, then it
  4371. * contains valid mappings. Zero it to invalidate those mappings.
  4372. */
  4373. if (mpte->valid != 0)
  4374. pagezero((void *)PHYS_TO_DMAP(mptepa));
  4375. /*
  4376. * Demote the mapping.
  4377. */
  4378. if (workaround_erratum383)
  4379. pmap_update_pde(pmap, va, pde, newpde);
  4380. else
  4381. pde_store(pde, newpde);
  4382. /*
  4383. * Invalidate a stale recursive mapping of the page table page.
  4384. */
  4385. pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
  4386. }
  4387. /*
  4388. * pmap_remove_pde: do the things to unmap a superpage in a process
  4389. */
  4390. static int
  4391. pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
  4392. struct spglist *free, struct rwlock **lockp)
  4393. {
  4394. struct md_page *pvh;
  4395. pd_entry_t oldpde;
  4396. vm_offset_t eva, va;
  4397. vm_page_t m, mpte;
  4398. pt_entry_t PG_G, PG_A, PG_M, PG_RW;
  4399. PG_G = pmap_global_bit(pmap);
  4400. PG_A = pmap_accessed_bit(pmap);
  4401. PG_M = pmap_modified_bit(pmap);
  4402. PG_RW = pmap_rw_bit(pmap);
  4403. PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  4404. KASSERT((sva & PDRMASK) == 0,
  4405. ("pmap_remove_pde: sva is not 2mpage aligned"));
  4406. oldpde = pte_load_clear(pdq);
  4407. if (oldpde & PG_W)
  4408. pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
  4409. if ((oldpde & PG_G) != 0)
  4410. pmap_invalidate_pde_page(kernel_pmap, sva, oldpde);
  4411. pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
  4412. if (oldpde & PG_MANAGED) {
  4413. CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME);
  4414. pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
  4415. pmap_pvh_free(pvh, pmap, sva);
  4416. eva = sva + NBPDR;
  4417. for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
  4418. va < eva; va += PAGE_SIZE, m++) {
  4419. if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
  4420. vm_page_dirty(m);
  4421. if (oldpde & PG_A)
  4422. vm_page_aflag_set(m, PGA_REFERENCED);
  4423. if (TAILQ_EMPTY(&m->md.pv_list) &&
  4424. TAILQ_EMPTY(&pvh->pv_list))
  4425. vm_page_aflag_clear(m, PGA_WRITEABLE);
  4426. pmap_delayed_invl_page(m);
  4427. }
  4428. }
  4429. if (pmap == kernel_pmap) {
  4430. pmap_remove_kernel_pde(pmap, pdq, sva);
  4431. } else {
  4432. mpte = pmap_remove_pt_page(pmap, sva);
  4433. if (mpte != NULL) {
  4434. KASSERT(mpte->valid == VM_PAGE_BITS_ALL,
  4435. ("pmap_remove_pde: pte page not promoted"));
  4436. pmap_resident_count_dec(pmap, 1);
  4437. KASSERT(mpte->wire_count == NPTEPG,
  4438. ("pmap_remove_pde: pte page wire count error"));
  4439. mpte->wire_count = 0;
  4440. pmap_add_delayed_free_list(mpte, free, FALSE);
  4441. }
  4442. }
  4443. return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free));
  4444. }
  4445. /*
  4446. * pmap_remove_pte: do the things to unmap a page in a process
  4447. */
  4448. static int
  4449. pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
  4450. pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp)
  4451. {
  4452. struct md_page *pvh;
  4453. pt_entry_t oldpte, PG_A, PG_M, PG_RW;
  4454. vm_page_t m;
  4455. PG_A = pmap_accessed_bit(pmap);
  4456. PG_M = pmap_modified_bit(pmap);
  4457. PG_RW = pmap_rw_bit(pmap);
  4458. PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  4459. oldpte = pte_load_clear(ptq);
  4460. if (oldpte & PG_W)
  4461. pmap->pm_stats.wired_count -= 1;
  4462. pmap_resident_count_dec(pmap, 1);
  4463. if (oldpte & PG_MANAGED) {
  4464. m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
  4465. if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
  4466. vm_page_dirty(m);
  4467. if (oldpte & PG_A)
  4468. vm_page_aflag_set(m, PGA_REFERENCED);
  4469. CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
  4470. pmap_pvh_free(&m->md, pmap, va);
  4471. if (TAILQ_EMPTY(&m->md.pv_list) &&
  4472. (m->flags & PG_FICTITIOUS) == 0) {
  4473. pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
  4474. if (TAILQ_EMPTY(&pvh->pv_list))
  4475. vm_page_aflag_clear(m, PGA_WRITEABLE);
  4476. }
  4477. pmap_delayed_invl_page(m);
  4478. }
  4479. return (pmap_unuse_pt(pmap, va, ptepde, free));
  4480. }
  4481. /*
  4482. * Remove a single page from a process address space
  4483. */
  4484. static void
  4485. pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
  4486. struct spglist *free)
  4487. {
  4488. struct rwlock *lock;
  4489. pt_entry_t *pte, PG_V;
  4490. PG_V = pmap_valid_bit(pmap);
  4491. PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  4492. if ((*pde & PG_V) == 0)
  4493. return;
  4494. pte = pmap_pde_to_pte(pde, va);
  4495. if ((*pte & PG_V) == 0)
  4496. return;
  4497. lock = NULL;
  4498. pmap_remove_pte(pmap, pte, va, *pde, free, &lock);
  4499. if (lock != NULL)
  4500. rw_wunlock(lock);
  4501. pmap_invalidate_page(pmap, va);
  4502. }
  4503. /*
  4504. * Removes the specified range of addresses from the page table page.
  4505. */
  4506. static bool
  4507. pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
  4508. pd_entry_t *pde, struct spglist *free, struct rwlock **lockp)
  4509. {
  4510. pt_entry_t PG_G, *pte;
  4511. vm_offset_t va;
  4512. bool anyvalid;
  4513. PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  4514. PG_G = pmap_global_bit(pmap);
  4515. anyvalid = false;
  4516. va = eva;
  4517. for (pte = pmap_pde_to_pte(pde, sva); sva != eva; pte++,
  4518. sva += PAGE_SIZE) {
  4519. if (*pte == 0) {
  4520. if (va != eva) {
  4521. pmap_invalidate_range(pmap, va, sva);
  4522. va = eva;
  4523. }
  4524. continue;
  4525. }
  4526. if ((*pte & PG_G) == 0)
  4527. anyvalid = true;
  4528. else if (va == eva)
  4529. va = sva;
  4530. if (pmap_remove_pte(pmap, pte, sva, *pde, free, lockp)) {
  4531. sva += PAGE_SIZE;
  4532. break;
  4533. }
  4534. }
  4535. if (va != eva)
  4536. pmap_invalidate_range(pmap, va, sva);
  4537. return (anyvalid);
  4538. }
  4539. /*
  4540. * Remove the given range of addresses from the specified map.
  4541. *
  4542. * It is assumed that the start and end are properly
  4543. * rounded to the page size.
  4544. */
  4545. void
  4546. pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
  4547. {
  4548. struct rwlock *lock;
  4549. vm_offset_t va_next;
  4550. pml4_entry_t *pml4e;
  4551. pdp_entry_t *pdpe;
  4552. pd_entry_t ptpaddr, *pde;
  4553. pt_entry_t PG_G, PG_V;
  4554. struct spglist free;
  4555. int anyvalid;
  4556. PG_G = pmap_global_bit(pmap);
  4557. PG_V = pmap_valid_bit(pmap);
  4558. /*
  4559. * Perform an unsynchronized read. This is, however, safe.
  4560. */
  4561. if (pmap->pm_stats.resident_count == 0)
  4562. return;
  4563. anyvalid = 0;
  4564. SLIST_INIT(&free);
  4565. pmap_delayed_invl_start();
  4566. PMAP_LOCK(pmap);
  4567. pmap_pkru_on_remove(pmap, sva, eva);
  4568. /*
4569. * Special handling for removing a single page: a very
4570. * common operation for which it is easy to short circuit
4571. * some code.
  4572. */
  4573. if (sva + PAGE_SIZE == eva) {
  4574. pde = pmap_pde(pmap, sva);
  4575. if (pde && (*pde & PG_PS) == 0) {
  4576. pmap_remove_page(pmap, sva, pde, &free);
  4577. goto out;
  4578. }
  4579. }
  4580. lock = NULL;
  4581. for (; sva < eva; sva = va_next) {
  4582. if (pmap->pm_stats.resident_count == 0)
  4583. break;
  4584. pml4e = pmap_pml4e(pmap, sva);
  4585. if ((*pml4e & PG_V) == 0) {
  4586. va_next = (sva + NBPML4) & ~PML4MASK;
  4587. if (va_next < sva)
  4588. va_next = eva;
  4589. continue;
  4590. }
  4591. pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
  4592. if ((*pdpe & PG_V) == 0) {
  4593. va_next = (sva + NBPDP) & ~PDPMASK;
  4594. if (va_next < sva)
  4595. va_next = eva;
  4596. continue;
  4597. }
  4598. /*
  4599. * Calculate the virtual address of the next page-table-page boundary.
  4600. */
  4601. va_next = (sva + NBPDR) & ~PDRMASK;
  4602. if (va_next < sva)
  4603. va_next = eva;
  4604. pde = pmap_pdpe_to_pde(pdpe, sva);
  4605. ptpaddr = *pde;
  4606. /*
  4607. * Weed out invalid mappings.
  4608. */
  4609. if (ptpaddr == 0)
  4610. continue;
  4611. /*
  4612. * Check for large page.
  4613. */
  4614. if ((ptpaddr & PG_PS) != 0) {
  4615. /*
  4616. * Are we removing the entire large page? If not,
  4617. * demote the mapping and fall through.
  4618. */
  4619. if (sva + NBPDR == va_next && eva >= va_next) {
  4620. /*
  4621. * The TLB entry for a PG_G mapping is
  4622. * invalidated by pmap_remove_pde().
  4623. */
  4624. if ((ptpaddr & PG_G) == 0)
  4625. anyvalid = 1;
  4626. pmap_remove_pde(pmap, pde, sva, &free, &lock);
  4627. continue;
  4628. } else if (!pmap_demote_pde_locked(pmap, pde, sva,
  4629. &lock)) {
  4630. /* The large page mapping was destroyed. */
  4631. continue;
  4632. } else
  4633. ptpaddr = *pde;
  4634. }
  4635. /*
  4636. * Limit our scan to either the end of the va represented
  4637. * by the current page table page, or to the end of the
  4638. * range being removed.
  4639. */
  4640. if (va_next > eva)
  4641. va_next = eva;
  4642. if (pmap_remove_ptes(pmap, sva, va_next, pde, &free, &lock))
  4643. anyvalid = 1;
  4644. }
  4645. if (lock != NULL)
  4646. rw_wunlock(lock);
  4647. out:
  4648. if (anyvalid)
  4649. pmap_invalidate_all(pmap);
  4650. PMAP_UNLOCK(pmap);
  4651. pmap_delayed_invl_finish();
  4652. vm_page_free_pages_toq(&free, true);
  4653. }
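/*
 * Illustrative sketch (not compiled): a typical consumer of pmap_remove()
 * above is machine-independent VM code tearing down a map entry.  The
 * "map" and "entry" variables below are hypothetical stand-ins for a
 * vm_map and one of its entries; the only requirement pmap_remove()
 * places on its caller is that the bounds be page aligned.
 */
#if 0
static void
example_unmap_entry(vm_map_t map, vm_map_entry_t entry)
{

	/* Both bounds are already page aligned within a vm_map. */
	pmap_remove(vm_map_pmap(map), entry->start, entry->end);
}
#endif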
  4654. /*
  4655. * Routine: pmap_remove_all
  4656. * Function:
  4657. * Removes this physical page from
  4658. * all physical maps in which it resides.
  4659. * Reflects back modify bits to the pager.
  4660. *
  4661. * Notes:
  4662. * Original versions of this routine were very
  4663. * inefficient because they iteratively called
  4664. * pmap_remove (slow...)
  4665. */
  4666. void
  4667. pmap_remove_all(vm_page_t m)
  4668. {
  4669. struct md_page *pvh;
  4670. pv_entry_t pv;
  4671. pmap_t pmap;
  4672. struct rwlock *lock;
  4673. pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW;
  4674. pd_entry_t *pde;
  4675. vm_offset_t va;
  4676. struct spglist free;
  4677. int pvh_gen, md_gen;
  4678. KASSERT((m->oflags & VPO_UNMANAGED) == 0,
  4679. ("pmap_remove_all: page %p is not managed", m));
  4680. SLIST_INIT(&free);
  4681. lock = VM_PAGE_TO_PV_LIST_LOCK(m);
  4682. pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
  4683. pa_to_pvh(VM_PAGE_TO_PHYS(m));
  4684. retry:
  4685. rw_wlock(lock);
  4686. while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
  4687. pmap = PV_PMAP(pv);
  4688. if (!PMAP_TRYLOCK(pmap)) {
  4689. pvh_gen = pvh->pv_gen;
  4690. rw_wunlock(lock);
  4691. PMAP_LOCK(pmap);
  4692. rw_wlock(lock);
  4693. if (pvh_gen != pvh->pv_gen) {
  4694. rw_wunlock(lock);
  4695. PMAP_UNLOCK(pmap);
  4696. goto retry;
  4697. }
  4698. }
  4699. va = pv->pv_va;
  4700. pde = pmap_pde(pmap, va);
  4701. (void)pmap_demote_pde_locked(pmap, pde, va, &lock);
  4702. PMAP_UNLOCK(pmap);
  4703. }
  4704. while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
  4705. pmap = PV_PMAP(pv);
  4706. if (!PMAP_TRYLOCK(pmap)) {
  4707. pvh_gen = pvh->pv_gen;
  4708. md_gen = m->md.pv_gen;
  4709. rw_wunlock(lock);
  4710. PMAP_LOCK(pmap);
  4711. rw_wlock(lock);
  4712. if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
  4713. rw_wunlock(lock);
  4714. PMAP_UNLOCK(pmap);
  4715. goto retry;
  4716. }
  4717. }
  4718. PG_A = pmap_accessed_bit(pmap);
  4719. PG_M = pmap_modified_bit(pmap);
  4720. PG_RW = pmap_rw_bit(pmap);
  4721. pmap_resident_count_dec(pmap, 1);
  4722. pde = pmap_pde(pmap, pv->pv_va);
  4723. KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
  4724. " a 2mpage in page %p's pv list", m));
  4725. pte = pmap_pde_to_pte(pde, pv->pv_va);
  4726. tpte = pte_load_clear(pte);
  4727. if (tpte & PG_W)
  4728. pmap->pm_stats.wired_count--;
  4729. if (tpte & PG_A)
  4730. vm_page_aflag_set(m, PGA_REFERENCED);
  4731. /*
  4732. * Update the vm_page_t clean and reference bits.
  4733. */
  4734. if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
  4735. vm_page_dirty(m);
  4736. pmap_unuse_pt(pmap, pv->pv_va, *pde, &free);
  4737. pmap_invalidate_page(pmap, pv->pv_va);
  4738. TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
  4739. m->md.pv_gen++;
  4740. free_pv_entry(pmap, pv);
  4741. PMAP_UNLOCK(pmap);
  4742. }
  4743. vm_page_aflag_clear(m, PGA_WRITEABLE);
  4744. rw_wunlock(lock);
  4745. pmap_delayed_invl_wait(m);
  4746. vm_page_free_pages_toq(&free, true);
  4747. }
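/*
 * Illustrative sketch (not compiled): pmap_remove_all() is typically used
 * when the VM system wants a managed page to have no remaining mappings,
 * e.g. before reusing it.  Because modify bits are reflected back into the
 * vm_page before the mappings are destroyed, a caller can trust the dirty
 * state afterwards.  The helper below is hypothetical.
 */
#if 0
static bool
example_page_is_clean_after_unmap(vm_page_t m)
{

	pmap_remove_all(m);
	return (m->dirty == 0);
}
#endif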
  4748. /*
  4749. * pmap_protect_pde: apply the requested protection to a 2MB page mapping in a process
  4750. */
  4751. static boolean_t
  4752. pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
  4753. {
  4754. pd_entry_t newpde, oldpde;
  4755. vm_page_t m, mt;
  4756. boolean_t anychanged;
  4757. pt_entry_t PG_G, PG_M, PG_RW;
  4758. PG_G = pmap_global_bit(pmap);
  4759. PG_M = pmap_modified_bit(pmap);
  4760. PG_RW = pmap_rw_bit(pmap);
  4761. PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  4762. KASSERT((sva & PDRMASK) == 0,
  4763. ("pmap_protect_pde: sva is not 2mpage aligned"));
  4764. anychanged = FALSE;
  4765. retry:
  4766. oldpde = newpde = *pde;
  4767. if ((prot & VM_PROT_WRITE) == 0) {
  4768. if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) ==
  4769. (PG_MANAGED | PG_M | PG_RW)) {
  4770. m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
  4771. for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
  4772. vm_page_dirty(mt);
  4773. }
  4774. newpde &= ~(PG_RW | PG_M);
  4775. }
  4776. if ((prot & VM_PROT_EXECUTE) == 0)
  4777. newpde |= pg_nx;
  4778. if (newpde != oldpde) {
  4779. /*
  4780. * As an optimization to future operations on this PDE, clear
  4781. * PG_PROMOTED. The impending invalidation will remove any
  4782. * lingering 4KB page mappings from the TLB.
  4783. */
  4784. if (!atomic_cmpset_long(pde, oldpde, newpde & ~PG_PROMOTED))
  4785. goto retry;
  4786. if ((oldpde & PG_G) != 0)
  4787. pmap_invalidate_pde_page(kernel_pmap, sva, oldpde);
  4788. else
  4789. anychanged = TRUE;
  4790. }
  4791. return (anychanged);
  4792. }
  4793. /*
  4794. * Set the physical protection on the
  4795. * specified range of this map as requested.
  4796. */
  4797. void
  4798. pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
  4799. {
  4800. vm_offset_t va_next;
  4801. pml4_entry_t *pml4e;
  4802. pdp_entry_t *pdpe;
  4803. pd_entry_t ptpaddr, *pde;
  4804. pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V;
  4805. boolean_t anychanged;
  4806. KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
  4807. if (prot == VM_PROT_NONE) {
  4808. pmap_remove(pmap, sva, eva);
  4809. return;
  4810. }
  4811. if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
  4812. (VM_PROT_WRITE|VM_PROT_EXECUTE))
  4813. return;
  4814. PG_G = pmap_global_bit(pmap);
  4815. PG_M = pmap_modified_bit(pmap);
  4816. PG_V = pmap_valid_bit(pmap);
  4817. PG_RW = pmap_rw_bit(pmap);
  4818. anychanged = FALSE;
  4819. /*
  4820. * Although this function delays and batches the invalidation
  4821. * of stale TLB entries, it does not need to call
  4822. * pmap_delayed_invl_start() and
  4823. * pmap_delayed_invl_finish(), because it does not
  4824. * ordinarily destroy mappings. Stale TLB entries from
  4825. * protection-only changes need only be invalidated before the
  4826. * pmap lock is released, because protection-only changes do
  4827. * not destroy PV entries. Even operations that iterate over
  4828. * a physical page's PV list of mappings, like
  4829. * pmap_remove_write(), acquire the pmap lock for each
  4830. * mapping. Consequently, for protection-only changes, the
  4831. * pmap lock suffices to synchronize both page table and TLB
  4832. * updates.
  4833. *
  4834. * This function only destroys a mapping if pmap_demote_pde()
  4835. * fails. In that case, stale TLB entries are immediately
  4836. * invalidated.
  4837. */
  4838. PMAP_LOCK(pmap);
  4839. for (; sva < eva; sva = va_next) {
  4840. pml4e = pmap_pml4e(pmap, sva);
  4841. if ((*pml4e & PG_V) == 0) {
  4842. va_next = (sva + NBPML4) & ~PML4MASK;
  4843. if (va_next < sva)
  4844. va_next = eva;
  4845. continue;
  4846. }
  4847. pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
  4848. if ((*pdpe & PG_V) == 0) {
  4849. va_next = (sva + NBPDP) & ~PDPMASK;
  4850. if (va_next < sva)
  4851. va_next = eva;
  4852. continue;
  4853. }
  4854. va_next = (sva + NBPDR) & ~PDRMASK;
  4855. if (va_next < sva)
  4856. va_next = eva;
  4857. pde = pmap_pdpe_to_pde(pdpe, sva);
  4858. ptpaddr = *pde;
  4859. /*
  4860. * Weed out invalid mappings.
  4861. */
  4862. if (ptpaddr == 0)
  4863. continue;
  4864. /*
  4865. * Check for large page.
  4866. */
  4867. if ((ptpaddr & PG_PS) != 0) {
  4868. /*
  4869. * Are we protecting the entire large page? If not,
  4870. * demote the mapping and fall through.
  4871. */
  4872. if (sva + NBPDR == va_next && eva >= va_next) {
  4873. /*
  4874. * The TLB entry for a PG_G mapping is
  4875. * invalidated by pmap_protect_pde().
  4876. */
  4877. if (pmap_protect_pde(pmap, pde, sva, prot))
  4878. anychanged = TRUE;
  4879. continue;
  4880. } else if (!pmap_demote_pde(pmap, pde, sva)) {
  4881. /*
  4882. * The large page mapping was destroyed.
  4883. */
  4884. continue;
  4885. }
  4886. }
  4887. if (va_next > eva)
  4888. va_next = eva;
  4889. for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
  4890. sva += PAGE_SIZE) {
  4891. pt_entry_t obits, pbits;
  4892. vm_page_t m;
  4893. retry:
  4894. obits = pbits = *pte;
  4895. if ((pbits & PG_V) == 0)
  4896. continue;
  4897. if ((prot & VM_PROT_WRITE) == 0) {
  4898. if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
  4899. (PG_MANAGED | PG_M | PG_RW)) {
  4900. m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
  4901. vm_page_dirty(m);
  4902. }
  4903. pbits &= ~(PG_RW | PG_M);
  4904. }
  4905. if ((prot & VM_PROT_EXECUTE) == 0)
  4906. pbits |= pg_nx;
  4907. if (pbits != obits) {
  4908. if (!atomic_cmpset_long(pte, obits, pbits))
  4909. goto retry;
  4910. if (obits & PG_G)
  4911. pmap_invalidate_page(pmap, sva);
  4912. else
  4913. anychanged = TRUE;
  4914. }
  4915. }
  4916. }
  4917. if (anychanged)
  4918. pmap_invalidate_all(pmap);
  4919. PMAP_UNLOCK(pmap);
  4920. }
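/*
 * Illustrative sketch (not compiled): write protecting a page-aligned range
 * with pmap_protect() above.  VM_PROT_NONE is handled by removal, so a
 * caller that only wants to revoke write (and keep read) permission passes
 * the remaining rights, here VM_PROT_READ.  "pmap", "sva" and "eva" are
 * assumed to come from the caller and to be page aligned.
 */
#if 0
static void
example_make_readonly(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{

	pmap_protect(pmap, sva, eva, VM_PROT_READ);
}
#endif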
  4921. #if VM_NRESERVLEVEL > 0
  4922. static bool
  4923. pmap_pde_ept_executable(pmap_t pmap, pd_entry_t pde)
  4924. {
  4925. if (pmap->pm_type != PT_EPT)
  4926. return (false);
  4927. return ((pde & EPT_PG_EXECUTE) != 0);
  4928. }
  4929. /*
  4930. * Tries to promote the 512, contiguous 4KB page mappings that are within a
  4931. * single page table page (PTP) to a single 2MB page mapping. For promotion
  4932. * to occur, two conditions must be met: (1) the 4KB page mappings must map
  4933. * aligned, contiguous physical memory and (2) the 4KB page mappings must have
  4934. * identical characteristics.
  4935. */
  4936. static void
  4937. pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
  4938. struct rwlock **lockp)
  4939. {
  4940. pd_entry_t newpde;
  4941. pt_entry_t *firstpte, oldpte, pa, *pte;
  4942. pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V, PG_PKU_MASK;
  4943. vm_page_t mpte;
  4944. int PG_PTE_CACHE;
  4945. PG_A = pmap_accessed_bit(pmap);
  4946. PG_G = pmap_global_bit(pmap);
  4947. PG_M = pmap_modified_bit(pmap);
  4948. PG_V = pmap_valid_bit(pmap);
  4949. PG_RW = pmap_rw_bit(pmap);
  4950. PG_PKU_MASK = pmap_pku_mask_bit(pmap);
  4951. PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
  4952. PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  4953. /*
  4954. * Examine the first PTE in the specified PTP. Abort if this PTE is
  4955. * either invalid, unused, or does not map the first 4KB physical page
  4956. * within a 2MB page.
  4957. */
  4958. firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
  4959. setpde:
  4960. newpde = *firstpte;
  4961. if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V) ||
  4962. !pmap_allow_2m_x_page(pmap, pmap_pde_ept_executable(pmap,
  4963. newpde))) {
  4964. atomic_add_long(&pmap_pde_p_failures, 1);
  4965. CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
  4966. " in pmap %p", va, pmap);
  4967. return;
  4968. }
  4969. if ((newpde & (PG_M | PG_RW)) == PG_RW) {
  4970. /*
  4971. * When PG_M is already clear, PG_RW can be cleared without
  4972. * a TLB invalidation.
  4973. */
  4974. if (!atomic_cmpset_long(firstpte, newpde, newpde & ~PG_RW))
  4975. goto setpde;
  4976. newpde &= ~PG_RW;
  4977. }
  4978. /*
  4979. * Examine each of the other PTEs in the specified PTP. Abort if this
  4980. * PTE maps an unexpected 4KB physical page or does not have identical
  4981. * characteristics to the first PTE.
  4982. */
  4983. pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
  4984. for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
  4985. setpte:
  4986. oldpte = *pte;
  4987. if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
  4988. atomic_add_long(&pmap_pde_p_failures, 1);
  4989. CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
  4990. " in pmap %p", va, pmap);
  4991. return;
  4992. }
  4993. if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
  4994. /*
  4995. * When PG_M is already clear, PG_RW can be cleared
  4996. * without a TLB invalidation.
  4997. */
  4998. if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW))
  4999. goto setpte;
  5000. oldpte &= ~PG_RW;
  5001. CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx"
  5002. " in pmap %p", (oldpte & PG_FRAME & PDRMASK) |
  5003. (va & ~PDRMASK), pmap);
  5004. }
  5005. if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
  5006. atomic_add_long(&pmap_pde_p_failures, 1);
  5007. CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
  5008. " in pmap %p", va, pmap);
  5009. return;
  5010. }
  5011. pa -= PAGE_SIZE;
  5012. }
  5013. /*
  5014. * Save the page table page in its current state until the PDE
  5015. * mapping the superpage is demoted by pmap_demote_pde() or
  5016. * destroyed by pmap_remove_pde().
  5017. */
  5018. mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
  5019. KASSERT(mpte >= vm_page_array &&
  5020. mpte < &vm_page_array[vm_page_array_size],
  5021. ("pmap_promote_pde: page table page is out of range"));
  5022. KASSERT(mpte->pindex == pmap_pde_pindex(va),
  5023. ("pmap_promote_pde: page table page's pindex is wrong"));
  5024. if (pmap_insert_pt_page(pmap, mpte, true)) {
  5025. atomic_add_long(&pmap_pde_p_failures, 1);
  5026. CTR2(KTR_PMAP,
  5027. "pmap_promote_pde: failure for va %#lx in pmap %p", va,
  5028. pmap);
  5029. return;
  5030. }
  5031. /*
  5032. * Promote the pv entries.
  5033. */
  5034. if ((newpde & PG_MANAGED) != 0)
  5035. pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp);
  5036. /*
  5037. * Propagate the PAT index to its proper position.
  5038. */
  5039. newpde = pmap_swap_pat(pmap, newpde);
  5040. /*
  5041. * Map the superpage.
  5042. */
  5043. if (workaround_erratum383)
  5044. pmap_update_pde(pmap, va, pde, PG_PS | newpde);
  5045. else
  5046. pde_store(pde, PG_PROMOTED | PG_PS | newpde);
  5047. atomic_add_long(&pmap_pde_promotions, 1);
  5048. CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx"
  5049. " in pmap %p", va, pmap);
  5050. }
  5051. #endif /* VM_NRESERVLEVEL > 0 */
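/*
 * Illustrative sketch (not compiled): the two promotion conditions checked
 * by pmap_promote_pde() above, restated as a simple predicate.  This sketch
 * ignores the concurrent PG_RW/PG_M cleaning, the failure counters, and the
 * EPT executability check that the real code performs; it only shows the
 * shape of the test: the first PTE must map a 2MB-aligned, accessed, valid
 * frame, and every following PTE must map the next 4KB frame with identical
 * attributes (the PG_PTE_PROMOTE bits).  "pg_a" and "pg_v" are passed in
 * because their values depend on the pmap type.
 */
#if 0
static bool
example_ptp_is_promotable(pt_entry_t *firstpte, pt_entry_t pg_a,
    pt_entry_t pg_v)
{
	pt_entry_t expected;
	int i;

	/* The first PTE must be valid, accessed, and 2MB aligned. */
	if ((firstpte[0] & ((PG_FRAME & PDRMASK) | pg_a | pg_v)) !=
	    (pg_a | pg_v))
		return (false);
	expected = firstpte[0];
	for (i = 1; i < NPTEPG; i++) {
		expected += PAGE_SIZE;	/* the next 4KB physical frame */
		if ((firstpte[i] & (PG_FRAME | PG_PTE_PROMOTE)) !=
		    (expected & (PG_FRAME | PG_PTE_PROMOTE)))
			return (false);
	}
	return (true);
}
#endif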
  5052. /*
  5053. * Insert the given physical page (m) at
  5054. * the specified virtual address (va) in the
  5055. * target physical map with the protection requested.
  5056. *
  5057. * If specified, the page will be wired down, meaning
  5058. * that the related pte cannot be reclaimed.
  5059. *
  5060. * NB: This is the only routine which MAY NOT lazy-evaluate
  5061. * or lose information. That is, this routine must actually
  5062. * insert this page into the given map NOW.
  5063. *
  5064. * When destroying both a page table and PV entry, this function
  5065. * performs the TLB invalidation before releasing the PV list
  5066. * lock, so we do not need pmap_delayed_invl_page() calls here.
  5067. */
  5068. int
  5069. pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
  5070. u_int flags, int8_t psind)
  5071. {
  5072. struct rwlock *lock;
  5073. pd_entry_t *pde;
  5074. pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V;
  5075. pt_entry_t newpte, origpte;
  5076. pv_entry_t pv;
  5077. vm_paddr_t opa, pa;
  5078. vm_page_t mpte, om;
  5079. int rv;
  5080. boolean_t nosleep;
  5081. PG_A = pmap_accessed_bit(pmap);
  5082. PG_G = pmap_global_bit(pmap);
  5083. PG_M = pmap_modified_bit(pmap);
  5084. PG_V = pmap_valid_bit(pmap);
  5085. PG_RW = pmap_rw_bit(pmap);
  5086. va = trunc_page(va);
  5087. KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
  5088. KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
  5089. ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)",
  5090. va));
  5091. KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva ||
  5092. va >= kmi.clean_eva,
  5093. ("pmap_enter: managed mapping within the clean submap"));
  5094. if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
  5095. VM_OBJECT_ASSERT_LOCKED(m->object);
  5096. KASSERT((flags & PMAP_ENTER_RESERVED) == 0,
  5097. ("pmap_enter: flags %u has reserved bits set", flags));
  5098. pa = VM_PAGE_TO_PHYS(m);
  5099. newpte = (pt_entry_t)(pa | PG_A | PG_V);
  5100. if ((flags & VM_PROT_WRITE) != 0)
  5101. newpte |= PG_M;
  5102. if ((prot & VM_PROT_WRITE) != 0)
  5103. newpte |= PG_RW;
  5104. KASSERT((newpte & (PG_M | PG_RW)) != PG_M,
  5105. ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't"));
  5106. if ((prot & VM_PROT_EXECUTE) == 0)
  5107. newpte |= pg_nx;
  5108. if ((flags & PMAP_ENTER_WIRED) != 0)
  5109. newpte |= PG_W;
  5110. if (va < VM_MAXUSER_ADDRESS)
  5111. newpte |= PG_U;
  5112. if (pmap == kernel_pmap)
  5113. newpte |= PG_G;
  5114. newpte |= pmap_cache_bits(pmap, m->md.pat_mode, psind > 0);
  5115. /*
  5116. * Set modified bit gratuitously for writeable mappings if
  5117. * the page is unmanaged. We do not want to take a fault
  5118. * to do the dirty bit accounting for these mappings.
  5119. */
  5120. if ((m->oflags & VPO_UNMANAGED) != 0) {
  5121. if ((newpte & PG_RW) != 0)
  5122. newpte |= PG_M;
  5123. } else
  5124. newpte |= PG_MANAGED;
  5125. lock = NULL;
  5126. PMAP_LOCK(pmap);
  5127. if (psind == 1) {
  5128. /* Assert the required virtual and physical alignment. */
  5129. KASSERT((va & PDRMASK) == 0, ("pmap_enter: va unaligned"));
  5130. KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
  5131. rv = pmap_enter_pde(pmap, va, newpte | PG_PS, flags, m, &lock);
  5132. goto out;
  5133. }
  5134. mpte = NULL;
  5135. /*
  5136. * In the case that a page table page is not
  5137. * resident, we are creating it here.
  5138. */
  5139. retry:
  5140. pde = pmap_pde(pmap, va);
  5141. if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 ||
  5142. pmap_demote_pde_locked(pmap, pde, va, &lock))) {
  5143. pte = pmap_pde_to_pte(pde, va);
  5144. if (va < VM_MAXUSER_ADDRESS && mpte == NULL) {
  5145. mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
  5146. mpte->wire_count++;
  5147. }
  5148. } else if (va < VM_MAXUSER_ADDRESS) {
  5149. /*
  5150. * Here if the pte page isn't mapped, or if it has been
  5151. * deallocated.
  5152. */
  5153. nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
  5154. mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va),
  5155. nosleep ? NULL : &lock);
  5156. if (mpte == NULL && nosleep) {
  5157. rv = KERN_RESOURCE_SHORTAGE;
  5158. goto out;
  5159. }
  5160. goto retry;
  5161. } else
  5162. panic("pmap_enter: invalid page directory va=%#lx", va);
  5163. origpte = *pte;
  5164. pv = NULL;
  5165. if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86)
  5166. newpte |= pmap_pkru_get(pmap, va);
  5167. /*
  5168. * Is the specified virtual address already mapped?
  5169. */
  5170. if ((origpte & PG_V) != 0) {
  5171. /*
  5172. * Wiring change, just update stats. We don't worry about
  5173. * wiring PT pages as they remain resident as long as there
  5174. * are valid mappings in them. Hence, if a user page is wired,
  5175. * the PT page will be also.
  5176. */
  5177. if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0)
  5178. pmap->pm_stats.wired_count++;
  5179. else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0)
  5180. pmap->pm_stats.wired_count--;
  5181. /*
  5182. * Remove the extra PT page reference.
  5183. */
  5184. if (mpte != NULL) {
  5185. mpte->wire_count--;
  5186. KASSERT(mpte->wire_count > 0,
  5187. ("pmap_enter: missing reference to page table page,"
  5188. " va: 0x%lx", va));
  5189. }
  5190. /*
  5191. * Has the physical page changed?
  5192. */
  5193. opa = origpte & PG_FRAME;
  5194. if (opa == pa) {
  5195. /*
  5196. * No, might be a protection or wiring change.
  5197. */
  5198. if ((origpte & PG_MANAGED) != 0 &&
  5199. (newpte & PG_RW) != 0)
  5200. vm_page_aflag_set(m, PGA_WRITEABLE);
  5201. if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0)
  5202. goto unchanged;
  5203. goto validate;
  5204. }
  5205. /*
  5206. * The physical page has changed. Temporarily invalidate
  5207. * the mapping. This ensures that all threads sharing the
  5208. * pmap keep a consistent view of the mapping, which is
  5209. * necessary for the correct handling of COW faults. It
  5210. * also permits reuse of the old mapping's PV entry,
  5211. * avoiding an allocation.
  5212. *
  5213. * For consistency, handle unmanaged mappings the same way.
  5214. */
  5215. origpte = pte_load_clear(pte);
  5216. KASSERT((origpte & PG_FRAME) == opa,
  5217. ("pmap_enter: unexpected pa update for %#lx", va));
  5218. if ((origpte & PG_MANAGED) != 0) {
  5219. om = PHYS_TO_VM_PAGE(opa);
  5220. /*
  5221. * The pmap lock is sufficient to synchronize with
  5222. * concurrent calls to pmap_page_test_mappings() and
  5223. * pmap_ts_referenced().
  5224. */
  5225. if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
  5226. vm_page_dirty(om);
  5227. if ((origpte & PG_A) != 0)
  5228. vm_page_aflag_set(om, PGA_REFERENCED);
  5229. CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
  5230. pv = pmap_pvh_remove(&om->md, pmap, va);
  5231. KASSERT(pv != NULL,
  5232. ("pmap_enter: no PV entry for %#lx", va));
  5233. if ((newpte & PG_MANAGED) == 0)
  5234. free_pv_entry(pmap, pv);
  5235. if ((om->aflags & PGA_WRITEABLE) != 0 &&
  5236. TAILQ_EMPTY(&om->md.pv_list) &&
  5237. ((om->flags & PG_FICTITIOUS) != 0 ||
  5238. TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
  5239. vm_page_aflag_clear(om, PGA_WRITEABLE);
  5240. }
  5241. if ((origpte & PG_A) != 0)
  5242. pmap_invalidate_page(pmap, va);
  5243. origpte = 0;
  5244. } else {
  5245. /*
  5246. * Increment the counters.
  5247. */
  5248. if ((newpte & PG_W) != 0)
  5249. pmap->pm_stats.wired_count++;
  5250. pmap_resident_count_inc(pmap, 1);
  5251. }
  5252. /*
  5253. * Enter on the PV list if part of our managed memory.
  5254. */
  5255. if ((newpte & PG_MANAGED) != 0) {
  5256. if (pv == NULL) {
  5257. pv = get_pv_entry(pmap, &lock);
  5258. pv->pv_va = va;
  5259. }
  5260. CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
  5261. TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
  5262. m->md.pv_gen++;
  5263. if ((newpte & PG_RW) != 0)
  5264. vm_page_aflag_set(m, PGA_WRITEABLE);
  5265. }
  5266. /*
  5267. * Update the PTE.
  5268. */
  5269. if ((origpte & PG_V) != 0) {
  5270. validate:
  5271. origpte = pte_load_store(pte, newpte);
  5272. KASSERT((origpte & PG_FRAME) == pa,
  5273. ("pmap_enter: unexpected pa update for %#lx", va));
  5274. if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) ==
  5275. (PG_M | PG_RW)) {
  5276. if ((origpte & PG_MANAGED) != 0)
  5277. vm_page_dirty(m);
  5278. /*
  5279. * Although the PTE may still have PG_RW set, TLB
  5280. * invalidation may nonetheless be required because
  5281. * the PTE no longer has PG_M set.
  5282. */
  5283. } else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) {
  5284. /*
  5285. * This PTE change does not require TLB invalidation.
  5286. */
  5287. goto unchanged;
  5288. }
  5289. if ((origpte & PG_A) != 0)
  5290. pmap_invalidate_page(pmap, va);
  5291. } else
  5292. pte_store(pte, newpte);
  5293. unchanged:
  5294. #if VM_NRESERVLEVEL > 0
  5295. /*
  5296. * If both the page table page and the reservation are fully
  5297. * populated, then attempt promotion.
  5298. */
  5299. if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
  5300. pmap_ps_enabled(pmap) &&
  5301. (m->flags & PG_FICTITIOUS) == 0 &&
  5302. vm_reserv_level_iffullpop(m) == 0)
  5303. pmap_promote_pde(pmap, pde, va, &lock);
  5304. #endif
  5305. rv = KERN_SUCCESS;
  5306. out:
  5307. if (lock != NULL)
  5308. rw_wunlock(lock);
  5309. PMAP_UNLOCK(pmap);
  5310. return (rv);
  5311. }
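/*
 * Illustrative sketch (not compiled): pmap_enter() above is what the fault
 * handler ultimately calls to install a mapping.  Roughly, the access type
 * is passed in "flags" together with PMAP_ENTER_WIRED when the mapping is
 * wired, and KERN_RESOURCE_SHORTAGE must be handled when PMAP_ENTER_NOSLEEP
 * is used.  The helper and its arguments below are hypothetical.
 */
#if 0
static int
example_install_mapping(pmap_t pmap, vm_offset_t va, vm_page_t m,
    vm_prot_t prot, bool wired)
{
	u_int flags;

	flags = prot;			/* access type, used for PG_M/PG_A */
	if (wired)
		flags |= PMAP_ENTER_WIRED;
	return (pmap_enter(pmap, va, m, prot, flags, 0));
}
#endif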
  5312. /*
  5313. * Tries to create a read- and/or execute-only 2MB page mapping. Returns true
  5314. * if successful. Returns false if (1) a page table page cannot be allocated
  5315. * without sleeping, (2) a mapping already exists at the specified virtual
  5316. * address, or (3) a PV entry cannot be allocated without reclaiming another
  5317. * PV entry.
  5318. */
  5319. static bool
  5320. pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
  5321. struct rwlock **lockp)
  5322. {
  5323. pd_entry_t newpde;
  5324. pt_entry_t PG_V;
  5325. PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  5326. PG_V = pmap_valid_bit(pmap);
  5327. newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) |
  5328. PG_PS | PG_V;
  5329. if ((m->oflags & VPO_UNMANAGED) == 0)
  5330. newpde |= PG_MANAGED;
  5331. if ((prot & VM_PROT_EXECUTE) == 0)
  5332. newpde |= pg_nx;
  5333. if (va < VM_MAXUSER_ADDRESS)
  5334. newpde |= PG_U;
  5335. return (pmap_enter_pde(pmap, va, newpde, PMAP_ENTER_NOSLEEP |
  5336. PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) ==
  5337. KERN_SUCCESS);
  5338. }
  5339. /*
  5340. * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if
  5341. * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE
  5342. * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and
  5343. * a mapping already exists at the specified virtual address. Returns
  5344. * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table
  5345. * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if
  5346. * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed.
  5347. *
  5348. * The parameter "m" is only used when creating a managed, writeable mapping.
  5349. */
  5350. static int
  5351. pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags,
  5352. vm_page_t m, struct rwlock **lockp)
  5353. {
  5354. struct spglist free;
  5355. pd_entry_t oldpde, *pde;
  5356. pt_entry_t PG_G, PG_RW, PG_V;
  5357. vm_page_t mt, pdpg;
  5358. PG_G = pmap_global_bit(pmap);
  5359. PG_RW = pmap_rw_bit(pmap);
  5360. KASSERT((newpde & (pmap_modified_bit(pmap) | PG_RW)) != PG_RW,
  5361. ("pmap_enter_pde: newpde is missing PG_M"));
  5362. PG_V = pmap_valid_bit(pmap);
  5363. PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  5364. if (!pmap_allow_2m_x_page(pmap, pmap_pde_ept_executable(pmap,
  5365. newpde))) {
  5366. CTR2(KTR_PMAP, "pmap_enter_pde: 2m x blocked for va %#lx"
  5367. " in pmap %p", va, pmap);
  5368. return (KERN_FAILURE);
  5369. }
  5370. if ((pdpg = pmap_allocpde(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ?
  5371. NULL : lockp)) == NULL) {
  5372. CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
  5373. " in pmap %p", va, pmap);
  5374. return (KERN_RESOURCE_SHORTAGE);
  5375. }
  5376. /*
  5377. * If pkru is not same for the whole pde range, return failure
  5378. * and let vm_fault() cope. Check after pde allocation, since
  5379. * it could sleep.
  5380. */
  5381. if (!pmap_pkru_same(pmap, va, va + NBPDR)) {
  5382. SLIST_INIT(&free);
  5383. if (pmap_unwire_ptp(pmap, va, pdpg, &free)) {
  5384. pmap_invalidate_page(pmap, va);
  5385. vm_page_free_pages_toq(&free, true);
  5386. }
  5387. return (KERN_FAILURE);
  5388. }
  5389. if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) {
  5390. newpde &= ~X86_PG_PKU_MASK;
  5391. newpde |= pmap_pkru_get(pmap, va);
  5392. }
  5393. pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
  5394. pde = &pde[pmap_pde_index(va)];
  5395. oldpde = *pde;
  5396. if ((oldpde & PG_V) != 0) {
  5397. KASSERT(pdpg->wire_count > 1,
  5398. ("pmap_enter_pde: pdpg's wire count is too low"));
  5399. if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
  5400. pdpg->wire_count--;
  5401. CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
  5402. " in pmap %p", va, pmap);
  5403. return (KERN_FAILURE);
  5404. }
  5405. /* Break the existing mapping(s). */
  5406. SLIST_INIT(&free);
  5407. if ((oldpde & PG_PS) != 0) {
  5408. /*
  5409. * The reference to the PD page that was acquired by
  5410. * pmap_allocpde() ensures that it won't be freed.
  5411. * However, if the PDE resulted from a promotion, then
  5412. * a reserved PT page could be freed.
  5413. */
  5414. (void)pmap_remove_pde(pmap, pde, va, &free, lockp);
  5415. if ((oldpde & PG_G) == 0)
  5416. pmap_invalidate_pde_page(pmap, va, oldpde);
  5417. } else {
  5418. pmap_delayed_invl_start();
  5419. if (pmap_remove_ptes(pmap, va, va + NBPDR, pde, &free,
  5420. lockp))
  5421. pmap_invalidate_all(pmap);
  5422. pmap_delayed_invl_finish();
  5423. }
  5424. vm_page_free_pages_toq(&free, true);
  5425. if (va >= VM_MAXUSER_ADDRESS) {
  5426. /*
  5427. * Both pmap_remove_pde() and pmap_remove_ptes() will
  5428. * leave the kernel page table page zero filled.
  5429. */
  5430. mt = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
  5431. if (pmap_insert_pt_page(pmap, mt, false))
  5432. panic("pmap_enter_pde: trie insert failed");
  5433. } else
  5434. KASSERT(*pde == 0, ("pmap_enter_pde: non-zero pde %p",
  5435. pde));
  5436. }
  5437. if ((newpde & PG_MANAGED) != 0) {
  5438. /*
  5439. * Abort this mapping if its PV entry could not be created.
  5440. */
  5441. if (!pmap_pv_insert_pde(pmap, va, newpde, flags, lockp)) {
  5442. SLIST_INIT(&free);
  5443. if (pmap_unwire_ptp(pmap, va, pdpg, &free)) {
  5444. /*
  5445. * Although "va" is not mapped, paging-
  5446. * structure caches could nonetheless have
  5447. * entries that refer to the freed page table
  5448. * pages. Invalidate those entries.
  5449. */
  5450. pmap_invalidate_page(pmap, va);
  5451. vm_page_free_pages_toq(&free, true);
  5452. }
  5453. CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
  5454. " in pmap %p", va, pmap);
  5455. return (KERN_RESOURCE_SHORTAGE);
  5456. }
  5457. if ((newpde & PG_RW) != 0) {
  5458. for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
  5459. vm_page_aflag_set(mt, PGA_WRITEABLE);
  5460. }
  5461. }
  5462. /*
  5463. * Increment counters.
  5464. */
  5465. if ((newpde & PG_W) != 0)
  5466. pmap->pm_stats.wired_count += NBPDR / PAGE_SIZE;
  5467. pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
  5468. /*
  5469. * Map the superpage. (This is not a promoted mapping; there will not
  5470. * be any lingering 4KB page mappings in the TLB.)
  5471. */
  5472. pde_store(pde, newpde);
  5473. atomic_add_long(&pmap_pde_mappings, 1);
  5474. CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
  5475. " in pmap %p", va, pmap);
  5476. return (KERN_SUCCESS);
  5477. }
  5478. /*
  5479. * Maps a sequence of resident pages belonging to the same object.
  5480. * The sequence begins with the given page m_start. This page is
  5481. * mapped at the given virtual address start. Each subsequent page is
  5482. * mapped at a virtual address that is offset from start by the same
  5483. * amount as the page is offset from m_start within the object. The
  5484. * last page in the sequence is the page with the largest offset from
  5485. * m_start that can be mapped at a virtual address less than the given
  5486. * virtual address end. Not every virtual page between start and end
  5487. * is mapped; only those for which a resident page exists with the
  5488. * corresponding offset from m_start are mapped.
  5489. */
  5490. void
  5491. pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
  5492. vm_page_t m_start, vm_prot_t prot)
  5493. {
  5494. struct rwlock *lock;
  5495. vm_offset_t va;
  5496. vm_page_t m, mpte;
  5497. vm_pindex_t diff, psize;
  5498. VM_OBJECT_ASSERT_LOCKED(m_start->object);
  5499. psize = atop(end - start);
  5500. mpte = NULL;
  5501. m = m_start;
  5502. lock = NULL;
  5503. PMAP_LOCK(pmap);
  5504. while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
  5505. va = start + ptoa(diff);
  5506. if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
  5507. m->psind == 1 && pmap_ps_enabled(pmap) &&
  5508. pmap_allow_2m_x_page(pmap, (prot & VM_PROT_EXECUTE) != 0) &&
  5509. pmap_enter_2mpage(pmap, va, m, prot, &lock))
  5510. m = &m[NBPDR / PAGE_SIZE - 1];
  5511. else
  5512. mpte = pmap_enter_quick_locked(pmap, va, m, prot,
  5513. mpte, &lock);
  5514. m = TAILQ_NEXT(m, listq);
  5515. }
  5516. if (lock != NULL)
  5517. rw_wunlock(lock);
  5518. PMAP_UNLOCK(pmap);
  5519. }
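/*
 * Illustrative sketch (not compiled): pmap_enter_object() above is meant for
 * speculative prefaulting of pages that are already resident and valid in an
 * object.  A hypothetical caller holding the object lock, with "start"
 * corresponding to the page at "pindex", could do the following.
 */
#if 0
static void
example_prefault_range(pmap_t pmap, vm_offset_t start, vm_offset_t end,
    vm_object_t object, vm_pindex_t pindex, vm_prot_t prot)
{
	vm_page_t m;

	VM_OBJECT_ASSERT_LOCKED(object);
	m = vm_page_lookup(object, pindex);
	if (m != NULL)
		pmap_enter_object(pmap, start, end, m, prot);
}
#endif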
  5520. /*
  5521. * This code makes some *MAJOR* assumptions:
  5522. * 1. The current pmap and the given pmap exist.
  5523. * 2. The mapping is not wired.
  5524. * 3. Read access.
  5525. * 4. No page table pages.
  5526. * but it is *MUCH* faster than pmap_enter...
  5527. */
  5528. void
  5529. pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
  5530. {
  5531. struct rwlock *lock;
  5532. lock = NULL;
  5533. PMAP_LOCK(pmap);
  5534. (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
  5535. if (lock != NULL)
  5536. rw_wunlock(lock);
  5537. PMAP_UNLOCK(pmap);
  5538. }
  5539. static vm_page_t
  5540. pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
  5541. vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
  5542. {
  5543. struct spglist free;
  5544. pt_entry_t newpte, *pte, PG_V;
  5545. KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
  5546. (m->oflags & VPO_UNMANAGED) != 0,
  5547. ("pmap_enter_quick_locked: managed mapping within the clean submap"));
  5548. PG_V = pmap_valid_bit(pmap);
  5549. PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  5550. /*
  5551. * In the case that a page table page is not
  5552. * resident, we are creating it here.
  5553. */
  5554. if (va < VM_MAXUSER_ADDRESS) {
  5555. vm_pindex_t ptepindex;
  5556. pd_entry_t *ptepa;
  5557. /*
  5558. * Calculate pagetable page index
  5559. */
  5560. ptepindex = pmap_pde_pindex(va);
  5561. if (mpte && (mpte->pindex == ptepindex)) {
  5562. mpte->wire_count++;
  5563. } else {
  5564. /*
  5565. * Get the page directory entry
  5566. */
  5567. ptepa = pmap_pde(pmap, va);
  5568. /*
  5569. * If the page table page is mapped, we just increment
  5570. * the hold count, and activate it. Otherwise, we
  5571. * attempt to allocate a page table page. If this
  5572. * attempt fails, we don't retry. Instead, we give up.
  5573. */
  5574. if (ptepa && (*ptepa & PG_V) != 0) {
  5575. if (*ptepa & PG_PS)
  5576. return (NULL);
  5577. mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME);
  5578. mpte->wire_count++;
  5579. } else {
  5580. /*
  5581. * Pass NULL instead of the PV list lock
  5582. * pointer, because we don't intend to sleep.
  5583. */
  5584. mpte = _pmap_allocpte(pmap, ptepindex, NULL);
  5585. if (mpte == NULL)
  5586. return (mpte);
  5587. }
  5588. }
  5589. pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
  5590. pte = &pte[pmap_pte_index(va)];
  5591. } else {
  5592. mpte = NULL;
  5593. pte = vtopte(va);
  5594. }
  5595. if (*pte) {
  5596. if (mpte != NULL) {
  5597. mpte->wire_count--;
  5598. mpte = NULL;
  5599. }
  5600. return (mpte);
  5601. }
  5602. /*
  5603. * Enter on the PV list if part of our managed memory.
  5604. */
  5605. if ((m->oflags & VPO_UNMANAGED) == 0 &&
  5606. !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
  5607. if (mpte != NULL) {
  5608. SLIST_INIT(&free);
  5609. if (pmap_unwire_ptp(pmap, va, mpte, &free)) {
  5610. /*
  5611. * Although "va" is not mapped, paging-
  5612. * structure caches could nonetheless have
  5613. * entries that refer to the freed page table
  5614. * pages. Invalidate those entries.
  5615. */
  5616. pmap_invalidate_page(pmap, va);
  5617. vm_page_free_pages_toq(&free, true);
  5618. }
  5619. mpte = NULL;
  5620. }
  5621. return (mpte);
  5622. }
  5623. /*
  5624. * Increment counters
  5625. */
  5626. pmap_resident_count_inc(pmap, 1);
  5627. newpte = VM_PAGE_TO_PHYS(m) | PG_V |
  5628. pmap_cache_bits(pmap, m->md.pat_mode, 0);
  5629. if ((m->oflags & VPO_UNMANAGED) == 0)
  5630. newpte |= PG_MANAGED;
  5631. if ((prot & VM_PROT_EXECUTE) == 0)
  5632. newpte |= pg_nx;
  5633. if (va < VM_MAXUSER_ADDRESS)
  5634. newpte |= PG_U | pmap_pkru_get(pmap, va);
  5635. pte_store(pte, newpte);
  5636. return (mpte);
  5637. }
  5638. /*
  5639. * Make a temporary mapping for a physical address. This is only intended
  5640. * to be used for panic dumps.
  5641. */
  5642. void *
  5643. pmap_kenter_temporary(vm_paddr_t pa, int i)
  5644. {
  5645. vm_offset_t va;
  5646. va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
  5647. pmap_kenter(va, pa);
  5648. invlpg(va);
  5649. return ((void *)crashdumpmap);
  5650. }
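/*
 * Illustrative sketch (not compiled): because pmap_kenter_temporary() is
 * reserved for panic dumps, a hypothetical dump loop would map one physical
 * page at a time through crashdumpmap and copy it out before moving on.
 */
#if 0
static void
example_dump_page(vm_paddr_t pa, void *buf)
{
	void *va;

	va = pmap_kenter_temporary(pa, 0);
	memcpy(buf, va, PAGE_SIZE);
}
#endif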
  5651. /*
  5652. * This code maps large physical mmap regions into the
  5653. * processor address space. Note that some shortcuts
  5654. * are taken, but the code works.
  5655. */
  5656. void
  5657. pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
  5658. vm_pindex_t pindex, vm_size_t size)
  5659. {
  5660. pd_entry_t *pde;
  5661. pt_entry_t PG_A, PG_M, PG_RW, PG_V;
  5662. vm_paddr_t pa, ptepa;
  5663. vm_page_t p, pdpg;
  5664. int pat_mode;
  5665. PG_A = pmap_accessed_bit(pmap);
  5666. PG_M = pmap_modified_bit(pmap);
  5667. PG_V = pmap_valid_bit(pmap);
  5668. PG_RW = pmap_rw_bit(pmap);
  5669. VM_OBJECT_ASSERT_WLOCKED(object);
  5670. KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
  5671. ("pmap_object_init_pt: non-device object"));
  5672. if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
  5673. if (!pmap_ps_enabled(pmap))
  5674. return;
  5675. if (!vm_object_populate(object, pindex, pindex + atop(size)))
  5676. return;
  5677. p = vm_page_lookup(object, pindex);
  5678. KASSERT(p->valid == VM_PAGE_BITS_ALL,
  5679. ("pmap_object_init_pt: invalid page %p", p));
  5680. pat_mode = p->md.pat_mode;
  5681. /*
  5682. * Abort the mapping if the first page is not physically
  5683. * aligned to a 2MB page boundary.
  5684. */
  5685. ptepa = VM_PAGE_TO_PHYS(p);
  5686. if (ptepa & (NBPDR - 1))
  5687. return;
  5688. /*
  5689. * Skip the first page. Abort the mapping if the rest of
  5690. * the pages are not physically contiguous or have differing
  5691. * memory attributes.
  5692. */
  5693. p = TAILQ_NEXT(p, listq);
  5694. for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
  5695. pa += PAGE_SIZE) {
  5696. KASSERT(p->valid == VM_PAGE_BITS_ALL,
  5697. ("pmap_object_init_pt: invalid page %p", p));
  5698. if (pa != VM_PAGE_TO_PHYS(p) ||
  5699. pat_mode != p->md.pat_mode)
  5700. return;
  5701. p = TAILQ_NEXT(p, listq);
  5702. }
  5703. /*
  5704. * Map using 2MB pages. Since "ptepa" is 2M aligned and
  5705. * "size" is a multiple of 2M, adding the PAT setting to "pa"
  5706. * will not affect the termination of this loop.
  5707. */
  5708. PMAP_LOCK(pmap);
  5709. for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1);
  5710. pa < ptepa + size; pa += NBPDR) {
  5711. pdpg = pmap_allocpde(pmap, addr, NULL);
  5712. if (pdpg == NULL) {
  5713. /*
  5714. * The creation of mappings below is only an
  5715. * optimization. If a page directory page
  5716. * cannot be allocated without blocking,
  5717. * continue on to the next mapping rather than
  5718. * blocking.
  5719. */
  5720. addr += NBPDR;
  5721. continue;
  5722. }
  5723. pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
  5724. pde = &pde[pmap_pde_index(addr)];
  5725. if ((*pde & PG_V) == 0) {
  5726. pde_store(pde, pa | PG_PS | PG_M | PG_A |
  5727. PG_U | PG_RW | PG_V);
  5728. pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
  5729. atomic_add_long(&pmap_pde_mappings, 1);
  5730. } else {
  5731. /* Continue on if the PDE is already valid. */
  5732. pdpg->wire_count--;
  5733. KASSERT(pdpg->wire_count > 0,
  5734. ("pmap_object_init_pt: missing reference "
  5735. "to page directory page, va: 0x%lx", addr));
  5736. }
  5737. addr += NBPDR;
  5738. }
  5739. PMAP_UNLOCK(pmap);
  5740. }
  5741. }
  5742. /*
  5743. * Clear the wired attribute from the mappings for the specified range of
  5744. * addresses in the given pmap. Every valid mapping within that range
  5745. * must have the wired attribute set. In contrast, invalid mappings
  5746. * cannot have the wired attribute set, so they are ignored.
  5747. *
  5748. * The wired attribute of the page table entry is not a hardware
  5749. * feature, so there is no need to invalidate any TLB entries.
  5750. * Since pmap_demote_pde() for the wired entry must never fail,
  5751. * pmap_delayed_invl_start()/finish() calls around the
  5752. * function are not needed.
  5753. */
  5754. void
  5755. pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
  5756. {
  5757. vm_offset_t va_next;
  5758. pml4_entry_t *pml4e;
  5759. pdp_entry_t *pdpe;
  5760. pd_entry_t *pde;
  5761. pt_entry_t *pte, PG_V;
  5762. PG_V = pmap_valid_bit(pmap);
  5763. PMAP_LOCK(pmap);
  5764. for (; sva < eva; sva = va_next) {
  5765. pml4e = pmap_pml4e(pmap, sva);
  5766. if ((*pml4e & PG_V) == 0) {
  5767. va_next = (sva + NBPML4) & ~PML4MASK;
  5768. if (va_next < sva)
  5769. va_next = eva;
  5770. continue;
  5771. }
  5772. pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
  5773. if ((*pdpe & PG_V) == 0) {
  5774. va_next = (sva + NBPDP) & ~PDPMASK;
  5775. if (va_next < sva)
  5776. va_next = eva;
  5777. continue;
  5778. }
  5779. va_next = (sva + NBPDR) & ~PDRMASK;
  5780. if (va_next < sva)
  5781. va_next = eva;
  5782. pde = pmap_pdpe_to_pde(pdpe, sva);
  5783. if ((*pde & PG_V) == 0)
  5784. continue;
  5785. if ((*pde & PG_PS) != 0) {
  5786. if ((*pde & PG_W) == 0)
  5787. panic("pmap_unwire: pde %#jx is missing PG_W",
  5788. (uintmax_t)*pde);
  5789. /*
  5790. * Are we unwiring the entire large page? If not,
  5791. * demote the mapping and fall through.
  5792. */
  5793. if (sva + NBPDR == va_next && eva >= va_next) {
  5794. atomic_clear_long(pde, PG_W);
  5795. pmap->pm_stats.wired_count -= NBPDR /
  5796. PAGE_SIZE;
  5797. continue;
  5798. } else if (!pmap_demote_pde(pmap, pde, sva))
  5799. panic("pmap_unwire: demotion failed");
  5800. }
  5801. if (va_next > eva)
  5802. va_next = eva;
  5803. for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
  5804. sva += PAGE_SIZE) {
  5805. if ((*pte & PG_V) == 0)
  5806. continue;
  5807. if ((*pte & PG_W) == 0)
  5808. panic("pmap_unwire: pte %#jx is missing PG_W",
  5809. (uintmax_t)*pte);
  5810. /*
  5811. * PG_W must be cleared atomically. Although the pmap
  5812. * lock synchronizes access to PG_W, another processor
  5813. * could be setting PG_M and/or PG_A concurrently.
  5814. */
  5815. atomic_clear_long(pte, PG_W);
  5816. pmap->pm_stats.wired_count--;
  5817. }
  5818. }
  5819. PMAP_UNLOCK(pmap);
  5820. }
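/*
 * Illustrative sketch (not compiled): pmap_unwire() above expects every
 * valid mapping in the range to be wired, so a hypothetical caller first
 * checks its own wiring bookkeeping and then drops the attribute for the
 * whole, page-aligned range at once.
 */
#if 0
static void
example_unwire_entry(vm_map_t map, vm_map_entry_t entry)
{

	KASSERT(entry->wired_count > 0, ("entry is not wired"));
	pmap_unwire(vm_map_pmap(map), entry->start, entry->end);
}
#endif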
  5821. /*
  5822. * Copy the range specified by src_addr/len
  5823. * from the source map to the range dst_addr/len
  5824. * in the destination map.
  5825. *
  5826. * This routine is only advisory and need not do anything.
  5827. */
  5828. void
  5829. pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
  5830. vm_offset_t src_addr)
  5831. {
  5832. struct rwlock *lock;
  5833. struct spglist free;
  5834. pml4_entry_t *pml4e;
  5835. pdp_entry_t *pdpe;
  5836. pd_entry_t *pde, srcptepaddr;
  5837. pt_entry_t *dst_pte, PG_A, PG_M, PG_V, ptetemp, *src_pte;
  5838. vm_offset_t addr, end_addr, va_next;
  5839. vm_page_t dst_pdpg, dstmpte, srcmpte;
  5840. if (dst_addr != src_addr)
  5841. return;
  5842. if (dst_pmap->pm_type != src_pmap->pm_type)
  5843. return;
  5844. /*
  5845. * EPT page table entries that require emulation of A/D bits are
  5846. * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although
  5847. * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit
  5848. * (aka EPT_PG_EXECUTE) could still be set. Since some EPT
  5849. * implementations flag an EPT misconfiguration for exec-only
  5850. * mappings we skip this function entirely for emulated pmaps.
  5851. */
  5852. if (pmap_emulate_ad_bits(dst_pmap))
  5853. return;
  5854. end_addr = src_addr + len;
  5855. lock = NULL;
  5856. if (dst_pmap < src_pmap) {
  5857. PMAP_LOCK(dst_pmap);
  5858. PMAP_LOCK(src_pmap);
  5859. } else {
  5860. PMAP_LOCK(src_pmap);
  5861. PMAP_LOCK(dst_pmap);
  5862. }
  5863. PG_A = pmap_accessed_bit(dst_pmap);
  5864. PG_M = pmap_modified_bit(dst_pmap);
  5865. PG_V = pmap_valid_bit(dst_pmap);
  5866. for (addr = src_addr; addr < end_addr; addr = va_next) {
  5867. KASSERT(addr < UPT_MIN_ADDRESS,
  5868. ("pmap_copy: invalid to pmap_copy page tables"));
  5869. pml4e = pmap_pml4e(src_pmap, addr);
  5870. if ((*pml4e & PG_V) == 0) {
  5871. va_next = (addr + NBPML4) & ~PML4MASK;
  5872. if (va_next < addr)
  5873. va_next = end_addr;
  5874. continue;
  5875. }
  5876. pdpe = pmap_pml4e_to_pdpe(pml4e, addr);
  5877. if ((*pdpe & PG_V) == 0) {
  5878. va_next = (addr + NBPDP) & ~PDPMASK;
  5879. if (va_next < addr)
  5880. va_next = end_addr;
  5881. continue;
  5882. }
  5883. va_next = (addr + NBPDR) & ~PDRMASK;
  5884. if (va_next < addr)
  5885. va_next = end_addr;
  5886. pde = pmap_pdpe_to_pde(pdpe, addr);
  5887. srcptepaddr = *pde;
  5888. if (srcptepaddr == 0)
  5889. continue;
  5890. if (srcptepaddr & PG_PS) {
  5891. if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr)
  5892. continue;
  5893. dst_pdpg = pmap_allocpde(dst_pmap, addr, NULL);
  5894. if (dst_pdpg == NULL)
  5895. break;
  5896. pde = (pd_entry_t *)
  5897. PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dst_pdpg));
  5898. pde = &pde[pmap_pde_index(addr)];
  5899. if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 ||
  5900. pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr,
  5901. PMAP_ENTER_NORECLAIM, &lock))) {
  5902. *pde = srcptepaddr & ~PG_W;
  5903. pmap_resident_count_inc(dst_pmap, NBPDR /
  5904. PAGE_SIZE);
  5905. atomic_add_long(&pmap_pde_mappings, 1);
  5906. } else
  5907. dst_pdpg->wire_count--;
  5908. continue;
  5909. }
  5910. srcptepaddr &= PG_FRAME;
  5911. srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
  5912. KASSERT(srcmpte->wire_count > 0,
  5913. ("pmap_copy: source page table page is unused"));
  5914. if (va_next > end_addr)
  5915. va_next = end_addr;
  5916. src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr);
  5917. src_pte = &src_pte[pmap_pte_index(addr)];
  5918. dstmpte = NULL;
  5919. for (; addr < va_next; addr += PAGE_SIZE, src_pte++) {
  5920. ptetemp = *src_pte;
  5921. /*
  5922. * We only virtual copy managed pages.
  5923. */
  5924. if ((ptetemp & PG_MANAGED) == 0)
  5925. continue;
  5926. if (dstmpte != NULL) {
  5927. KASSERT(dstmpte->pindex ==
  5928. pmap_pde_pindex(addr),
  5929. ("dstmpte pindex/addr mismatch"));
  5930. dstmpte->wire_count++;
  5931. } else if ((dstmpte = pmap_allocpte(dst_pmap, addr,
  5932. NULL)) == NULL)
  5933. goto out;
  5934. dst_pte = (pt_entry_t *)
  5935. PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
  5936. dst_pte = &dst_pte[pmap_pte_index(addr)];
  5937. if (*dst_pte == 0 &&
  5938. pmap_try_insert_pv_entry(dst_pmap, addr,
  5939. PHYS_TO_VM_PAGE(ptetemp & PG_FRAME), &lock)) {
  5940. /*
  5941. * Clear the wired, modified, and accessed
  5942. * (referenced) bits during the copy.
  5943. */
  5944. *dst_pte = ptetemp & ~(PG_W | PG_M | PG_A);
  5945. pmap_resident_count_inc(dst_pmap, 1);
  5946. } else {
  5947. SLIST_INIT(&free);
  5948. if (pmap_unwire_ptp(dst_pmap, addr, dstmpte,
  5949. &free)) {
  5950. /*
  5951. * Although "addr" is not mapped,
  5952. * paging-structure caches could
  5953. * nonetheless have entries that refer
  5954. * to the freed page table pages.
  5955. * Invalidate those entries.
  5956. */
  5957. pmap_invalidate_page(dst_pmap, addr);
  5958. vm_page_free_pages_toq(&free, true);
  5959. }
  5960. goto out;
  5961. }
  5962. /* Have we copied all of the valid mappings? */
  5963. if (dstmpte->wire_count >= srcmpte->wire_count)
  5964. break;
  5965. }
  5966. }
  5967. out:
  5968. if (lock != NULL)
  5969. rw_wunlock(lock);
  5970. PMAP_UNLOCK(src_pmap);
  5971. PMAP_UNLOCK(dst_pmap);
  5972. }
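/*
 * Illustrative sketch (not compiled): pmap_copy() above is advisory and is
 * typically used on fork to prime the child pmap.  This mirrors how a map
 * copy would invoke it for one entry, with identical source and destination
 * addresses as the function requires.  "entry" is hypothetical.
 */
#if 0
static void
example_fork_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_map_entry_t entry)
{

	pmap_copy(dst_pmap, src_pmap, entry->start,
	    entry->end - entry->start, entry->start);
}
#endif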
  5973. int
  5974. pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap)
  5975. {
  5976. int error;
  5977. if (dst_pmap->pm_type != src_pmap->pm_type ||
  5978. dst_pmap->pm_type != PT_X86 ||
  5979. (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0)
  5980. return (0);
  5981. for (;;) {
  5982. if (dst_pmap < src_pmap) {
  5983. PMAP_LOCK(dst_pmap);
  5984. PMAP_LOCK(src_pmap);
  5985. } else {
  5986. PMAP_LOCK(src_pmap);
  5987. PMAP_LOCK(dst_pmap);
  5988. }
  5989. error = pmap_pkru_copy(dst_pmap, src_pmap);
  5990. /* Clean up partial copy on failure due to no memory. */
  5991. if (error == ENOMEM)
  5992. pmap_pkru_deassign_all(dst_pmap);
  5993. PMAP_UNLOCK(src_pmap);
  5994. PMAP_UNLOCK(dst_pmap);
  5995. if (error != ENOMEM)
  5996. break;
  5997. vm_wait(NULL);
  5998. }
  5999. return (error);
  6000. }
  6001. /*
  6002. * Zero the specified hardware page.
  6003. */
  6004. void
  6005. pmap_zero_page(vm_page_t m)
  6006. {
  6007. vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
  6008. pagezero((void *)va);
  6009. }
  6010. /*
  6011. * Zero an area within a single hardware page. "off" and "size" must not
  6012. * cover an area beyond a single hardware page.
  6013. */
  6014. void
  6015. pmap_zero_page_area(vm_page_t m, int off, int size)
  6016. {
  6017. vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
  6018. if (off == 0 && size == PAGE_SIZE)
  6019. pagezero((void *)va);
  6020. else
  6021. bzero((char *)va + off, size);
  6022. }
  6023. /*
  6024. * Copy one specified hardware page to another.
  6025. */
  6026. void
  6027. pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
  6028. {
  6029. vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
  6030. vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
  6031. pagecopy((void *)src, (void *)dst);
  6032. }
  6033. int unmapped_buf_allowed = 1;
  6034. void
  6035. pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
  6036. vm_offset_t b_offset, int xfersize)
  6037. {
  6038. void *a_cp, *b_cp;
  6039. vm_page_t pages[2];
  6040. vm_offset_t vaddr[2], a_pg_offset, b_pg_offset;
  6041. int cnt;
  6042. boolean_t mapped;
  6043. while (xfersize > 0) {
  6044. a_pg_offset = a_offset & PAGE_MASK;
  6045. pages[0] = ma[a_offset >> PAGE_SHIFT];
  6046. b_pg_offset = b_offset & PAGE_MASK;
  6047. pages[1] = mb[b_offset >> PAGE_SHIFT];
  6048. cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
  6049. cnt = min(cnt, PAGE_SIZE - b_pg_offset);
  6050. mapped = pmap_map_io_transient(pages, vaddr, 2, FALSE);
  6051. a_cp = (char *)vaddr[0] + a_pg_offset;
  6052. b_cp = (char *)vaddr[1] + b_pg_offset;
  6053. bcopy(a_cp, b_cp, cnt);
  6054. if (__predict_false(mapped))
  6055. pmap_unmap_io_transient(pages, vaddr, 2, FALSE);
  6056. a_offset += cnt;
  6057. b_offset += cnt;
  6058. xfersize -= cnt;
  6059. }
  6060. }
  6061. /*
  6062. * Returns true if the pmap's pv is one of the first
  6063. * 16 pvs linked to from this page. This count may
  6064. * be changed upwards or downwards in the future; it
  6065. * is only necessary that true be returned for a small
  6066. * subset of pmaps for proper page aging.
  6067. */
  6068. boolean_t
  6069. pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
  6070. {
  6071. struct md_page *pvh;
  6072. struct rwlock *lock;
  6073. pv_entry_t pv;
  6074. int loops = 0;
  6075. boolean_t rv;
  6076. KASSERT((m->oflags & VPO_UNMANAGED) == 0,
  6077. ("pmap_page_exists_quick: page %p is not managed", m));
  6078. rv = FALSE;
  6079. lock = VM_PAGE_TO_PV_LIST_LOCK(m);
  6080. rw_rlock(lock);
  6081. TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
  6082. if (PV_PMAP(pv) == pmap) {
  6083. rv = TRUE;
  6084. break;
  6085. }
  6086. loops++;
  6087. if (loops >= 16)
  6088. break;
  6089. }
  6090. if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
  6091. pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
  6092. TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
  6093. if (PV_PMAP(pv) == pmap) {
  6094. rv = TRUE;
  6095. break;
  6096. }
  6097. loops++;
  6098. if (loops >= 16)
  6099. break;
  6100. }
  6101. }
  6102. rw_runlock(lock);
  6103. return (rv);
  6104. }
  6105. /*
  6106. * pmap_page_wired_mappings:
  6107. *
  6108. * Return the number of managed mappings to the given physical page
  6109. * that are wired.
  6110. */
  6111. int
  6112. pmap_page_wired_mappings(vm_page_t m)
  6113. {
  6114. struct rwlock *lock;
  6115. struct md_page *pvh;
  6116. pmap_t pmap;
  6117. pt_entry_t *pte;
  6118. pv_entry_t pv;
  6119. int count, md_gen, pvh_gen;
  6120. if ((m->oflags & VPO_UNMANAGED) != 0)
  6121. return (0);
  6122. lock = VM_PAGE_TO_PV_LIST_LOCK(m);
  6123. rw_rlock(lock);
  6124. restart:
  6125. count = 0;
  6126. TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
  6127. pmap = PV_PMAP(pv);
  6128. if (!PMAP_TRYLOCK(pmap)) {
  6129. md_gen = m->md.pv_gen;
  6130. rw_runlock(lock);
  6131. PMAP_LOCK(pmap);
  6132. rw_rlock(lock);
  6133. if (md_gen != m->md.pv_gen) {
  6134. PMAP_UNLOCK(pmap);
  6135. goto restart;
  6136. }
  6137. }
  6138. pte = pmap_pte(pmap, pv->pv_va);
  6139. if ((*pte & PG_W) != 0)
  6140. count++;
  6141. PMAP_UNLOCK(pmap);
  6142. }
  6143. if ((m->flags & PG_FICTITIOUS) == 0) {
  6144. pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
  6145. TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
  6146. pmap = PV_PMAP(pv);
  6147. if (!PMAP_TRYLOCK(pmap)) {
  6148. md_gen = m->md.pv_gen;
  6149. pvh_gen = pvh->pv_gen;
  6150. rw_runlock(lock);
  6151. PMAP_LOCK(pmap);
  6152. rw_rlock(lock);
  6153. if (md_gen != m->md.pv_gen ||
  6154. pvh_gen != pvh->pv_gen) {
  6155. PMAP_UNLOCK(pmap);
  6156. goto restart;
  6157. }
  6158. }
  6159. pte = pmap_pde(pmap, pv->pv_va);
  6160. if ((*pte & PG_W) != 0)
  6161. count++;
  6162. PMAP_UNLOCK(pmap);
  6163. }
  6164. }
  6165. rw_runlock(lock);
  6166. return (count);
  6167. }
  6168. /*
  6169. * Returns TRUE if the given page is mapped individually or as part of
  6170. * a 2mpage. Otherwise, returns FALSE.
  6171. */
  6172. boolean_t
  6173. pmap_page_is_mapped(vm_page_t m)
  6174. {
  6175. struct rwlock *lock;
  6176. boolean_t rv;
  6177. if ((m->oflags & VPO_UNMANAGED) != 0)
  6178. return (FALSE);
  6179. lock = VM_PAGE_TO_PV_LIST_LOCK(m);
  6180. rw_rlock(lock);
  6181. rv = !TAILQ_EMPTY(&m->md.pv_list) ||
  6182. ((m->flags & PG_FICTITIOUS) == 0 &&
  6183. !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
  6184. rw_runlock(lock);
  6185. return (rv);
  6186. }
  6187. /*
  6188. * Destroy all managed, non-wired mappings in the given user-space
  6189. * pmap. This pmap cannot be active on any processor besides the
  6190. * caller.
  6191. *
  6192. * This function cannot be applied to the kernel pmap. Moreover, it
  6193. * is not intended for general use. It is only to be used during
  6194. * process termination. Consequently, it can be implemented in ways
  6195. * that make it faster than pmap_remove(). First, it can more quickly
  6196. * destroy mappings by iterating over the pmap's collection of PV
  6197. * entries, rather than searching the page table. Second, it doesn't
  6198. * have to test and clear the page table entries atomically, because
  6199. * no processor is currently accessing the user address space. In
  6200. * particular, a page table entry's dirty bit won't change state once
  6201. * this function starts.
  6202. *
  6203. * Although this function destroys all of the pmap's managed,
  6204. * non-wired mappings, it can delay and batch the invalidation of TLB
  6205. * entries without calling pmap_delayed_invl_start() and
  6206. * pmap_delayed_invl_finish(). Because the pmap is not active on
  6207. * any other processor, none of these TLB entries will ever be used
  6208. * before their eventual invalidation. Consequently, there is no need
  6209. * for either pmap_remove_all() or pmap_remove_write() to wait for
  6210. * that eventual TLB invalidation.
  6211. */
  6212. void
  6213. pmap_remove_pages(pmap_t pmap)
  6214. {
  6215. pd_entry_t ptepde;
  6216. pt_entry_t *pte, tpte;
  6217. pt_entry_t PG_M, PG_RW, PG_V;
  6218. struct spglist free;
  6219. vm_page_t m, mpte, mt;
  6220. pv_entry_t pv;
  6221. struct md_page *pvh;
  6222. struct pv_chunk *pc, *npc;
  6223. struct rwlock *lock;
  6224. int64_t bit;
  6225. uint64_t inuse, bitmask;
  6226. int allfree, field, freed, idx;
  6227. boolean_t superpage;
  6228. vm_paddr_t pa;
  6229. /*
  6230. * Assert that the given pmap is only active on the current
  6231. * CPU. Unfortunately, we cannot block another CPU from
  6232. * activating the pmap while this function is executing.
  6233. */
  6234. KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap));
  6235. #ifdef INVARIANTS
  6236. {
  6237. cpuset_t other_cpus;
  6238. other_cpus = all_cpus;
  6239. critical_enter();
  6240. CPU_CLR(PCPU_GET(cpuid), &other_cpus);
  6241. CPU_AND(&other_cpus, &pmap->pm_active);
  6242. critical_exit();
  6243. KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap));
  6244. }
  6245. #endif
  6246. lock = NULL;
  6247. PG_M = pmap_modified_bit(pmap);
  6248. PG_V = pmap_valid_bit(pmap);
  6249. PG_RW = pmap_rw_bit(pmap);
  6250. SLIST_INIT(&free);
  6251. PMAP_LOCK(pmap);
  6252. TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
  6253. allfree = 1;
  6254. freed = 0;
  6255. for (field = 0; field < _NPCM; field++) {
  6256. inuse = ~pc->pc_map[field] & pc_freemask[field];
  6257. while (inuse != 0) {
  6258. bit = bsfq(inuse);
  6259. bitmask = 1UL << bit;
  6260. idx = field * 64 + bit;
  6261. pv = &pc->pc_pventry[idx];
  6262. inuse &= ~bitmask;
  6263. pte = pmap_pdpe(pmap, pv->pv_va);
  6264. ptepde = *pte;
  6265. pte = pmap_pdpe_to_pde(pte, pv->pv_va);
  6266. tpte = *pte;
  6267. if ((tpte & (PG_PS | PG_V)) == PG_V) {
  6268. superpage = FALSE;
  6269. ptepde = tpte;
  6270. pte = (pt_entry_t *)PHYS_TO_DMAP(tpte &
  6271. PG_FRAME);
  6272. pte = &pte[pmap_pte_index(pv->pv_va)];
  6273. tpte = *pte;
  6274. } else {
  6275. /*
6276. * Keep track of whether 'tpte' is a
  6277. * superpage explicitly instead of
  6278. * relying on PG_PS being set.
  6279. *
  6280. * This is because PG_PS is numerically
  6281. * identical to PG_PTE_PAT and thus a
  6282. * regular page could be mistaken for
  6283. * a superpage.
  6284. */
  6285. superpage = TRUE;
  6286. }
  6287. if ((tpte & PG_V) == 0) {
  6288. panic("bad pte va %lx pte %lx",
  6289. pv->pv_va, tpte);
  6290. }
  6291. /*
  6292. * We cannot remove wired pages from a process' mapping at this time
  6293. */
  6294. if (tpte & PG_W) {
  6295. allfree = 0;
  6296. continue;
  6297. }
  6298. if (superpage)
  6299. pa = tpte & PG_PS_FRAME;
  6300. else
  6301. pa = tpte & PG_FRAME;
  6302. m = PHYS_TO_VM_PAGE(pa);
  6303. KASSERT(m->phys_addr == pa,
  6304. ("vm_page_t %p phys_addr mismatch %016jx %016jx",
  6305. m, (uintmax_t)m->phys_addr,
  6306. (uintmax_t)tpte));
  6307. KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
  6308. m < &vm_page_array[vm_page_array_size],
  6309. ("pmap_remove_pages: bad tpte %#jx",
  6310. (uintmax_t)tpte));
  6311. pte_clear(pte);
  6312. /*
  6313. * Update the vm_page_t clean/reference bits.
  6314. */
  6315. if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
  6316. if (superpage) {
  6317. for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
  6318. vm_page_dirty(mt);
  6319. } else
  6320. vm_page_dirty(m);
  6321. }
  6322. CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
  6323. /* Mark free */
  6324. pc->pc_map[field] |= bitmask;
  6325. if (superpage) {
  6326. pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
  6327. pvh = pa_to_pvh(tpte & PG_PS_FRAME);
  6328. TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
  6329. pvh->pv_gen++;
  6330. if (TAILQ_EMPTY(&pvh->pv_list)) {
  6331. for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
  6332. if ((mt->aflags & PGA_WRITEABLE) != 0 &&
  6333. TAILQ_EMPTY(&mt->md.pv_list))
  6334. vm_page_aflag_clear(mt, PGA_WRITEABLE);
  6335. }
  6336. mpte = pmap_remove_pt_page(pmap, pv->pv_va);
  6337. if (mpte != NULL) {
  6338. KASSERT(mpte->valid == VM_PAGE_BITS_ALL,
  6339. ("pmap_remove_pages: pte page not promoted"));
  6340. pmap_resident_count_dec(pmap, 1);
  6341. KASSERT(mpte->wire_count == NPTEPG,
  6342. ("pmap_remove_pages: pte page wire count error"));
  6343. mpte->wire_count = 0;
  6344. pmap_add_delayed_free_list(mpte, &free, FALSE);
  6345. }
  6346. } else {
  6347. pmap_resident_count_dec(pmap, 1);
  6348. TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
  6349. m->md.pv_gen++;
  6350. if ((m->aflags & PGA_WRITEABLE) != 0 &&
  6351. TAILQ_EMPTY(&m->md.pv_list) &&
  6352. (m->flags & PG_FICTITIOUS) == 0) {
  6353. pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
  6354. if (TAILQ_EMPTY(&pvh->pv_list))
  6355. vm_page_aflag_clear(m, PGA_WRITEABLE);
  6356. }
  6357. }
  6358. pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free);
  6359. freed++;
  6360. }
  6361. }
  6362. PV_STAT(atomic_add_long(&pv_entry_frees, freed));
  6363. PV_STAT(atomic_add_int(&pv_entry_spare, freed));
  6364. PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
  6365. if (allfree) {
  6366. TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
  6367. free_pv_chunk(pc);
  6368. }
  6369. }
  6370. if (lock != NULL)
  6371. rw_wunlock(lock);
  6372. pmap_invalidate_all(pmap);
  6373. pmap_pkru_deassign_all(pmap);
  6374. PMAP_UNLOCK(pmap);
  6375. vm_page_free_pages_toq(&free, true);
  6376. }
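/*
 * Implementation note for the function above: rather than walking the page
 * tables, it walks the pmap's pv chunks.  Each chunk's pc_map[] bitmap
 * marks free slots, so "~pc_map[field] & pc_freemask[field]" yields the
 * in-use slots, bsfq() peels them off one bit at a time, and a destroyed
 * mapping's slot is returned to the chunk simply by setting its bit again.
 */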
  6377. static boolean_t
  6378. pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
  6379. {
  6380. struct rwlock *lock;
  6381. pv_entry_t pv;
  6382. struct md_page *pvh;
  6383. pt_entry_t *pte, mask;
  6384. pt_entry_t PG_A, PG_M, PG_RW, PG_V;
  6385. pmap_t pmap;
  6386. int md_gen, pvh_gen;
  6387. boolean_t rv;
  6388. rv = FALSE;
  6389. lock = VM_PAGE_TO_PV_LIST_LOCK(m);
  6390. rw_rlock(lock);
  6391. restart:
  6392. TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
  6393. pmap = PV_PMAP(pv);
  6394. if (!PMAP_TRYLOCK(pmap)) {
  6395. md_gen = m->md.pv_gen;
  6396. rw_runlock(lock);
  6397. PMAP_LOCK(pmap);
  6398. rw_rlock(lock);
  6399. if (md_gen != m->md.pv_gen) {
  6400. PMAP_UNLOCK(pmap);
  6401. goto restart;
  6402. }
  6403. }
  6404. pte = pmap_pte(pmap, pv->pv_va);
  6405. mask = 0;
  6406. if (modified) {
  6407. PG_M = pmap_modified_bit(pmap);
  6408. PG_RW = pmap_rw_bit(pmap);
  6409. mask |= PG_RW | PG_M;
  6410. }
  6411. if (accessed) {
  6412. PG_A = pmap_accessed_bit(pmap);
  6413. PG_V = pmap_valid_bit(pmap);
  6414. mask |= PG_V | PG_A;
  6415. }
  6416. rv = (*pte & mask) == mask;
  6417. PMAP_UNLOCK(pmap);
  6418. if (rv)
  6419. goto out;
  6420. }
  6421. if ((m->flags & PG_FICTITIOUS) == 0) {
  6422. pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
  6423. TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
  6424. pmap = PV_PMAP(pv);
  6425. if (!PMAP_TRYLOCK(pmap)) {
  6426. md_gen = m->md.pv_gen;
  6427. pvh_gen = pvh->pv_gen;
  6428. rw_runlock(lock);
  6429. PMAP_LOCK(pmap);
  6430. rw_rlock(lock);
  6431. if (md_gen != m->md.pv_gen ||
  6432. pvh_gen != pvh->pv_gen) {
  6433. PMAP_UNLOCK(pmap);
  6434. goto restart;
  6435. }
  6436. }
  6437. pte = pmap_pde(pmap, pv->pv_va);
  6438. mask = 0;
  6439. if (modified) {
  6440. PG_M = pmap_modified_bit(pmap);
  6441. PG_RW = pmap_rw_bit(pmap);
  6442. mask |= PG_RW | PG_M;
  6443. }
  6444. if (accessed) {
  6445. PG_A = pmap_accessed_bit(pmap);
  6446. PG_V = pmap_valid_bit(pmap);
  6447. mask |= PG_V | PG_A;
  6448. }
  6449. rv = (*pte & mask) == mask;
  6450. PMAP_UNLOCK(pmap);
  6451. if (rv)
  6452. goto out;
  6453. }
  6454. }
  6455. out:
  6456. rw_runlock(lock);
  6457. return (rv);
  6458. }
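/*
 * pmap_page_test_mappings() is the helper behind pmap_is_modified() and
 * pmap_is_referenced() below: for every mapping of the page, at both 4KB
 * and 2MB granularity, it tests (PG_RW | PG_M) when "modified" is
 * requested and (PG_V | PG_A) when "accessed" is requested, returning TRUE
 * as soon as any mapping satisfies the mask.
 */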
  6459. /*
  6460. * pmap_is_modified:
  6461. *
  6462. * Return whether or not the specified physical page was modified
  6463. * in any physical maps.
  6464. */
  6465. boolean_t
  6466. pmap_is_modified(vm_page_t m)
  6467. {
  6468. KASSERT((m->oflags & VPO_UNMANAGED) == 0,
  6469. ("pmap_is_modified: page %p is not managed", m));
  6470. /*
  6471. * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
  6472. * concurrently set while the object is locked. Thus, if PGA_WRITEABLE
  6473. * is clear, no PTEs can have PG_M set.
  6474. */
  6475. VM_OBJECT_ASSERT_WLOCKED(m->object);
  6476. if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
  6477. return (FALSE);
  6478. return (pmap_page_test_mappings(m, FALSE, TRUE));
  6479. }
  6480. /*
  6481. * pmap_is_prefaultable:
  6482. *
  6483. * Return whether or not the specified virtual address is eligible
  6484. * for prefault.
  6485. */
  6486. boolean_t
  6487. pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
  6488. {
  6489. pd_entry_t *pde;
  6490. pt_entry_t *pte, PG_V;
  6491. boolean_t rv;
  6492. PG_V = pmap_valid_bit(pmap);
  6493. rv = FALSE;
  6494. PMAP_LOCK(pmap);
  6495. pde = pmap_pde(pmap, addr);
  6496. if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) {
  6497. pte = pmap_pde_to_pte(pde, addr);
  6498. rv = (*pte & PG_V) == 0;
  6499. }
  6500. PMAP_UNLOCK(pmap);
  6501. return (rv);
  6502. }
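/*
 * In other words, an address is prefaultable only when its page directory
 * entry is valid and does not map a 2MB page, and the corresponding 4KB
 * page table entry is still empty, so a mapping can be installed without
 * allocating a page table page or replacing an existing translation.
 */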
  6503. /*
  6504. * pmap_is_referenced:
  6505. *
  6506. * Return whether or not the specified physical page was referenced
  6507. * in any physical maps.
  6508. */
  6509. boolean_t
  6510. pmap_is_referenced(vm_page_t m)
  6511. {
  6512. KASSERT((m->oflags & VPO_UNMANAGED) == 0,
  6513. ("pmap_is_referenced: page %p is not managed", m));
  6514. return (pmap_page_test_mappings(m, TRUE, FALSE));
  6515. }
  6516. /*
  6517. * Clear the write and modified bits in each of the given page's mappings.
  6518. */
  6519. void
  6520. pmap_remove_write(vm_page_t m)
  6521. {
  6522. struct md_page *pvh;
  6523. pmap_t pmap;
  6524. struct rwlock *lock;
  6525. pv_entry_t next_pv, pv;
  6526. pd_entry_t *pde;
  6527. pt_entry_t oldpte, *pte, PG_M, PG_RW;
  6528. vm_offset_t va;
  6529. int pvh_gen, md_gen;
  6530. KASSERT((m->oflags & VPO_UNMANAGED) == 0,
  6531. ("pmap_remove_write: page %p is not managed", m));
  6532. /*
  6533. * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
  6534. * set by another thread while the object is locked. Thus,
  6535. * if PGA_WRITEABLE is clear, no page table entries need updating.
  6536. */
  6537. VM_OBJECT_ASSERT_WLOCKED(m->object);
  6538. if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
  6539. return;
  6540. lock = VM_PAGE_TO_PV_LIST_LOCK(m);
  6541. pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
  6542. pa_to_pvh(VM_PAGE_TO_PHYS(m));
  6543. retry_pv_loop:
  6544. rw_wlock(lock);
  6545. TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
  6546. pmap = PV_PMAP(pv);
  6547. if (!PMAP_TRYLOCK(pmap)) {
  6548. pvh_gen = pvh->pv_gen;
  6549. rw_wunlock(lock);
  6550. PMAP_LOCK(pmap);
  6551. rw_wlock(lock);
  6552. if (pvh_gen != pvh->pv_gen) {
  6553. PMAP_UNLOCK(pmap);
  6554. rw_wunlock(lock);
  6555. goto retry_pv_loop;
  6556. }
  6557. }
  6558. PG_RW = pmap_rw_bit(pmap);
  6559. va = pv->pv_va;
  6560. pde = pmap_pde(pmap, va);
  6561. if ((*pde & PG_RW) != 0)
  6562. (void)pmap_demote_pde_locked(pmap, pde, va, &lock);
  6563. KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
  6564. ("inconsistent pv lock %p %p for page %p",
  6565. lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
  6566. PMAP_UNLOCK(pmap);
  6567. }
  6568. TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
  6569. pmap = PV_PMAP(pv);
  6570. if (!PMAP_TRYLOCK(pmap)) {
  6571. pvh_gen = pvh->pv_gen;
  6572. md_gen = m->md.pv_gen;
  6573. rw_wunlock(lock);
  6574. PMAP_LOCK(pmap);
  6575. rw_wlock(lock);
  6576. if (pvh_gen != pvh->pv_gen ||
  6577. md_gen != m->md.pv_gen) {
  6578. PMAP_UNLOCK(pmap);
  6579. rw_wunlock(lock);
  6580. goto retry_pv_loop;
  6581. }
  6582. }
  6583. PG_M = pmap_modified_bit(pmap);
  6584. PG_RW = pmap_rw_bit(pmap);
  6585. pde = pmap_pde(pmap, pv->pv_va);
  6586. KASSERT((*pde & PG_PS) == 0,
  6587. ("pmap_remove_write: found a 2mpage in page %p's pv list",
  6588. m));
  6589. pte = pmap_pde_to_pte(pde, pv->pv_va);
  6590. retry:
  6591. oldpte = *pte;
  6592. if (oldpte & PG_RW) {
  6593. if (!atomic_cmpset_long(pte, oldpte, oldpte &
  6594. ~(PG_RW | PG_M)))
  6595. goto retry;
  6596. if ((oldpte & PG_M) != 0)
  6597. vm_page_dirty(m);
  6598. pmap_invalidate_page(pmap, pv->pv_va);
  6599. }
  6600. PMAP_UNLOCK(pmap);
  6601. }
  6602. rw_wunlock(lock);
  6603. vm_page_aflag_clear(m, PGA_WRITEABLE);
  6604. pmap_delayed_invl_wait(m);
  6605. }
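/*
 * Note on the structure above: the first loop demotes any writable 2MB
 * mappings of the page, which moves their pv entries onto the page's 4KB
 * pv list; the second loop then atomically clears PG_RW and PG_M from each
 * remaining PTE, retrying the compare-and-set if the entry changes
 * underneath it, and folds a set PG_M into the page's dirty state before
 * invalidating the stale TLB entry.
 */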
  6606. static __inline boolean_t
  6607. safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte)
  6608. {
  6609. if (!pmap_emulate_ad_bits(pmap))
  6610. return (TRUE);
  6611. KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type));
  6612. /*
  6613. * XWR = 010 or 110 will cause an unconditional EPT misconfiguration
6614. * so we don't let the referenced (aka EPT_PG_READ) bit be cleared
  6615. * if the EPT_PG_WRITE bit is set.
  6616. */
  6617. if ((pte & EPT_PG_WRITE) != 0)
  6618. return (FALSE);
  6619. /*
  6620. * XWR = 100 is allowed only if the PMAP_SUPPORTS_EXEC_ONLY is set.
  6621. */
  6622. if ((pte & EPT_PG_EXECUTE) == 0 ||
  6623. ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0))
  6624. return (TRUE);
  6625. else
  6626. return (FALSE);
  6627. }
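/*
 * Summary of the checks above, stated as the EPT XWR (execute/write/read)
 * combination that would remain after clearing the emulated referenced
 * bit, i.e. EPT_PG_READ:
 *
 *	write bit set                 -> 010 or 110: EPT misconfiguration
 *	write and execute bits clear  -> 000: page simply not present
 *	execute set, write clear      -> 100: valid only when the pmap
 *					 supports exec-only mappings
 */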
  6628. /*
  6629. * pmap_ts_referenced:
  6630. *
  6631. * Return a count of reference bits for a page, clearing those bits.
  6632. * It is not necessary for every reference bit to be cleared, but it
  6633. * is necessary that 0 only be returned when there are truly no
  6634. * reference bits set.
  6635. *
  6636. * As an optimization, update the page's dirty field if a modified bit is
  6637. * found while counting reference bits. This opportunistic update can be
  6638. * performed at low cost and can eliminate the need for some future calls
  6639. * to pmap_is_modified(). However, since this function stops after
  6640. * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
  6641. * dirty pages. Those dirty pages will only be detected by a future call
  6642. * to pmap_is_modified().
  6643. *
  6644. * A DI block is not needed within this function, because
  6645. * invalidations are performed before the PV list lock is
  6646. * released.
  6647. */
  6648. int
  6649. pmap_ts_referenced(vm_page_t m)
  6650. {
  6651. struct md_page *pvh;
  6652. pv_entry_t pv, pvf;
  6653. pmap_t pmap;
  6654. struct rwlock *lock;
  6655. pd_entry_t oldpde, *pde;
  6656. pt_entry_t *pte, PG_A, PG_M, PG_RW;
  6657. vm_offset_t va;
  6658. vm_paddr_t pa;
  6659. int cleared, md_gen, not_cleared, pvh_gen;
  6660. struct spglist free;
  6661. boolean_t demoted;
  6662. KASSERT((m->oflags & VPO_UNMANAGED) == 0,
  6663. ("pmap_ts_referenced: page %p is not managed", m));
  6664. SLIST_INIT(&free);
  6665. cleared = 0;
  6666. pa = VM_PAGE_TO_PHYS(m);
  6667. lock = PHYS_TO_PV_LIST_LOCK(pa);
  6668. pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa);
  6669. rw_wlock(lock);
  6670. retry:
  6671. not_cleared = 0;
  6672. if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
  6673. goto small_mappings;
  6674. pv = pvf;
  6675. do {
  6676. if (pvf == NULL)
  6677. pvf = pv;
  6678. pmap = PV_PMAP(pv);
  6679. if (!PMAP_TRYLOCK(pmap)) {
  6680. pvh_gen = pvh->pv_gen;
  6681. rw_wunlock(lock);
  6682. PMAP_LOCK(pmap);
  6683. rw_wlock(lock);
  6684. if (pvh_gen != pvh->pv_gen) {
  6685. PMAP_UNLOCK(pmap);
  6686. goto retry;
  6687. }
  6688. }
  6689. PG_A = pmap_accessed_bit(pmap);
  6690. PG_M = pmap_modified_bit(pmap);
  6691. PG_RW = pmap_rw_bit(pmap);
  6692. va = pv->pv_va;
  6693. pde = pmap_pde(pmap, pv->pv_va);
  6694. oldpde = *pde;
  6695. if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
  6696. /*
  6697. * Although "oldpde" is mapping a 2MB page, because
  6698. * this function is called at a 4KB page granularity,
  6699. * we only update the 4KB page under test.
  6700. */
  6701. vm_page_dirty(m);
  6702. }
  6703. if ((oldpde & PG_A) != 0) {
  6704. /*
  6705. * Since this reference bit is shared by 512 4KB
  6706. * pages, it should not be cleared every time it is
  6707. * tested. Apply a simple "hash" function on the
  6708. * physical page number, the virtual superpage number,
  6709. * and the pmap address to select one 4KB page out of
  6710. * the 512 on which testing the reference bit will
  6711. * result in clearing that reference bit. This
  6712. * function is designed to avoid the selection of the
  6713. * same 4KB page for every 2MB page mapping.
  6714. *
  6715. * On demotion, a mapping that hasn't been referenced
  6716. * is simply destroyed. To avoid the possibility of a
  6717. * subsequent page fault on a demoted wired mapping,
  6718. * always leave its reference bit set. Moreover,
  6719. * since the superpage is wired, the current state of
  6720. * its reference bit won't affect page replacement.
  6721. */
  6722. if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^
  6723. (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
  6724. (oldpde & PG_W) == 0) {
  6725. if (safe_to_clear_referenced(pmap, oldpde)) {
  6726. atomic_clear_long(pde, PG_A);
  6727. pmap_invalidate_page(pmap, pv->pv_va);
  6728. demoted = FALSE;
  6729. } else if (pmap_demote_pde_locked(pmap, pde,
  6730. pv->pv_va, &lock)) {
  6731. /*
  6732. * Remove the mapping to a single page
  6733. * so that a subsequent access may
  6734. * repromote. Since the underlying
  6735. * page table page is fully populated,
  6736. * this removal never frees a page
  6737. * table page.
  6738. */
  6739. demoted = TRUE;
  6740. va += VM_PAGE_TO_PHYS(m) - (oldpde &
  6741. PG_PS_FRAME);
  6742. pte = pmap_pde_to_pte(pde, va);
  6743. pmap_remove_pte(pmap, pte, va, *pde,
  6744. NULL, &lock);
  6745. pmap_invalidate_page(pmap, va);
  6746. } else
  6747. demoted = TRUE;
  6748. if (demoted) {
  6749. /*
  6750. * The superpage mapping was removed
  6751. * entirely and therefore 'pv' is no
  6752. * longer valid.
  6753. */
  6754. if (pvf == pv)
  6755. pvf = NULL;
  6756. pv = NULL;
  6757. }
  6758. cleared++;
  6759. KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
  6760. ("inconsistent pv lock %p %p for page %p",
  6761. lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
  6762. } else
  6763. not_cleared++;
  6764. }
  6765. PMAP_UNLOCK(pmap);
  6766. /* Rotate the PV list if it has more than one entry. */
  6767. if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
  6768. TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
  6769. TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
  6770. pvh->pv_gen++;
  6771. }
  6772. if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
  6773. goto out;
  6774. } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
  6775. small_mappings:
  6776. if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
  6777. goto out;
  6778. pv = pvf;
  6779. do {
  6780. if (pvf == NULL)
  6781. pvf = pv;
  6782. pmap = PV_PMAP(pv);
  6783. if (!PMAP_TRYLOCK(pmap)) {
  6784. pvh_gen = pvh->pv_gen;
  6785. md_gen = m->md.pv_gen;
  6786. rw_wunlock(lock);
  6787. PMAP_LOCK(pmap);
  6788. rw_wlock(lock);
  6789. if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
  6790. PMAP_UNLOCK(pmap);
  6791. goto retry;
  6792. }
  6793. }
  6794. PG_A = pmap_accessed_bit(pmap);
  6795. PG_M = pmap_modified_bit(pmap);
  6796. PG_RW = pmap_rw_bit(pmap);
  6797. pde = pmap_pde(pmap, pv->pv_va);
  6798. KASSERT((*pde & PG_PS) == 0,
  6799. ("pmap_ts_referenced: found a 2mpage in page %p's pv list",
  6800. m));
  6801. pte = pmap_pde_to_pte(pde, pv->pv_va);
  6802. if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
  6803. vm_page_dirty(m);
  6804. if ((*pte & PG_A) != 0) {
  6805. if (safe_to_clear_referenced(pmap, *pte)) {
  6806. atomic_clear_long(pte, PG_A);
  6807. pmap_invalidate_page(pmap, pv->pv_va);
  6808. cleared++;
  6809. } else if ((*pte & PG_W) == 0) {
  6810. /*
  6811. * Wired pages cannot be paged out so
  6812. * doing accessed bit emulation for
  6813. * them is wasted effort. We do the
  6814. * hard work for unwired pages only.
  6815. */
  6816. pmap_remove_pte(pmap, pte, pv->pv_va,
  6817. *pde, &free, &lock);
  6818. pmap_invalidate_page(pmap, pv->pv_va);
  6819. cleared++;
  6820. if (pvf == pv)
  6821. pvf = NULL;
  6822. pv = NULL;
  6823. KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
  6824. ("inconsistent pv lock %p %p for page %p",
  6825. lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
  6826. } else
  6827. not_cleared++;
  6828. }
  6829. PMAP_UNLOCK(pmap);
  6830. /* Rotate the PV list if it has more than one entry. */
  6831. if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
  6832. TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
  6833. TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
  6834. m->md.pv_gen++;
  6835. }
  6836. } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
  6837. not_cleared < PMAP_TS_REFERENCED_MAX);
  6838. out:
  6839. rw_wunlock(lock);
  6840. vm_page_free_pages_toq(&free, true);
  6841. return (cleared + not_cleared);
  6842. }
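/*
 * Two details of the loops above are worth noting: the pv lists are
 * rotated as they are scanned, so successive calls start with different
 * mappings, and the scan stops once PMAP_TS_REFERENCED_MAX reference bits
 * have been counted, which bounds the cost of a call on pages with many
 * mappings.
 */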
  6843. /*
  6844. * Apply the given advice to the specified range of addresses within the
  6845. * given pmap. Depending on the advice, clear the referenced and/or
  6846. * modified flags in each mapping and set the mapped page's dirty field.
  6847. */
  6848. void
  6849. pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
  6850. {
  6851. struct rwlock *lock;
  6852. pml4_entry_t *pml4e;
  6853. pdp_entry_t *pdpe;
  6854. pd_entry_t oldpde, *pde;
  6855. pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V;
  6856. vm_offset_t va, va_next;
  6857. vm_page_t m;
  6858. bool anychanged;
  6859. if (advice != MADV_DONTNEED && advice != MADV_FREE)
  6860. return;
  6861. /*
  6862. * A/D bit emulation requires an alternate code path when clearing
  6863. * the modified and accessed bits below. Since this function is
6864. * advisory in nature, we skip it entirely for pmaps that require
  6865. * A/D bit emulation.
  6866. */
  6867. if (pmap_emulate_ad_bits(pmap))
  6868. return;
  6869. PG_A = pmap_accessed_bit(pmap);
  6870. PG_G = pmap_global_bit(pmap);
  6871. PG_M = pmap_modified_bit(pmap);
  6872. PG_V = pmap_valid_bit(pmap);
  6873. PG_RW = pmap_rw_bit(pmap);
  6874. anychanged = false;
  6875. pmap_delayed_invl_start();
  6876. PMAP_LOCK(pmap);
  6877. for (; sva < eva; sva = va_next) {
  6878. pml4e = pmap_pml4e(pmap, sva);
  6879. if ((*pml4e & PG_V) == 0) {
  6880. va_next = (sva + NBPML4) & ~PML4MASK;
  6881. if (va_next < sva)
  6882. va_next = eva;
  6883. continue;
  6884. }
  6885. pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
  6886. if ((*pdpe & PG_V) == 0) {
  6887. va_next = (sva + NBPDP) & ~PDPMASK;
  6888. if (va_next < sva)
  6889. va_next = eva;
  6890. continue;
  6891. }
  6892. va_next = (sva + NBPDR) & ~PDRMASK;
  6893. if (va_next < sva)
  6894. va_next = eva;
  6895. pde = pmap_pdpe_to_pde(pdpe, sva);
  6896. oldpde = *pde;
  6897. if ((oldpde & PG_V) == 0)
  6898. continue;
  6899. else if ((oldpde & PG_PS) != 0) {
  6900. if ((oldpde & PG_MANAGED) == 0)
  6901. continue;
  6902. lock = NULL;
  6903. if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) {
  6904. if (lock != NULL)
  6905. rw_wunlock(lock);
  6906. /*
  6907. * The large page mapping was destroyed.
  6908. */
  6909. continue;
  6910. }
  6911. /*
  6912. * Unless the page mappings are wired, remove the
  6913. * mapping to a single page so that a subsequent
  6914. * access may repromote. Choosing the last page
  6915. * within the address range [sva, min(va_next, eva))
  6916. * generally results in more repromotions. Since the
  6917. * underlying page table page is fully populated, this
  6918. * removal never frees a page table page.
  6919. */
  6920. if ((oldpde & PG_W) == 0) {
  6921. va = eva;
  6922. if (va > va_next)
  6923. va = va_next;
  6924. va -= PAGE_SIZE;
  6925. KASSERT(va >= sva,
  6926. ("pmap_advise: no address gap"));
  6927. pte = pmap_pde_to_pte(pde, va);
  6928. KASSERT((*pte & PG_V) != 0,
  6929. ("pmap_advise: invalid PTE"));
  6930. pmap_remove_pte(pmap, pte, va, *pde, NULL,
  6931. &lock);
  6932. anychanged = true;
  6933. }
  6934. if (lock != NULL)
  6935. rw_wunlock(lock);
  6936. }
  6937. if (va_next > eva)
  6938. va_next = eva;
  6939. va = va_next;
  6940. for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
  6941. sva += PAGE_SIZE) {
  6942. if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V))
  6943. goto maybe_invlrng;
  6944. else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
  6945. if (advice == MADV_DONTNEED) {
  6946. /*
  6947. * Future calls to pmap_is_modified()
  6948. * can be avoided by making the page
  6949. * dirty now.
  6950. */
  6951. m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
  6952. vm_page_dirty(m);
  6953. }
  6954. atomic_clear_long(pte, PG_M | PG_A);
  6955. } else if ((*pte & PG_A) != 0)
  6956. atomic_clear_long(pte, PG_A);
  6957. else
  6958. goto maybe_invlrng;
  6959. if ((*pte & PG_G) != 0) {
  6960. if (va == va_next)
  6961. va = sva;
  6962. } else
  6963. anychanged = true;
  6964. continue;
  6965. maybe_invlrng:
  6966. if (va != va_next) {
  6967. pmap_invalidate_range(pmap, va, sva);
  6968. va = va_next;
  6969. }
  6970. }
  6971. if (va != va_next)
  6972. pmap_invalidate_range(pmap, va, sva);
  6973. }
  6974. if (anychanged)
  6975. pmap_invalidate_all(pmap);
  6976. PMAP_UNLOCK(pmap);
  6977. pmap_delayed_invl_finish();
  6978. }
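/*
 * Note the MADV_DONTNEED/MADV_FREE distinction in the loop above: for
 * MADV_DONTNEED a modified mapping first dirties the vm_page, so its data
 * survives, whereas for MADV_FREE the modified bit is simply discarded
 * along with the accessed bit, which is what lets the page's contents be
 * reclaimed without being written back.
 */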
  6979. /*
  6980. * Clear the modify bits on the specified physical page.
  6981. */
  6982. void
  6983. pmap_clear_modify(vm_page_t m)
  6984. {
  6985. struct md_page *pvh;
  6986. pmap_t pmap;
  6987. pv_entry_t next_pv, pv;
  6988. pd_entry_t oldpde, *pde;
  6989. pt_entry_t *pte, PG_M, PG_RW;
  6990. struct rwlock *lock;
  6991. vm_offset_t va;
  6992. int md_gen, pvh_gen;
  6993. KASSERT((m->oflags & VPO_UNMANAGED) == 0,
  6994. ("pmap_clear_modify: page %p is not managed", m));
  6995. VM_OBJECT_ASSERT_WLOCKED(m->object);
  6996. KASSERT(!vm_page_xbusied(m),
  6997. ("pmap_clear_modify: page %p is exclusive busied", m));
  6998. /*
  6999. * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
  7000. * If the object containing the page is locked and the page is not
  7001. * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
  7002. */
  7003. if ((m->aflags & PGA_WRITEABLE) == 0)
  7004. return;
  7005. pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
  7006. pa_to_pvh(VM_PAGE_TO_PHYS(m));
  7007. lock = VM_PAGE_TO_PV_LIST_LOCK(m);
  7008. rw_wlock(lock);
  7009. restart:
  7010. TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
  7011. pmap = PV_PMAP(pv);
  7012. if (!PMAP_TRYLOCK(pmap)) {
  7013. pvh_gen = pvh->pv_gen;
  7014. rw_wunlock(lock);
  7015. PMAP_LOCK(pmap);
  7016. rw_wlock(lock);
  7017. if (pvh_gen != pvh->pv_gen) {
  7018. PMAP_UNLOCK(pmap);
  7019. goto restart;
  7020. }
  7021. }
  7022. PG_M = pmap_modified_bit(pmap);
  7023. PG_RW = pmap_rw_bit(pmap);
  7024. va = pv->pv_va;
  7025. pde = pmap_pde(pmap, va);
  7026. oldpde = *pde;
  7027. /* If oldpde has PG_RW set, then it also has PG_M set. */
  7028. if ((oldpde & PG_RW) != 0 &&
  7029. pmap_demote_pde_locked(pmap, pde, va, &lock) &&
  7030. (oldpde & PG_W) == 0) {
  7031. /*
  7032. * Write protect the mapping to a single page so that
  7033. * a subsequent write access may repromote.
  7034. */
  7035. va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME);
  7036. pte = pmap_pde_to_pte(pde, va);
  7037. atomic_clear_long(pte, PG_M | PG_RW);
  7038. vm_page_dirty(m);
  7039. pmap_invalidate_page(pmap, va);
  7040. }
  7041. PMAP_UNLOCK(pmap);
  7042. }
  7043. TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
  7044. pmap = PV_PMAP(pv);
  7045. if (!PMAP_TRYLOCK(pmap)) {
  7046. md_gen = m->md.pv_gen;
  7047. pvh_gen = pvh->pv_gen;
  7048. rw_wunlock(lock);
  7049. PMAP_LOCK(pmap);
  7050. rw_wlock(lock);
  7051. if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
  7052. PMAP_UNLOCK(pmap);
  7053. goto restart;
  7054. }
  7055. }
  7056. PG_M = pmap_modified_bit(pmap);
  7057. PG_RW = pmap_rw_bit(pmap);
  7058. pde = pmap_pde(pmap, pv->pv_va);
  7059. KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
  7060. " a 2mpage in page %p's pv list", m));
  7061. pte = pmap_pde_to_pte(pde, pv->pv_va);
  7062. if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
  7063. atomic_clear_long(pte, PG_M);
  7064. pmap_invalidate_page(pmap, pv->pv_va);
  7065. }
  7066. PMAP_UNLOCK(pmap);
  7067. }
  7068. rw_wunlock(lock);
  7069. }
  7070. /*
  7071. * Miscellaneous support routines follow
  7072. */
  7073. /* Adjust the properties for a leaf page table entry. */
  7074. static __inline void
  7075. pmap_pte_props(pt_entry_t *pte, u_long bits, u_long mask)
  7076. {
  7077. u_long opte, npte;
  7078. opte = *(u_long *)pte;
  7079. do {
  7080. npte = opte & ~mask;
  7081. npte |= bits;
  7082. } while (npte != opte && !atomic_fcmpset_long((u_long *)pte, &opte,
  7083. npte));
  7084. }
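/*
 * pmap_pte_props() is a lock-free read-modify-write of a leaf PTE or PDE:
 * "mask" selects which property bits may change and "bits" supplies their
 * new values, with atomic_fcmpset_long() retrying until the update lands.
 * pmap_change_props_locked() below computes matching bits/mask pairs
 * (cache bits, X86_PG_RW, pg_nx) for exactly this kind of update.
 */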
  7085. /*
  7086. * Map a set of physical memory pages into the kernel virtual
  7087. * address space. Return a pointer to where it is mapped. This
  7088. * routine is intended to be used for mapping device memory,
  7089. * NOT real memory.
  7090. */
  7091. static void *
  7092. pmap_mapdev_internal(vm_paddr_t pa, vm_size_t size, int mode, int flags)
  7093. {
  7094. struct pmap_preinit_mapping *ppim;
  7095. vm_offset_t va, offset;
  7096. vm_size_t tmpsize;
  7097. int i;
  7098. offset = pa & PAGE_MASK;
  7099. size = round_page(offset + size);
  7100. pa = trunc_page(pa);
  7101. if (!pmap_initialized) {
  7102. va = 0;
  7103. for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
  7104. ppim = pmap_preinit_mapping + i;
  7105. if (ppim->va == 0) {
  7106. ppim->pa = pa;
  7107. ppim->sz = size;
  7108. ppim->mode = mode;
  7109. ppim->va = virtual_avail;
  7110. virtual_avail += size;
  7111. va = ppim->va;
  7112. break;
  7113. }
  7114. }
  7115. if (va == 0)
  7116. panic("%s: too many preinit mappings", __func__);
  7117. } else {
  7118. /*
  7119. * If we have a preinit mapping, re-use it.
  7120. */
  7121. for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
  7122. ppim = pmap_preinit_mapping + i;
  7123. if (ppim->pa == pa && ppim->sz == size &&
  7124. (ppim->mode == mode ||
  7125. (flags & MAPDEV_SETATTR) == 0))
  7126. return ((void *)(ppim->va + offset));
  7127. }
  7128. /*
  7129. * If the specified range of physical addresses fits within
  7130. * the direct map window, use the direct map.
  7131. */
  7132. if (pa < dmaplimit && pa + size <= dmaplimit) {
  7133. va = PHYS_TO_DMAP(pa);
  7134. if ((flags & MAPDEV_SETATTR) != 0) {
  7135. PMAP_LOCK(kernel_pmap);
  7136. i = pmap_change_props_locked(va, size,
  7137. PROT_NONE, mode, flags);
  7138. PMAP_UNLOCK(kernel_pmap);
  7139. } else
  7140. i = 0;
  7141. if (!i)
  7142. return ((void *)(va + offset));
  7143. }
  7144. va = kva_alloc(size);
  7145. if (va == 0)
  7146. panic("%s: Couldn't allocate KVA", __func__);
  7147. }
  7148. for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
  7149. pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
  7150. pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
  7151. if ((flags & MAPDEV_FLUSHCACHE) != 0)
  7152. pmap_invalidate_cache_range(va, va + tmpsize);
  7153. return ((void *)(va + offset));
  7154. }
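/*
 * To summarize the cases above: before pmap_init() has run, the mapping is
 * recorded in the small pmap_preinit_mapping[] table and carved out of
 * virtual_avail; afterwards a matching preinit mapping is reused, a
 * physical range that fits below dmaplimit is served out of the direct
 * map, and everything else gets fresh KVA from kva_alloc() and is entered
 * page by page with pmap_kenter_attr().
 */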
  7155. void *
  7156. pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
  7157. {
  7158. return (pmap_mapdev_internal(pa, size, mode, MAPDEV_FLUSHCACHE |
  7159. MAPDEV_SETATTR));
  7160. }
  7161. void *
  7162. pmap_mapdev(vm_paddr_t pa, vm_size_t size)
  7163. {
  7164. return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
  7165. }
  7166. void *
  7167. pmap_mapdev_pciecfg(vm_paddr_t pa, vm_size_t size)
  7168. {
  7169. return (pmap_mapdev_internal(pa, size, PAT_UNCACHEABLE,
  7170. MAPDEV_SETATTR));
  7171. }
  7172. void *
  7173. pmap_mapbios(vm_paddr_t pa, vm_size_t size)
  7174. {
  7175. return (pmap_mapdev_internal(pa, size, PAT_WRITE_BACK,
  7176. MAPDEV_FLUSHCACHE));
  7177. }
  7178. void
  7179. pmap_unmapdev(vm_offset_t va, vm_size_t size)
  7180. {
  7181. struct pmap_preinit_mapping *ppim;
  7182. vm_offset_t offset;
  7183. int i;
7184. /* If pmap_mapdev() returned a direct map address, there is nothing to undo. */
  7185. if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
  7186. return;
  7187. offset = va & PAGE_MASK;
  7188. size = round_page(offset + size);
  7189. va = trunc_page(va);
  7190. for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
  7191. ppim = pmap_preinit_mapping + i;
  7192. if (ppim->va == va && ppim->sz == size) {
  7193. if (pmap_initialized)
  7194. return;
  7195. ppim->pa = 0;
  7196. ppim->va = 0;
  7197. ppim->sz = 0;
  7198. ppim->mode = 0;
  7199. if (va + size == virtual_avail)
  7200. virtual_avail = va;
  7201. return;
  7202. }
  7203. }
  7204. if (pmap_initialized)
  7205. kva_free(va, size);
  7206. }
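/*
 * Illustrative sketch only (not part of the pmap): a consumer of the pair
 * above maps a device register window and later tears it down.  With
 * hypothetical values "bar_pa" and "bar_size" taken from a device BAR,
 * that looks roughly like:
 *
 *	void *regs = pmap_mapdev(bar_pa, bar_size);
 *	...access the registers through "regs"...
 *	pmap_unmapdev((vm_offset_t)regs, bar_size);
 *
 * As the code above shows, pmap_unmapdev() recognizes direct map addresses
 * returned by pmap_mapdev() and leaves them alone.
 */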
  7207. /*
  7208. * Tries to demote a 1GB page mapping.
  7209. */
  7210. static boolean_t
  7211. pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va)
  7212. {
  7213. pdp_entry_t newpdpe, oldpdpe;
  7214. pd_entry_t *firstpde, newpde, *pde;
  7215. pt_entry_t PG_A, PG_M, PG_RW, PG_V;
  7216. vm_paddr_t pdpgpa;
  7217. vm_page_t pdpg;
  7218. PG_A = pmap_accessed_bit(pmap);
  7219. PG_M = pmap_modified_bit(pmap);
  7220. PG_V = pmap_valid_bit(pmap);
  7221. PG_RW = pmap_rw_bit(pmap);
  7222. PMAP_LOCK_ASSERT(pmap, MA_OWNED);
  7223. oldpdpe = *pdpe;
  7224. KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V),
  7225. ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V"));
  7226. if ((pdpg = vm_page_alloc(NULL, va >> PDPSHIFT, VM_ALLOC_INTERRUPT |
  7227. VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
  7228. CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx"
  7229. " in pmap %p", va, pmap);
  7230. return (FALSE);
  7231. }
  7232. pdpgpa = VM_PAGE_TO_PHYS(pdpg);
  7233. firstpde = (pd_entry_t *)PHYS_TO_DMAP(pdpgpa);
  7234. newpdpe = pdpgpa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V;
  7235. KASSERT((oldpdpe & PG_A) != 0,
  7236. ("pmap_demote_pdpe: oldpdpe is missing PG_A"));
  7237. KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW,
  7238. ("pmap_demote_pdpe: oldpdpe is missing PG_M"));
  7239. newpde = oldpdpe;
  7240. /*
  7241. * Initialize the page directory page.
  7242. */
  7243. for (pde = firstpde; pde < firstpde + NPDEPG; pde++) {
  7244. *pde = newpde;
  7245. newpde += NBPDR;
  7246. }
  7247. /*
  7248. * Demote the mapping.
  7249. */
  7250. *pdpe = newpdpe;
  7251. /*
  7252. * Invalidate a stale recursive mapping of the page directory page.
  7253. */
  7254. pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va));
  7255. pmap_pdpe_demotions++;
  7256. CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx"
  7257. " in pmap %p", va, pmap);
  7258. return (TRUE);
  7259. }
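/*
 * In short, demoting a 1GB mapping allocates one page directory page,
 * fills its 512 PDEs with 2MB mappings that inherit the attributes of the
 * original PDPE, and installs that page in place of the 1GB mapping; the
 * only failure mode is being unable to allocate the new page directory
 * page.
 */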
  7260. /*
  7261. * Sets the memory attribute for the specified page.
  7262. */
  7263. void
  7264. pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
  7265. {
  7266. m->md.pat_mode = ma;
  7267. /*
  7268. * If "m" is a normal page, update its direct mapping. This update
  7269. * can be relied upon to perform any cache operations that are
  7270. * required for data coherence.
  7271. */
  7272. if ((m->flags & PG_FICTITIOUS) == 0 &&
  7273. pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
  7274. m->md.pat_mode))
  7275. panic("memory attribute change on the direct map failed");
  7276. }
  7277. /*
  7278. * Changes the specified virtual address range's memory type to that given by
  7279. * the parameter "mode". The specified virtual address range must be
  7280. * completely contained within either the direct map or the kernel map. If
  7281. * the virtual address range is contained within the kernel map, then the
  7282. * memory type for each of the corresponding ranges of the direct map is also
  7283. * changed. (The corresponding ranges of the direct map are those ranges that
  7284. * map the same physical pages as the specified virtual address range.) These
  7285. * changes to the direct map are necessary because Intel describes the
  7286. * behavior of their processors as "undefined" if two or more mappings to the
  7287. * same physical page have different memory types.
  7288. *
  7289. * Returns zero if the change completed successfully, and either EINVAL or
  7290. * ENOMEM if the change failed. Specifically, EINVAL is returned if some part
  7291. * of the virtual address range was not mapped, and ENOMEM is returned if
  7292. * there was insufficient memory available to complete the change. In the
  7293. * latter case, the memory type may have been changed on some part of the
  7294. * virtual address range or the direct map.
  7295. */
  7296. int
  7297. pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
  7298. {
  7299. int error;
  7300. PMAP_LOCK(kernel_pmap);
  7301. error = pmap_change_props_locked(va, size, PROT_NONE, mode,
  7302. MAPDEV_FLUSHCACHE);
  7303. PMAP_UNLOCK(kernel_pmap);
  7304. return (error);
  7305. }
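/*
 * Illustrative sketch only: a framebuffer driver that wants its buffer to
 * be write-combining could, with hypothetical values "fb_pa" and
 * "fb_size", do something like
 *
 *	error = pmap_change_attr(PHYS_TO_DMAP(fb_pa), fb_size,
 *	    PAT_WRITE_COMBINING);
 *
 * and check the returned errno value.  As described above, when a kernel
 * map address is passed instead, the aliasing direct map range is updated
 * as well.
 */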
  7306. /*
  7307. * Changes the specified virtual address range's protections to those
7308. * specified by "prot". As with pmap_change_attr(), protections for aliases
  7309. * in the direct map are updated as well. Protections on aliasing mappings may
  7310. * be a subset of the requested protections; for example, mappings in the direct
  7311. * map are never executable.
  7312. */
  7313. int
  7314. pmap_change_prot(vm_offset_t va, vm_size_t size, vm_prot_t prot)
  7315. {
  7316. int error;
  7317. /* Only supported within the kernel map. */
  7318. if (va < VM_MIN_KERNEL_ADDRESS)
  7319. return (EINVAL);
  7320. PMAP_LOCK(kernel_pmap);
  7321. error = pmap_change_props_locked(va, size, prot, -1,
  7322. MAPDEV_ASSERTVALID);
  7323. PMAP_UNLOCK(kernel_pmap);
  7324. return (error);
  7325. }
  7326. static int
  7327. pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
  7328. int mode, int flags)
  7329. {
  7330. vm_offset_t base, offset, tmpva;
  7331. vm_paddr_t pa_start, pa_end, pa_end1;
  7332. pdp_entry_t *pdpe;
  7333. pd_entry_t *pde, pde_bits, pde_mask;
  7334. pt_entry_t *pte, pte_bits, pte_mask;
  7335. int error;
  7336. bool changed;
  7337. PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
  7338. base = trunc_page(va);
  7339. offset = va & PAGE_MASK;
  7340. size = round_page(offset + size);
  7341. /*
  7342. * Only supported on kernel virtual addresses, including the direct
  7343. * map but excluding the recursive map.
  7344. */
  7345. if (base < DMAP_MIN_ADDRESS)
  7346. return (EINVAL);
  7347. /*
  7348. * Construct our flag sets and masks. "bits" is the subset of
  7349. * "mask" that will be set in each modified PTE.
  7350. *
  7351. * Mappings in the direct map are never allowed to be executable.
  7352. */
  7353. pde_bits = pte_bits = 0;
  7354. pde_mask = pte_mask = 0;
  7355. if (mode != -1) {
  7356. pde_bits |= pmap_cache_bits(kernel_pmap, mode, true);
  7357. pde_mask |= X86_PG_PDE_CACHE;
  7358. pte_bits |= pmap_cache_bits(kernel_pmap, mode, false);
  7359. pte_mask |= X86_PG_PTE_CACHE;
  7360. }
  7361. if (prot != VM_PROT_NONE) {
  7362. if ((prot & VM_PROT_WRITE) != 0) {
  7363. pde_bits |= X86_PG_RW;
  7364. pte_bits |= X86_PG_RW;
  7365. }
  7366. if ((prot & VM_PROT_EXECUTE) == 0 ||
  7367. va < VM_MIN_KERNEL_ADDRESS) {
  7368. pde_bits |= pg_nx;
  7369. pte_bits |= pg_nx;
  7370. }
  7371. pde_mask |= X86_PG_RW | pg_nx;
  7372. pte_mask |= X86_PG_RW | pg_nx;
  7373. }
  7374. /*
  7375. * Pages that aren't mapped aren't supported. Also break down 2MB pages
  7376. * into 4KB pages if required.
  7377. */
  7378. for (tmpva = base; tmpva < base + size; ) {
  7379. pdpe = pmap_pdpe(kernel_pmap, tmpva);
  7380. if (pdpe == NULL || *pdpe == 0) {
  7381. KASSERT((flags & MAPDEV_ASSERTVALID) == 0,
  7382. ("%s: addr %#lx is not mapped", __func__, tmpva));
  7383. return (EINVAL);
  7384. }
  7385. if (*pdpe & PG_PS) {
  7386. /*
  7387. * If the current 1GB page already has the required
  7388. * properties, then we need not demote this page. Just
  7389. * increment tmpva to the next 1GB page frame.
  7390. */
  7391. if ((*pdpe & pde_mask) == pde_bits) {
  7392. tmpva = trunc_1gpage(tmpva) + NBPDP;
  7393. continue;
  7394. }
  7395. /*
  7396. * If the current offset aligns with a 1GB page frame
  7397. * and there is at least 1GB left within the range, then
  7398. * we need not break down this page into 2MB pages.
  7399. */
  7400. if ((tmpva & PDPMASK) == 0 &&
  7401. tmpva + PDPMASK < base + size) {
  7402. tmpva += NBPDP;
  7403. continue;
  7404. }
  7405. if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva))
  7406. return (ENOMEM);
  7407. }
  7408. pde = pmap_pdpe_to_pde(pdpe, tmpva);
  7409. if (*pde == 0) {
  7410. KASSERT((flags & MAPDEV_ASSERTVALID) == 0,
  7411. ("%s: addr %#lx is not mapped", __func__, tmpva));
  7412. return (EINVAL);
  7413. }
  7414. if (*pde & PG_PS) {
  7415. /*
  7416. * If the current 2MB page already has the required
  7417. * properties, then we need not demote this page. Just
  7418. * increment tmpva to the next 2MB page frame.
  7419. */
  7420. if ((*pde & pde_mask) == pde_bits) {
  7421. tmpva = trunc_2mpage(tmpva) + NBPDR;
  7422. continue;
  7423. }
  7424. /*
  7425. * If the current offset aligns with a 2MB page frame
  7426. * and there is at leas