HardenedBSD src tree https://hardenedbsd.org/
  1. /*-
  2. * Copyright (c) 1996, by Steve Passe
  3. * Copyright (c) 2003, by Peter Wemm
  4. * All rights reserved.
  5. *
  6. * Redistribution and use in source and binary forms, with or without
  7. * modification, are permitted provided that the following conditions
  8. * are met:
  9. * 1. Redistributions of source code must retain the above copyright
  10. * notice, this list of conditions and the following disclaimer.
  11. * 2. The name of the developer may NOT be used to endorse or promote products
  12. * derived from this software without specific prior written permission.
  13. *
  14. * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  15. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  16. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  17. * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  18. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  19. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  20. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  21. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  22. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  23. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  24. * SUCH DAMAGE.
  25. */
  26. #include <sys/cdefs.h>
  27. __FBSDID("$FreeBSD$");
  28. #ifdef __i386__
  29. #include "opt_apic.h"
  30. #endif
  31. #include "opt_cpu.h"
  32. #include "opt_kstack_pages.h"
  33. #include "opt_pax.h"
  34. #include "opt_pmap.h"
  35. #include "opt_sched.h"
  36. #include "opt_smp.h"
  37. #include <sys/param.h>
  38. #include <sys/systm.h>
  39. #include <sys/bus.h>
  40. #include <sys/cons.h> /* cngetc() */
  41. #include <sys/cpuset.h>
  42. #ifdef GPROF
  43. #include <sys/gmon.h>
  44. #endif
  45. #include <sys/kdb.h>
  46. #include <sys/kernel.h>
  47. #include <sys/ktr.h>
  48. #include <sys/lock.h>
  49. #include <sys/malloc.h>
  50. #include <sys/memrange.h>
  51. #include <sys/mutex.h>
  52. #include <sys/pcpu.h>
  53. #include <sys/proc.h>
  54. #include <sys/sched.h>
  55. #include <sys/smp.h>
  56. #include <sys/sysctl.h>
  57. #include <vm/vm.h>
  58. #include <vm/vm_param.h>
  59. #include <vm/pmap.h>
  60. #include <vm/vm_kern.h>
  61. #include <vm/vm_extern.h>
  62. #include <vm/vm_map.h>
  63. #include <x86/apicreg.h>
  64. #include <machine/clock.h>
  65. #include <machine/cpu.h>
  66. #include <machine/cputypes.h>
  67. #include <x86/mca.h>
  68. #include <machine/md_var.h>
  69. #include <machine/pcb.h>
  70. #include <machine/psl.h>
  71. #include <machine/smp.h>
  72. #include <machine/specialreg.h>
  73. #include <x86/ucode.h>
  74. static MALLOC_DEFINE(M_CPUS, "cpus", "CPU items");
  75. /* lock region used by kernel profiling */
  76. int mcount_lock;
77. int mp_naps; /* # of Application Processors (APs) */
  78. int boot_cpu_id = -1; /* designated BSP */
  79. /* AP uses this during bootstrap. Do not staticize. */
  80. char *bootSTK;
  81. int bootAP;
  82. /* Free these after use */
  83. void *bootstacks[MAXCPU];
  84. void *dpcpu;
  85. struct pcb stoppcbs[MAXCPU];
  86. struct susppcb **susppcbs;
  87. #ifdef COUNT_IPIS
  88. /* Interrupt counts. */
  89. static u_long *ipi_preempt_counts[MAXCPU];
  90. static u_long *ipi_ast_counts[MAXCPU];
  91. u_long *ipi_invltlb_counts[MAXCPU];
  92. u_long *ipi_invlrng_counts[MAXCPU];
  93. u_long *ipi_invlpg_counts[MAXCPU];
  94. u_long *ipi_invlcache_counts[MAXCPU];
  95. u_long *ipi_rendezvous_counts[MAXCPU];
  96. static u_long *ipi_hardclock_counts[MAXCPU];
  97. #endif
  98. /* Default cpu_ops implementation. */
  99. struct cpu_ops cpu_ops;
  100. /*
  101. * Local data and functions.
  102. */
  103. static volatile cpuset_t ipi_stop_nmi_pending;
  104. volatile cpuset_t resuming_cpus;
  105. volatile cpuset_t toresume_cpus;
106. /* used to hold the APs until we are ready to release them */
  107. struct mtx ap_boot_mtx;
  108. /* Set to 1 once we're ready to let the APs out of the pen. */
  109. volatile int aps_ready = 0;
  110. /*
  111. * Store data from cpu_add() until later in the boot when we actually setup
  112. * the APs.
  113. */
  114. struct cpu_info *cpu_info;
  115. int *apic_cpuids;
  116. int cpu_apic_ids[MAXCPU];
117. _Static_assert(MAXCPU <= MAX_APIC_ID,
118. "MAXCPU cannot be larger than MAX_APIC_ID");
119. _Static_assert(xAPIC_MAX_APIC_ID <= MAX_APIC_ID,
120. "xAPIC_MAX_APIC_ID cannot be larger than MAX_APIC_ID");
  121. /* Holds pending bitmap based IPIs per CPU */
  122. volatile u_int cpu_ipi_pending[MAXCPU];
  123. static void release_aps(void *dummy);
  124. static void cpustop_handler_post(u_int cpu);
  125. #ifdef PAX_HARDENING
  126. static int hyperthreading_allowed;
  127. #else
  128. static int hyperthreading_allowed = 1;
  129. #endif
  130. SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_allowed, CTLFLAG_RDTUN,
  131. &hyperthreading_allowed, 0, "Use Intel HTT logical CPUs");
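/*
 * Note: machdep.hyperthreading_allowed is CTLFLAG_RDTUN, i.e. a boot-time
 * tunable that is read-only at runtime, so it has to be set from the
 * loader, for example in loader.conf(5):
 *
 *	machdep.hyperthreading_allowed="1"
 *
 * On PAX_HARDENING (HardenedBSD) kernels the default above is 0, so SMT
 * sibling threads stay disabled unless the tunable is set explicitly.
 */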
  132. static struct topo_node topo_root;
  133. static int pkg_id_shift;
  134. static int node_id_shift;
  135. static int core_id_shift;
  136. static int disabled_cpus;
  137. struct cache_info {
  138. int id_shift;
  139. int present;
  140. } static caches[MAX_CACHE_LEVELS];
  141. unsigned int boot_address;
  142. #define MiB(v) (v ## ULL << 20)
  143. void
  144. mem_range_AP_init(void)
  145. {
  146. if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP)
  147. mem_range_softc.mr_op->initAP(&mem_range_softc);
  148. }
  149. /*
  150. * Round up to the next power of two, if necessary, and then
  151. * take log2.
  152. * Returns -1 if argument is zero.
  153. */
  154. static __inline int
  155. mask_width(u_int x)
  156. {
  157. return (fls(x << (1 - powerof2(x))) - 1);
  158. }
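/*
 * Worked examples (purely illustrative): mask_width(1) == 0,
 * mask_width(4) == 2, mask_width(6) == 3 (6 rounds up to 8) and
 * mask_width(0) == -1. The result is the number of low APIC ID bits
 * needed to enumerate that many components.
 */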
  159. /*
  160. * Add a cache level to the cache topology description.
  161. */
  162. static int
  163. add_deterministic_cache(int type, int level, int share_count)
  164. {
  165. if (type == 0)
  166. return (0);
  167. if (type > 3) {
  168. printf("unexpected cache type %d\n", type);
  169. return (1);
  170. }
  171. if (type == 2) /* ignore instruction cache */
  172. return (1);
  173. if (level == 0 || level > MAX_CACHE_LEVELS) {
174. printf("unexpected cache level %d\n", level);
  175. return (1);
  176. }
  177. if (caches[level - 1].present) {
  178. printf("WARNING: multiple entries for L%u data cache\n", level);
  179. printf("%u => %u\n", caches[level - 1].id_shift,
  180. mask_width(share_count));
  181. }
  182. caches[level - 1].id_shift = mask_width(share_count);
  183. caches[level - 1].present = 1;
  184. if (caches[level - 1].id_shift > pkg_id_shift) {
  185. printf("WARNING: L%u data cache covers more "
  186. "APIC IDs than a package (%u > %u)\n", level,
  187. caches[level - 1].id_shift, pkg_id_shift);
  188. caches[level - 1].id_shift = pkg_id_shift;
  189. }
  190. if (caches[level - 1].id_shift < core_id_shift) {
  191. printf("WARNING: L%u data cache covers fewer "
  192. "APIC IDs than a core (%u < %u)\n", level,
  193. caches[level - 1].id_shift, core_id_shift);
  194. caches[level - 1].id_shift = core_id_shift;
  195. }
  196. return (1);
  197. }
  198. /*
  199. * Determine topology of processing units and caches for AMD CPUs.
  200. * See:
  201. * - AMD CPUID Specification (Publication # 25481)
  202. * - BKDG for AMD NPT Family 0Fh Processors (Publication # 32559)
  203. * - BKDG For AMD Family 10h Processors (Publication # 31116)
  204. * - BKDG For AMD Family 15h Models 00h-0Fh Processors (Publication # 42301)
  205. * - BKDG For AMD Family 16h Models 00h-0Fh Processors (Publication # 48751)
  206. * - PPR For AMD Family 17h Models 00h-0Fh Processors (Publication # 54945)
  207. */
  208. static void
  209. topo_probe_amd(void)
  210. {
  211. u_int p[4];
  212. uint64_t v;
  213. int level;
  214. int nodes_per_socket;
  215. int share_count;
  216. int type;
  217. int i;
  218. /* No multi-core capability. */
  219. if ((amd_feature2 & AMDID2_CMP) == 0)
  220. return;
  221. /* For families 10h and newer. */
  222. pkg_id_shift = (cpu_procinfo2 & AMDID_COREID_SIZE) >>
  223. AMDID_COREID_SIZE_SHIFT;
  224. /* For 0Fh family. */
  225. if (pkg_id_shift == 0)
  226. pkg_id_shift =
  227. mask_width((cpu_procinfo2 & AMDID_CMP_CORES) + 1);
  228. /*
  229. * Families prior to 16h define the following value as
  230. * cores per compute unit and we don't really care about the AMD
  231. * compute units at the moment. Perhaps we should treat them as
  232. * cores and cores within the compute units as hardware threads,
  233. * but that's up for debate.
  234. * Later families define the value as threads per compute unit,
  235. * so we are following AMD's nomenclature here.
  236. */
  237. if ((amd_feature2 & AMDID2_TOPOLOGY) != 0 &&
  238. CPUID_TO_FAMILY(cpu_id) >= 0x16) {
  239. cpuid_count(0x8000001e, 0, p);
  240. share_count = ((p[1] >> 8) & 0xff) + 1;
  241. core_id_shift = mask_width(share_count);
  242. /*
  243. * For Zen (17h), gather Nodes per Processor. Each node is a
  244. * Zeppelin die; TR and EPYC CPUs will have multiple dies per
  245. * package. Communication latency between dies is higher than
  246. * within them.
  247. */
  248. nodes_per_socket = ((p[2] >> 8) & 0x7) + 1;
  249. node_id_shift = pkg_id_shift - mask_width(nodes_per_socket);
  250. }
  251. if ((amd_feature2 & AMDID2_TOPOLOGY) != 0) {
  252. for (i = 0; ; i++) {
  253. cpuid_count(0x8000001d, i, p);
  254. type = p[0] & 0x1f;
  255. level = (p[0] >> 5) & 0x7;
  256. share_count = 1 + ((p[0] >> 14) & 0xfff);
  257. if (!add_deterministic_cache(type, level, share_count))
  258. break;
  259. }
  260. } else {
  261. if (cpu_exthigh >= 0x80000005) {
  262. cpuid_count(0x80000005, 0, p);
  263. if (((p[2] >> 24) & 0xff) != 0) {
  264. caches[0].id_shift = 0;
  265. caches[0].present = 1;
  266. }
  267. }
  268. if (cpu_exthigh >= 0x80000006) {
  269. cpuid_count(0x80000006, 0, p);
  270. if (((p[2] >> 16) & 0xffff) != 0) {
  271. caches[1].id_shift = 0;
  272. caches[1].present = 1;
  273. }
  274. if (((p[3] >> 18) & 0x3fff) != 0) {
  275. nodes_per_socket = 1;
  276. if ((amd_feature2 & AMDID2_NODE_ID) != 0) {
  277. /*
  278. * Handle multi-node processors that
  279. * have multiple chips, each with its
  280. * own L3 cache, on the same die.
  281. */
  282. v = rdmsr(0xc001100c);
  283. nodes_per_socket = 1 + ((v >> 3) & 0x7);
  284. }
  285. caches[2].id_shift =
  286. pkg_id_shift - mask_width(nodes_per_socket);
  287. caches[2].present = 1;
  288. }
  289. }
  290. }
  291. }
  292. /*
  293. * Determine topology of processing units for Intel CPUs
  294. * using CPUID Leaf 1 and Leaf 4, if supported.
  295. * See:
  296. * - Intel 64 Architecture Processor Topology Enumeration
297. * - Intel 64 and IA-32 Architectures Software Developer’s Manual,
  298. * Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS
  299. * FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS
  300. */
  301. static void
  302. topo_probe_intel_0x4(void)
  303. {
  304. u_int p[4];
  305. int max_cores;
  306. int max_logical;
  307. /* Both zero and one here mean one logical processor per package. */
  308. max_logical = (cpu_feature & CPUID_HTT) != 0 ?
  309. (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1;
  310. if (max_logical <= 1)
  311. return;
  312. if (cpu_high >= 0x4) {
  313. cpuid_count(0x04, 0, p);
  314. max_cores = ((p[0] >> 26) & 0x3f) + 1;
  315. } else
  316. max_cores = 1;
  317. core_id_shift = mask_width(max_logical/max_cores);
  318. KASSERT(core_id_shift >= 0,
  319. ("intel topo: max_cores > max_logical\n"));
  320. pkg_id_shift = core_id_shift + mask_width(max_cores);
  321. }
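/*
 * Example with hypothetical leaf values: a package reporting 16 logical
 * processors (CPUID.1:EBX[23:16]) and 8 cores (CPUID.4:EAX[31:26] + 1)
 * yields core_id_shift = mask_width(16 / 8) = 1 and
 * pkg_id_shift = 1 + mask_width(8) = 4, i.e. APIC ID bit 0 selects the
 * hardware thread and bits 3-1 select the core within the package.
 */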
  322. /*
  323. * Determine topology of processing units for Intel CPUs
  324. * using CPUID Leaf 11, if supported.
  325. * See:
  326. * - Intel 64 Architecture Processor Topology Enumeration
327. * - Intel 64 and IA-32 Architectures Software Developer’s Manual,
  328. * Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS
  329. * FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS
  330. */
  331. static void
  332. topo_probe_intel_0xb(void)
  333. {
  334. u_int p[4];
  335. int bits;
  336. int type;
  337. int i;
  338. /* Fall back if CPU leaf 11 doesn't really exist. */
  339. cpuid_count(0x0b, 0, p);
  340. if (p[1] == 0) {
  341. topo_probe_intel_0x4();
  342. return;
  343. }
  344. /* We only support three levels for now. */
  345. for (i = 0; ; i++) {
  346. cpuid_count(0x0b, i, p);
  347. bits = p[0] & 0x1f;
  348. type = (p[2] >> 8) & 0xff;
  349. if (type == 0)
  350. break;
  351. /* TODO: check for duplicate (re-)assignment */
  352. if (type == CPUID_TYPE_SMT)
  353. core_id_shift = bits;
  354. else if (type == CPUID_TYPE_CORE)
  355. pkg_id_shift = bits;
  356. else
  357. printf("unknown CPU level type %d\n", type);
  358. }
  359. if (pkg_id_shift < core_id_shift) {
  360. printf("WARNING: core covers more APIC IDs than a package\n");
  361. core_id_shift = pkg_id_shift;
  362. }
  363. }
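/*
 * Example with hypothetical leaf 0xb values: an SMT level reporting
 * EAX[4:0] = 1 and a core level reporting EAX[4:0] = 5 give
 * core_id_shift = 1 and pkg_id_shift = 5; bit 0 of the APIC ID is then
 * the thread, bits 4-1 the core and the remaining bits the package.
 */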
  364. /*
  365. * Determine topology of caches for Intel CPUs.
  366. * See:
  367. * - Intel 64 Architecture Processor Topology Enumeration
  368. * - Intel 64 and IA-32 Architectures Software Developer’s Manual
  369. * Volume 2A: Instruction Set Reference, A-M,
  370. * CPUID instruction
  371. */
  372. static void
  373. topo_probe_intel_caches(void)
  374. {
  375. u_int p[4];
  376. int level;
  377. int share_count;
  378. int type;
  379. int i;
  380. if (cpu_high < 0x4) {
  381. /*
  382. * Available cache level and sizes can be determined
  383. * via CPUID leaf 2, but that requires a huge table of hardcoded
  384. * values, so for now just assume L1 and L2 caches potentially
  385. * shared only by HTT processing units, if HTT is present.
  386. */
  387. caches[0].id_shift = pkg_id_shift;
  388. caches[0].present = 1;
  389. caches[1].id_shift = pkg_id_shift;
  390. caches[1].present = 1;
  391. return;
  392. }
  393. for (i = 0; ; i++) {
  394. cpuid_count(0x4, i, p);
  395. type = p[0] & 0x1f;
  396. level = (p[0] >> 5) & 0x7;
  397. share_count = 1 + ((p[0] >> 14) & 0xfff);
  398. if (!add_deterministic_cache(type, level, share_count))
  399. break;
  400. }
  401. }
  402. /*
  403. * Determine topology of processing units and caches for Intel CPUs.
  404. * See:
  405. * - Intel 64 Architecture Processor Topology Enumeration
  406. */
  407. static void
  408. topo_probe_intel(void)
  409. {
  410. /*
  411. * Note that 0x1 <= cpu_high < 4 case should be
  412. * compatible with topo_probe_intel_0x4() logic when
  413. * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1)
  414. * or it should trigger the fallback otherwise.
  415. */
  416. if (cpu_high >= 0xb)
  417. topo_probe_intel_0xb();
  418. else if (cpu_high >= 0x1)
  419. topo_probe_intel_0x4();
  420. topo_probe_intel_caches();
  421. }
  422. /*
  423. * Topology information is queried only on BSP, on which this
  424. * code runs and for which it can query CPUID information.
  425. * Then topology is extrapolated on all packages using an
  426. * assumption that APIC ID to hardware component ID mapping is
427. * homogeneous.
428. * That doesn't necessarily imply that the topology is uniform.
  429. */
  430. void
  431. topo_probe(void)
  432. {
  433. static int cpu_topo_probed = 0;
  434. struct x86_topo_layer {
  435. int type;
  436. int subtype;
  437. int id_shift;
  438. } topo_layers[MAX_CACHE_LEVELS + 4];
  439. struct topo_node *parent;
  440. struct topo_node *node;
  441. int layer;
  442. int nlayers;
  443. int node_id;
  444. int i;
  445. if (cpu_topo_probed)
  446. return;
  447. CPU_ZERO(&logical_cpus_mask);
  448. if (mp_ncpus <= 1)
  449. ; /* nothing */
  450. else if (cpu_vendor_id == CPU_VENDOR_AMD ||
  451. cpu_vendor_id == CPU_VENDOR_HYGON)
  452. topo_probe_amd();
  453. else if (cpu_vendor_id == CPU_VENDOR_INTEL)
  454. topo_probe_intel();
  455. KASSERT(pkg_id_shift >= core_id_shift,
  456. ("bug in APIC topology discovery"));
  457. nlayers = 0;
  458. bzero(topo_layers, sizeof(topo_layers));
  459. topo_layers[nlayers].type = TOPO_TYPE_PKG;
  460. topo_layers[nlayers].id_shift = pkg_id_shift;
  461. if (bootverbose)
  462. printf("Package ID shift: %u\n", topo_layers[nlayers].id_shift);
  463. nlayers++;
  464. if (pkg_id_shift > node_id_shift && node_id_shift != 0) {
  465. topo_layers[nlayers].type = TOPO_TYPE_GROUP;
  466. topo_layers[nlayers].id_shift = node_id_shift;
  467. if (bootverbose)
  468. printf("Node ID shift: %u\n",
  469. topo_layers[nlayers].id_shift);
  470. nlayers++;
  471. }
  472. /*
  473. * Consider all caches to be within a package/chip
  474. * and "in front" of all sub-components like
  475. * cores and hardware threads.
  476. */
  477. for (i = MAX_CACHE_LEVELS - 1; i >= 0; --i) {
  478. if (caches[i].present) {
  479. if (node_id_shift != 0)
  480. KASSERT(caches[i].id_shift <= node_id_shift,
  481. ("bug in APIC topology discovery"));
  482. KASSERT(caches[i].id_shift <= pkg_id_shift,
  483. ("bug in APIC topology discovery"));
  484. KASSERT(caches[i].id_shift >= core_id_shift,
  485. ("bug in APIC topology discovery"));
  486. topo_layers[nlayers].type = TOPO_TYPE_CACHE;
  487. topo_layers[nlayers].subtype = i + 1;
  488. topo_layers[nlayers].id_shift = caches[i].id_shift;
  489. if (bootverbose)
  490. printf("L%u cache ID shift: %u\n",
  491. topo_layers[nlayers].subtype,
  492. topo_layers[nlayers].id_shift);
  493. nlayers++;
  494. }
  495. }
  496. if (pkg_id_shift > core_id_shift) {
  497. topo_layers[nlayers].type = TOPO_TYPE_CORE;
  498. topo_layers[nlayers].id_shift = core_id_shift;
  499. if (bootverbose)
  500. printf("Core ID shift: %u\n",
  501. topo_layers[nlayers].id_shift);
  502. nlayers++;
  503. }
  504. topo_layers[nlayers].type = TOPO_TYPE_PU;
  505. topo_layers[nlayers].id_shift = 0;
  506. nlayers++;
  507. topo_init_root(&topo_root);
  508. for (i = 0; i <= max_apic_id; ++i) {
  509. if (!cpu_info[i].cpu_present)
  510. continue;
  511. parent = &topo_root;
  512. for (layer = 0; layer < nlayers; ++layer) {
  513. node_id = i >> topo_layers[layer].id_shift;
  514. parent = topo_add_node_by_hwid(parent, node_id,
  515. topo_layers[layer].type,
  516. topo_layers[layer].subtype);
  517. }
  518. }
  519. parent = &topo_root;
  520. for (layer = 0; layer < nlayers; ++layer) {
  521. node_id = boot_cpu_id >> topo_layers[layer].id_shift;
  522. node = topo_find_node_by_hwid(parent, node_id,
  523. topo_layers[layer].type,
  524. topo_layers[layer].subtype);
  525. topo_promote_child(node);
  526. parent = node;
  527. }
  528. cpu_topo_probed = 1;
  529. }
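/*
 * Illustrative decomposition with hypothetical shifts: given
 * pkg_id_shift = 4, a package-wide L3 at id_shift = 4, a per-core L2 at
 * id_shift = 1 and core_id_shift = 1, APIC ID 0x0b is filed under
 * package 0 (0x0b >> 4), L2/core 5 (0x0b >> 1) and PU 0x0b. The second
 * loop above then promotes every node on the BSP's chain so the boot
 * CPU sorts first at each level of the tree.
 */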
  530. /*
  531. * Assign logical CPU IDs to local APICs.
  532. */
  533. void
  534. assign_cpu_ids(void)
  535. {
  536. struct topo_node *node;
  537. u_int smt_mask;
  538. int nhyper;
  539. smt_mask = (1u << core_id_shift) - 1;
  540. /*
  541. * Assign CPU IDs to local APIC IDs and disable any CPUs
  542. * beyond MAXCPU. CPU 0 is always assigned to the BSP.
  543. */
  544. mp_ncpus = 0;
  545. nhyper = 0;
  546. TOPO_FOREACH(node, &topo_root) {
  547. if (node->type != TOPO_TYPE_PU)
  548. continue;
  549. if ((node->hwid & smt_mask) != (boot_cpu_id & smt_mask))
  550. cpu_info[node->hwid].cpu_hyperthread = 1;
  551. if (resource_disabled("lapic", node->hwid)) {
  552. if (node->hwid != boot_cpu_id)
  553. cpu_info[node->hwid].cpu_disabled = 1;
  554. else
  555. printf("Cannot disable BSP, APIC ID = %d\n",
  556. node->hwid);
  557. }
  558. if (!hyperthreading_allowed &&
  559. cpu_info[node->hwid].cpu_hyperthread)
  560. cpu_info[node->hwid].cpu_disabled = 1;
  561. if (mp_ncpus >= MAXCPU)
  562. cpu_info[node->hwid].cpu_disabled = 1;
  563. if (cpu_info[node->hwid].cpu_disabled) {
  564. disabled_cpus++;
  565. continue;
  566. }
  567. if (cpu_info[node->hwid].cpu_hyperthread)
  568. nhyper++;
  569. cpu_apic_ids[mp_ncpus] = node->hwid;
  570. apic_cpuids[node->hwid] = mp_ncpus;
  571. topo_set_pu_id(node, mp_ncpus);
  572. mp_ncpus++;
  573. }
  574. KASSERT(mp_maxid >= mp_ncpus - 1,
  575. ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
  576. mp_ncpus));
  577. mp_ncores = mp_ncpus - nhyper;
  578. smp_threads_per_core = mp_ncpus / mp_ncores;
  579. }
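/*
 * For example, a fully populated 4-core/8-thread package ends up with
 * mp_ncpus = 8 and nhyper = 4, so mp_ncores = 4 and
 * smp_threads_per_core = 2. With hyperthreading_allowed clear, the SMT
 * siblings are disabled above, leaving mp_ncpus = 4, nhyper = 0 and
 * smp_threads_per_core = 1.
 */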
  580. /*
  581. * Print various information about the SMP system hardware and setup.
  582. */
  583. void
  584. cpu_mp_announce(void)
  585. {
  586. struct topo_node *node;
  587. const char *hyperthread;
  588. struct topo_analysis topology;
  589. printf("FreeBSD/SMP: ");
  590. if (topo_analyze(&topo_root, 1, &topology)) {
  591. printf("%d package(s)", topology.entities[TOPO_LEVEL_PKG]);
  592. if (topology.entities[TOPO_LEVEL_GROUP] > 1)
  593. printf(" x %d groups",
  594. topology.entities[TOPO_LEVEL_GROUP]);
  595. if (topology.entities[TOPO_LEVEL_CACHEGROUP] > 1)
  596. printf(" x %d cache groups",
  597. topology.entities[TOPO_LEVEL_CACHEGROUP]);
  598. if (topology.entities[TOPO_LEVEL_CORE] > 0)
  599. printf(" x %d core(s)",
  600. topology.entities[TOPO_LEVEL_CORE]);
  601. if (topology.entities[TOPO_LEVEL_THREAD] > 1)
  602. printf(" x %d hardware threads",
  603. topology.entities[TOPO_LEVEL_THREAD]);
  604. } else {
  605. printf("Non-uniform topology");
  606. }
  607. printf("\n");
  608. if (disabled_cpus) {
  609. printf("FreeBSD/SMP Online: ");
  610. if (topo_analyze(&topo_root, 0, &topology)) {
  611. printf("%d package(s)",
  612. topology.entities[TOPO_LEVEL_PKG]);
  613. if (topology.entities[TOPO_LEVEL_GROUP] > 1)
  614. printf(" x %d groups",
  615. topology.entities[TOPO_LEVEL_GROUP]);
  616. if (topology.entities[TOPO_LEVEL_CACHEGROUP] > 1)
  617. printf(" x %d cache groups",
  618. topology.entities[TOPO_LEVEL_CACHEGROUP]);
  619. if (topology.entities[TOPO_LEVEL_CORE] > 0)
  620. printf(" x %d core(s)",
  621. topology.entities[TOPO_LEVEL_CORE]);
  622. if (topology.entities[TOPO_LEVEL_THREAD] > 1)
  623. printf(" x %d hardware threads",
  624. topology.entities[TOPO_LEVEL_THREAD]);
  625. } else {
  626. printf("Non-uniform topology");
  627. }
  628. printf("\n");
  629. }
  630. if (!bootverbose)
  631. return;
  632. TOPO_FOREACH(node, &topo_root) {
  633. switch (node->type) {
  634. case TOPO_TYPE_PKG:
  635. printf("Package HW ID = %u\n", node->hwid);
  636. break;
  637. case TOPO_TYPE_CORE:
  638. printf("\tCore HW ID = %u\n", node->hwid);
  639. break;
  640. case TOPO_TYPE_PU:
  641. if (cpu_info[node->hwid].cpu_hyperthread)
  642. hyperthread = "/HT";
  643. else
  644. hyperthread = "";
  645. if (node->subtype == 0)
  646. printf("\t\tCPU (AP%s): APIC ID: %u"
  647. "(disabled)\n", hyperthread, node->hwid);
  648. else if (node->id == 0)
  649. printf("\t\tCPU0 (BSP): APIC ID: %u\n",
  650. node->hwid);
  651. else
  652. printf("\t\tCPU%u (AP%s): APIC ID: %u\n",
  653. node->id, hyperthread, node->hwid);
  654. break;
  655. default:
  656. /* ignored */
  657. break;
  658. }
  659. }
  660. }
  661. /*
  662. * Add a scheduling group, a group of logical processors sharing
  663. * a particular cache (and, thus having an affinity), to the scheduling
  664. * topology.
  665. * This function recursively works on lower level caches.
  666. */
  667. static void
  668. x86topo_add_sched_group(struct topo_node *root, struct cpu_group *cg_root)
  669. {
  670. struct topo_node *node;
  671. int nchildren;
  672. int ncores;
  673. int i;
  674. KASSERT(root->type == TOPO_TYPE_SYSTEM || root->type == TOPO_TYPE_CACHE ||
  675. root->type == TOPO_TYPE_GROUP,
  676. ("x86topo_add_sched_group: bad type: %u", root->type));
  677. CPU_COPY(&root->cpuset, &cg_root->cg_mask);
  678. cg_root->cg_count = root->cpu_count;
  679. if (root->type == TOPO_TYPE_SYSTEM)
  680. cg_root->cg_level = CG_SHARE_NONE;
  681. else
  682. cg_root->cg_level = root->subtype;
  683. /*
  684. * Check how many core nodes we have under the given root node.
  685. * If we have multiple logical processors, but not multiple
  686. * cores, then those processors must be hardware threads.
  687. */
  688. ncores = 0;
  689. node = root;
  690. while (node != NULL) {
  691. if (node->type != TOPO_TYPE_CORE) {
  692. node = topo_next_node(root, node);
  693. continue;
  694. }
  695. ncores++;
  696. node = topo_next_nonchild_node(root, node);
  697. }
  698. if (cg_root->cg_level != CG_SHARE_NONE &&
  699. root->cpu_count > 1 && ncores < 2)
  700. cg_root->cg_flags = CG_FLAG_SMT;
  701. /*
  702. * Find out how many cache nodes we have under the given root node.
  703. * We ignore cache nodes that cover all the same processors as the
  704. * root node. Also, we do not descend below found cache nodes.
  705. * That is, we count top-level "non-redundant" caches under the root
  706. * node.
  707. */
  708. nchildren = 0;
  709. node = root;
  710. while (node != NULL) {
  711. if ((node->type != TOPO_TYPE_GROUP &&
  712. node->type != TOPO_TYPE_CACHE) ||
  713. (root->type != TOPO_TYPE_SYSTEM &&
  714. CPU_CMP(&node->cpuset, &root->cpuset) == 0)) {
  715. node = topo_next_node(root, node);
  716. continue;
  717. }
  718. nchildren++;
  719. node = topo_next_nonchild_node(root, node);
  720. }
  721. cg_root->cg_child = smp_topo_alloc(nchildren);
  722. cg_root->cg_children = nchildren;
  723. /*
  724. * Now find again the same cache nodes as above and recursively
  725. * build scheduling topologies for them.
  726. */
  727. node = root;
  728. i = 0;
  729. while (node != NULL) {
  730. if ((node->type != TOPO_TYPE_GROUP &&
  731. node->type != TOPO_TYPE_CACHE) ||
  732. (root->type != TOPO_TYPE_SYSTEM &&
  733. CPU_CMP(&node->cpuset, &root->cpuset) == 0)) {
  734. node = topo_next_node(root, node);
  735. continue;
  736. }
  737. cg_root->cg_child[i].cg_parent = cg_root;
  738. x86topo_add_sched_group(node, &cg_root->cg_child[i]);
  739. i++;
  740. node = topo_next_nonchild_node(root, node);
  741. }
  742. }
  743. /*
  744. * Build the MI scheduling topology from the discovered hardware topology.
  745. */
  746. struct cpu_group *
  747. cpu_topo(void)
  748. {
  749. struct cpu_group *cg_root;
  750. if (mp_ncpus <= 1)
  751. return (smp_topo_none());
  752. cg_root = smp_topo_alloc(1);
  753. x86topo_add_sched_group(&topo_root, cg_root);
  754. return (cg_root);
  755. }
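/*
 * Sketch of the resulting group tree for a hypothetical single-package,
 * two-core, SMT-2 CPU with a shared L3 and per-core L2/L1:
 *
 *	root (CG_SHARE_NONE, 4 CPUs)
 *	  L3 group (CG_SHARE_L3, 4 CPUs)
 *	    L2 group (CG_SHARE_L2, 2 CPUs, CG_FLAG_SMT)
 *	    L2 group (CG_SHARE_L2, 2 CPUs, CG_FLAG_SMT)
 *
 * The per-core L1 does not appear because it covers exactly the same
 * CPUs as the L2 above it and is therefore skipped as redundant.
 */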
  756. static void
  757. cpu_alloc(void *dummy __unused)
  758. {
  759. /*
  760. * Dynamically allocate the arrays that depend on the
  761. * maximum APIC ID.
  762. */
  763. cpu_info = malloc(sizeof(*cpu_info) * (max_apic_id + 1), M_CPUS,
  764. M_WAITOK | M_ZERO);
  765. apic_cpuids = malloc(sizeof(*apic_cpuids) * (max_apic_id + 1), M_CPUS,
  766. M_WAITOK | M_ZERO);
  767. }
  768. SYSINIT(cpu_alloc, SI_SUB_CPU, SI_ORDER_FIRST, cpu_alloc, NULL);
  769. /*
  770. * Add a logical CPU to the topology.
  771. */
  772. void
  773. cpu_add(u_int apic_id, char boot_cpu)
  774. {
  775. if (apic_id > max_apic_id) {
  776. panic("SMP: APIC ID %d too high", apic_id);
  777. return;
  778. }
  779. KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %u added twice",
  780. apic_id));
  781. cpu_info[apic_id].cpu_present = 1;
  782. if (boot_cpu) {
  783. KASSERT(boot_cpu_id == -1,
  784. ("CPU %u claims to be BSP, but CPU %u already is", apic_id,
  785. boot_cpu_id));
  786. boot_cpu_id = apic_id;
  787. cpu_info[apic_id].cpu_bsp = 1;
  788. }
  789. if (bootverbose)
  790. printf("SMP: Added CPU %u (%s)\n", apic_id, boot_cpu ? "BSP" :
  791. "AP");
  792. }
  793. void
  794. cpu_mp_setmaxid(void)
  795. {
  796. /*
  797. * mp_ncpus and mp_maxid should be already set by calls to cpu_add().
  798. * If there were no calls to cpu_add() assume this is a UP system.
  799. */
  800. if (mp_ncpus == 0)
  801. mp_ncpus = 1;
  802. }
  803. int
  804. cpu_mp_probe(void)
  805. {
  806. /*
  807. * Always record BSP in CPU map so that the mbuf init code works
  808. * correctly.
  809. */
  810. CPU_SETOF(0, &all_cpus);
  811. return (mp_ncpus > 1);
  812. }
  813. /* Allocate memory for the AP trampoline. */
  814. void
  815. alloc_ap_trampoline(vm_paddr_t *physmap, unsigned int *physmap_idx)
  816. {
  817. unsigned int i;
  818. bool allocated;
  819. allocated = false;
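/*
 * Walk the physmap (start, end) pairs from the last entry downwards;
 * i is unsigned, so when it wraps past zero the "i <= *physmap_idx"
 * test fails and the loop terminates.
 */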
  820. for (i = *physmap_idx; i <= *physmap_idx; i -= 2) {
  821. /*
  822. * Find a memory region big enough and below the 1MB boundary
  823. * for the trampoline code.
  824. * NB: needs to be page aligned.
  825. */
  826. if (physmap[i] >= MiB(1) ||
  827. (trunc_page(physmap[i + 1]) - round_page(physmap[i])) <
  828. round_page(bootMP_size))
  829. continue;
  830. allocated = true;
  831. /*
  832. * Try to steal from the end of the region to mimic previous
  833. * behaviour, else fallback to steal from the start.
  834. */
  835. if (physmap[i + 1] < MiB(1)) {
  836. boot_address = trunc_page(physmap[i + 1]);
  837. if ((physmap[i + 1] - boot_address) < bootMP_size)
  838. boot_address -= round_page(bootMP_size);
  839. physmap[i + 1] = boot_address;
  840. } else {
  841. boot_address = round_page(physmap[i]);
  842. physmap[i] = boot_address + round_page(bootMP_size);
  843. }
  844. if (physmap[i] == physmap[i + 1] && *physmap_idx != 0) {
  845. memmove(&physmap[i], &physmap[i + 2],
  846. sizeof(*physmap) * (*physmap_idx - i + 2));
  847. *physmap_idx -= 2;
  848. }
  849. break;
  850. }
  851. if (!allocated) {
  852. boot_address = basemem * 1024 - bootMP_size;
  853. if (bootverbose)
  854. printf(
  855. "Cannot find enough space for the boot trampoline, placing it at %#x",
  856. boot_address);
  857. }
  858. }
  859. /*
860. * AP CPUs call this to initialize themselves.
  861. */
  862. void
  863. init_secondary_tail(void)
  864. {
  865. u_int cpuid;
  866. pmap_activate_boot(vmspace_pmap(proc0.p_vmspace));
  867. /*
  868. * On real hardware, switch to x2apic mode if possible. Do it
  869. * after aps_ready was signalled, to avoid manipulating the
  870. * mode while BSP might still want to send some IPI to us
  871. * (second startup IPI is ignored on modern hardware etc).
  872. */
  873. lapic_xapic_mode();
  874. /* Initialize the PAT MSR. */
  875. pmap_init_pat();
  876. /* set up CPU registers and state */
  877. cpu_setregs();
  878. /* set up SSE/NX */
  879. initializecpu();
  880. /* set up FPU state on the AP */
  881. #ifdef __amd64__
  882. fpuinit();
  883. #else
  884. npxinit(false);
  885. #endif
  886. if (cpu_ops.cpu_init)
  887. cpu_ops.cpu_init();
  888. /* A quick check from sanity claus */
  889. cpuid = PCPU_GET(cpuid);
  890. if (PCPU_GET(apic_id) != lapic_id()) {
  891. printf("SMP: cpuid = %d\n", cpuid);
  892. printf("SMP: actual apic_id = %d\n", lapic_id());
  893. printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
  894. panic("cpuid mismatch! boom!!");
  895. }
  896. /* Initialize curthread. */
  897. KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
  898. PCPU_SET(curthread, PCPU_GET(idlethread));
  899. mtx_lock_spin(&ap_boot_mtx);
  900. mca_init();
  901. /* Init local apic for irq's */
  902. lapic_setup(1);
  903. /* Set memory range attributes for this CPU to match the BSP */
  904. mem_range_AP_init();
  905. smp_cpus++;
  906. CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid);
  907. if (bootverbose)
  908. printf("SMP: AP CPU #%d Launched!\n", cpuid);
  909. else
  910. printf("%s%d%s", smp_cpus == 2 ? "Launching APs: " : "",
  911. cpuid, smp_cpus == mp_ncpus ? "\n" : " ");
  912. /* Determine if we are a logical CPU. */
  913. if (cpu_info[PCPU_GET(apic_id)].cpu_hyperthread)
  914. CPU_SET(cpuid, &logical_cpus_mask);
  915. if (bootverbose)
  916. lapic_dump("AP");
  917. if (smp_cpus == mp_ncpus) {
  918. /* enable IPI's, tlb shootdown, freezes etc */
  919. atomic_store_rel_int(&smp_started, 1);
  920. }
  921. #ifdef __amd64__
  922. /*
  923. * Enable global pages TLB extension
  924. * This also implicitly flushes the TLB
  925. */
  926. load_cr4(rcr4() | CR4_PGE);
  927. if (pmap_pcid_enabled)
  928. load_cr4(rcr4() | CR4_PCIDE);
  929. load_ds(_udatasel);
  930. load_es(_udatasel);
  931. load_fs(_ufssel);
  932. #endif
  933. mtx_unlock_spin(&ap_boot_mtx);
934. /* Wait until all the APs are up. */
  935. while (atomic_load_acq_int(&smp_started) == 0)
  936. ia32_pause();
  937. #ifndef EARLY_AP_STARTUP
  938. /* Start per-CPU event timers. */
  939. cpu_initclocks_ap();
  940. #endif
  941. /*
  942. * Assert that smp_after_idle_runnable condition is reasonable.
  943. */
  944. MPASS(PCPU_GET(curpcb) == NULL);
  945. sched_throw(NULL);
  946. panic("scheduler returned us to %s", __func__);
  947. /* NOTREACHED */
  948. }
  949. static void
  950. smp_after_idle_runnable(void *arg __unused)
  951. {
  952. struct pcpu *pc;
  953. int cpu;
  954. for (cpu = 1; cpu < mp_ncpus; cpu++) {
  955. pc = pcpu_find(cpu);
  956. while (atomic_load_ptr(&pc->pc_curpcb) == (uintptr_t)NULL)
  957. cpu_spinwait();
  958. kmem_free((vm_offset_t)bootstacks[cpu], kstack_pages *
  959. PAGE_SIZE);
  960. }
  961. }
  962. SYSINIT(smp_after_idle_runnable, SI_SUB_SMP, SI_ORDER_ANY,
  963. smp_after_idle_runnable, NULL);
  964. /*
  965. * We tell the I/O APIC code about all the CPUs we want to receive
  966. * interrupts. If we don't want certain CPUs to receive IRQs we
  967. * can simply not tell the I/O APIC code about them in this function.
  968. * We also do not tell it about the BSP since it tells itself about
  969. * the BSP internally to work with UP kernels and on UP machines.
  970. */
  971. void
  972. set_interrupt_apic_ids(void)
  973. {
  974. u_int i, apic_id;
  975. for (i = 0; i < MAXCPU; i++) {
  976. apic_id = cpu_apic_ids[i];
  977. if (apic_id == -1)
  978. continue;
  979. if (cpu_info[apic_id].cpu_bsp)
  980. continue;
  981. if (cpu_info[apic_id].cpu_disabled)
  982. continue;
  983. /* Don't let hyperthreads service interrupts. */
  984. if (cpu_info[apic_id].cpu_hyperthread)
  985. continue;
  986. intr_add_cpu(i);
  987. }
  988. }
  989. #ifdef COUNT_XINVLTLB_HITS
  990. u_int xhits_gbl[MAXCPU];
  991. u_int xhits_pg[MAXCPU];
  992. u_int xhits_rng[MAXCPU];
  993. static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, "");
  994. SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
  995. sizeof(xhits_gbl), "IU", "");
  996. SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
  997. sizeof(xhits_pg), "IU", "");
  998. SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
  999. sizeof(xhits_rng), "IU", "");
  1000. u_int ipi_global;
  1001. u_int ipi_page;
  1002. u_int ipi_range;
  1003. u_int ipi_range_size;
  1004. SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
  1005. SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
  1006. SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
  1007. SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
  1008. 0, "");
  1009. #endif /* COUNT_XINVLTLB_HITS */
  1010. /*
  1011. * Init and startup IPI.
  1012. */
  1013. void
  1014. ipi_startup(int apic_id, int vector)
  1015. {
  1016. /*
  1017. * This attempts to follow the algorithm described in the
  1018. * Intel Multiprocessor Specification v1.4 in section B.4.
  1019. * For each IPI, we allow the local APIC ~20us to deliver the
  1020. * IPI. If that times out, we panic.
  1021. */
  1022. /*
  1023. * first we do an INIT IPI: this INIT IPI might be run, resetting
  1024. * and running the target CPU. OR this INIT IPI might be latched (P5
  1025. * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be
  1026. * ignored.
  1027. */
  1028. lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
  1029. APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id);
  1030. lapic_ipi_wait(100);
  1031. /* Explicitly deassert the INIT IPI. */
  1032. lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
  1033. APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT,
  1034. apic_id);
1035. DELAY(10000); /* wait ~10ms */
  1036. /*
  1037. * next we do a STARTUP IPI: the previous INIT IPI might still be
  1038. * latched, (P5 bug) this 1st STARTUP would then terminate
  1039. * immediately, and the previously started INIT IPI would continue. OR
  1040. * the previous INIT IPI has already run. and this STARTUP IPI will
  1041. * run. OR the previous INIT IPI was ignored. and this STARTUP IPI
  1042. * will run.
  1043. */
  1044. lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
  1045. APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
  1046. vector, apic_id);
  1047. if (!lapic_ipi_wait(100))
  1048. panic("Failed to deliver first STARTUP IPI to APIC %d",
  1049. apic_id);
1050. DELAY(200); /* wait ~200us */
  1051. /*
  1052. * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF
  1053. * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR
  1054. * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is
  1055. * recognized after hardware RESET or INIT IPI.
  1056. */
  1057. lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
  1058. APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
  1059. vector, apic_id);
  1060. if (!lapic_ipi_wait(100))
  1061. panic("Failed to deliver second STARTUP IPI to APIC %d",
  1062. apic_id);
1063. DELAY(200); /* wait ~200us */
  1064. }
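/*
 * Note: the AP starts real-mode execution at physical address
 * (vector << 12), e.g. a hypothetical vector of 0x9f would start the AP
 * at 0x9f000, which is why the trampoline must sit on a page boundary
 * below 1 MB.
 */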
  1065. /*
  1066. * Send an IPI to specified CPU handling the bitmap logic.
  1067. */
  1068. void
  1069. ipi_send_cpu(int cpu, u_int ipi)
  1070. {
  1071. u_int bitmap, old_pending, new_pending;
  1072. KASSERT(cpu_apic_ids[cpu] != -1, ("IPI to non-existent CPU %d", cpu));
  1073. if (IPI_IS_BITMAPED(ipi)) {
  1074. bitmap = 1 << ipi;
  1075. ipi = IPI_BITMAP_VECTOR;
  1076. do {
  1077. old_pending = cpu_ipi_pending[cpu];
  1078. new_pending = old_pending | bitmap;
  1079. } while (!atomic_cmpset_int(&cpu_ipi_pending[cpu],
  1080. old_pending, new_pending));
  1081. if (old_pending)
  1082. return;
  1083. }
  1084. lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]);
  1085. }
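/*
 * Example: if IPI_PREEMPT is already pending for a CPU and IPI_AST is
 * posted next, only the bitmap word changes and no second interrupt is
 * sent; ipi_bitmap_handler() below then services both bits from the
 * single IPI_BITMAP_VECTOR delivery.
 */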
  1086. void
  1087. ipi_bitmap_handler(struct trapframe frame)
  1088. {
  1089. struct trapframe *oldframe;
  1090. struct thread *td;
  1091. int cpu = PCPU_GET(cpuid);
  1092. u_int ipi_bitmap;
  1093. critical_enter();
  1094. td = curthread;
  1095. td->td_intr_nesting_level++;
  1096. oldframe = td->td_intr_frame;
  1097. td->td_intr_frame = &frame;
  1098. ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]);
  1099. if (ipi_bitmap & (1 << IPI_PREEMPT)) {
  1100. #ifdef COUNT_IPIS
  1101. (*ipi_preempt_counts[cpu])++;
  1102. #endif
  1103. sched_preempt(td);
  1104. }
  1105. if (ipi_bitmap & (1 << IPI_AST)) {
  1106. #ifdef COUNT_IPIS
  1107. (*ipi_ast_counts[cpu])++;
  1108. #endif
  1109. /* Nothing to do for AST */
  1110. }
  1111. if (ipi_bitmap & (1 << IPI_HARDCLOCK)) {
  1112. #ifdef COUNT_IPIS
  1113. (*ipi_hardclock_counts[cpu])++;
  1114. #endif
  1115. hardclockintr();
  1116. }
  1117. td->td_intr_frame = oldframe;
  1118. td->td_intr_nesting_level--;
  1119. critical_exit();
  1120. }
  1121. /*
  1122. * send an IPI to a set of cpus.
  1123. */
  1124. void
  1125. ipi_selected(cpuset_t cpus, u_int ipi)
  1126. {
  1127. int cpu;
  1128. /*
1129. * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
1130. * of help in order to understand what the source is.
  1131. * Set the mask of receiving CPUs for this purpose.
  1132. */
  1133. if (ipi == IPI_STOP_HARD)
  1134. CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &cpus);
  1135. while ((cpu = CPU_FFS(&cpus)) != 0) {
  1136. cpu--;
  1137. CPU_CLR(cpu, &cpus);
  1138. CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
  1139. ipi_send_cpu(cpu, ipi);
  1140. }
  1141. }
  1142. /*
  1143. * send an IPI to a specific CPU.
  1144. */
  1145. void
  1146. ipi_cpu(int cpu, u_int ipi)
  1147. {
  1148. /*
1149. * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
1150. * of help in order to understand what the source is.
  1151. * Set the mask of receiving CPUs for this purpose.
  1152. */
  1153. if (ipi == IPI_STOP_HARD)
  1154. CPU_SET_ATOMIC(cpu, &ipi_stop_nmi_pending);
  1155. CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
  1156. ipi_send_cpu(cpu, ipi);
  1157. }
  1158. /*
  1159. * send an IPI to all CPUs EXCEPT myself
  1160. */
  1161. void
  1162. ipi_all_but_self(u_int ipi)
  1163. {
  1164. cpuset_t other_cpus;
  1165. other_cpus = all_cpus;
  1166. CPU_CLR(PCPU_GET(cpuid), &other_cpus);
  1167. if (IPI_IS_BITMAPED(ipi)) {
  1168. ipi_selected(other_cpus, ipi);
  1169. return;
  1170. }
  1171. /*
1172. * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
1173. * of help in order to understand what the source is.
  1174. * Set the mask of receiving CPUs for this purpose.
  1175. */
  1176. if (ipi == IPI_STOP_HARD)
  1177. CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &other_cpus);
  1178. CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
  1179. lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS);
  1180. }
  1181. int
  1182. ipi_nmi_handler(void)
  1183. {
  1184. u_int cpuid;
  1185. /*
1186. * As long as there is no simple way to know about an NMI's
  1187. * source, if the bitmask for the current CPU is present in
  1188. * the global pending bitword an IPI_STOP_HARD has been issued
  1189. * and should be handled.
  1190. */
  1191. cpuid = PCPU_GET(cpuid);
  1192. if (!CPU_ISSET(cpuid, &ipi_stop_nmi_pending))
  1193. return (1);
  1194. CPU_CLR_ATOMIC(cpuid, &ipi_stop_nmi_pending);
  1195. cpustop_handler();
  1196. return (0);
  1197. }
  1198. int nmi_kdb_lock;
  1199. void
  1200. nmi_call_kdb_smp(u_int type, struct trapframe *frame)
  1201. {
  1202. int cpu;
  1203. bool call_post;
  1204. cpu = PCPU_GET(cpuid);
  1205. if (atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1)) {
  1206. nmi_call_kdb(cpu, type, frame);
  1207. call_post = false;
  1208. } else {
  1209. savectx(&stoppcbs[cpu]);
  1210. CPU_SET_ATOMIC(cpu, &stopped_cpus);
  1211. while (!atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1))
  1212. ia32_pause();
  1213. call_post = true;
  1214. }
  1215. atomic_store_rel_int(&nmi_kdb_lock, 0);
  1216. if (call_post)
  1217. cpustop_handler_post(cpu);
  1218. }
  1219. /*
  1220. * Handle an IPI_STOP by saving our current context and spinning until we
  1221. * are resumed.
  1222. */
  1223. void
  1224. cpustop_handler(void)
  1225. {
  1226. u_int cpu;
  1227. cpu = PCPU_GET(cpuid);
  1228. savectx(&stoppcbs[cpu]);
  1229. /* Indicate that we are stopped */
  1230. CPU_SET_ATOMIC(cpu, &stopped_cpus);
  1231. /* Wait for restart */
  1232. while (!CPU_ISSET(cpu, &started_cpus))
  1233. ia32_pause();
  1234. cpustop_handler_post(cpu);
  1235. }
  1236. static void
  1237. cpustop_handler_post(u_int cpu)
  1238. {
  1239. CPU_CLR_ATOMIC(cpu, &started_cpus);
  1240. CPU_CLR_ATOMIC(cpu, &stopped_cpus);
  1241. /*
  1242. * We don't broadcast TLB invalidations to other CPUs when they are
  1243. * stopped. Hence, we clear the TLB before resuming.
  1244. */
  1245. invltlb_glob();
  1246. #if defined(__amd64__) && defined(DDB)
  1247. amd64_db_resume_dbreg();
  1248. #endif
  1249. if (cpu == 0 && cpustop_restartfunc != NULL) {
  1250. cpustop_restartfunc();
  1251. cpustop_restartfunc = NULL;
  1252. }
  1253. }
  1254. /*
  1255. * Handle an IPI_SUSPEND by saving our current context and spinning until we
  1256. * are resumed.
  1257. */
  1258. void
  1259. cpususpend_handler(void)
  1260. {
  1261. u_int cpu;
  1262. mtx_assert(&smp_ipi_mtx, MA_NOTOWNED);
  1263. cpu = PCPU_GET(cpuid);
  1264. if (savectx(&susppcbs[cpu]->sp_pcb)) {
  1265. #ifdef __amd64__
  1266. fpususpend(susppcbs[cpu]->sp_fpususpend);
  1267. #else
  1268. npxsuspend(susppcbs[cpu]->sp_fpususpend);
  1269. #endif
  1270. /*
  1271. * suspended_cpus is cleared shortly after each AP is restarted
  1272. * by a Startup IPI, so that the BSP can proceed to restarting
  1273. * the next AP.
  1274. *
  1275. * resuming_cpus gets cleared when the AP completes
  1276. * initialization after having been released by the BSP.
  1277. * resuming_cpus is probably not the best name for the
  1278. * variable, because it is actually a set of processors that
  1279. * haven't resumed yet and haven't necessarily started resuming.
  1280. *
  1281. * Note that suspended_cpus is meaningful only for ACPI suspend
  1282. * as it's not really used for Xen suspend since the APs are
  1283. * automatically restored to the running state and the correct
  1284. * context. For the same reason resumectx is never called in
  1285. * that case.
  1286. */
  1287. CPU_SET_ATOMIC(cpu, &suspended_cpus);
  1288. CPU_SET_ATOMIC(cpu, &resuming_cpus);
  1289. /*
  1290. * Invalidate the cache after setting the global status bits.
  1291. * The last AP to set its bit may end up being an Owner of the
  1292. * corresponding cache line in MOESI protocol. The AP may be
  1293. * stopped before the cache line is written to the main memory.
  1294. */
  1295. wbinvd();
  1296. } else {
  1297. #ifdef __amd64__
  1298. fpuresume(susppcbs[cpu]->sp_fpususpend);
  1299. #else
  1300. npxresume(susppcbs[cpu]->sp_fpususpend);
  1301. #endif
  1302. pmap_init_pat();
  1303. initializecpu();
  1304. PCPU_SET(switchtime, 0);
  1305. PCPU_SET(switchticks, ticks);
  1306. /* Indicate that we have restarted and restored the context. */
  1307. CPU_CLR_ATOMIC(cpu, &suspended_cpus);
  1308. }
  1309. /* Wait for resume directive */
  1310. while (!CPU_ISSET(cpu, &toresume_cpus))
  1311. ia32_pause();
  1312. /* Re-apply microcode updates. */
  1313. ucode_reload();
  1314. #ifdef __i386__
  1315. /* Finish removing the identity mapping of low memory for this AP. */
  1316. invltlb_glob();
  1317. #endif
  1318. if (cpu_ops.cpu_resume)
  1319. cpu_ops.cpu_resume();
  1320. #ifdef __amd64__
  1321. if (vmm_resume_p)
  1322. vmm_resume_p();
  1323. #endif
  1324. /* Resume MCA and local APIC */
  1325. lapic_xapic_mode();
  1326. mca_resume();
  1327. lapic_setup(0);
  1328. /* Indicate that we are resumed */
  1329. CPU_CLR_ATOMIC(cpu, &resuming_cpus);
  1330. CPU_CLR_ATOMIC(cpu, &suspended_cpus);
  1331. CPU_CLR_ATOMIC(cpu, &toresume_cpus);
  1332. }
  1333. void
  1334. invlcache_handler(void)
  1335. {
  1336. uint32_t generation;
  1337. #ifdef COUNT_IPIS
  1338. (*ipi_invlcache_counts[PCPU_GET(cpuid)])++;
  1339. #endif /* COUNT_IPIS */
  1340. /*
  1341. * Reading the generation here allows greater parallelism
  1342. * since wbinvd is a serializing instruction. Without the
  1343. * temporary, we'd wait for wbinvd to complete, then the read
  1344. * would execute, then the dependent write, which must then
  1345. * complete before return from interrupt.
  1346. */
  1347. generation = smp_tlb_generation;
  1348. wbinvd();
  1349. PCPU_SET(smp_tlb_done, generation);
  1350. }
  1351. /*
  1352. * This is called once the rest of the system is up and running and we're
1353. * ready to let the APs out of the pen.
  1354. */
  1355. static void
  1356. release_aps(void *dummy __unused)
  1357. {
  1358. if (mp_ncpus == 1)
  1359. return;
  1360. atomic_store_rel_int(&aps_ready, 1);
  1361. while (smp_started == 0)
  1362. ia32_pause();
  1363. }
  1364. SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);
  1365. #ifdef COUNT_IPIS
  1366. /*
  1367. * Setup interrupt counters for IPI handlers.
  1368. */
  1369. static void
  1370. mp_ipi_intrcnt(void *dummy)
  1371. {
  1372. char buf[64];
  1373. int i;
  1374. CPU_FOREACH(i) {
  1375. snprintf(buf, sizeof(buf), "cpu%d:invltlb", i);
  1376. intrcnt_add(buf, &ipi_invltlb_counts[i]);
  1377. snprintf(buf, sizeof(buf), "cpu%d:invlrng", i);
  1378. intrcnt_add(buf, &ipi_invlrng_counts[i]);
  1379. snprintf(buf, sizeof(buf), "cpu%d:invlpg", i);
  1380. intrcnt_add(buf, &ipi_invlpg_counts[i]);
  1381. snprintf(buf, sizeof(buf), "cpu%d:invlcache", i);
  1382. intrcnt_add(buf, &ipi_invlcache_counts[i]);
  1383. snprintf(buf, sizeof(buf), "cpu%d:preempt", i);
  1384. intrcnt_add(buf, &ipi_preempt_counts[i]);
  1385. snprintf(buf, sizeof(buf), "cpu%d:ast", i);
  1386. intrcnt_add(buf, &ipi_ast_counts[i]);
  1387. snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i);
  1388. intrcnt_add(buf, &ipi_rendezvous_counts[i]);
  1389. snprintf(buf, sizeof(buf), "cpu%d:hardclock", i);
  1390. intrcnt_add(buf, &ipi_hardclock_counts[i]);
  1391. }
  1392. }
  1393. SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL);
  1394. #endif
  1395. /*
1396. * Flush the TLB on other CPUs
  1397. */
  1398. /* Variables needed for SMP tlb shootdown. */
  1399. vm_offset_t smp_tlb_addr1, smp_tlb_addr2;
  1400. pmap_t smp_tlb_pmap;
  1401. volatile uint32_t smp_tlb_generation;
  1402. #ifdef __amd64__
  1403. #define read_eflags() read_rflags()
  1404. #endif
  1405. /*
  1406. * Used by pmap to request invalidation of TLB or cache on local and
  1407. * remote processors. Mask provides the set of remote CPUs which are
  1408. * to be signalled with the IPI specified by vector. The curcpu_cb
  1409. * callback is invoked on the calling CPU while waiting for remote
  1410. * CPUs to complete the operation.
  1411. *
  1412. * The callback function is called unconditionally on the caller's
  1413. * underlying processor, even when this processor is not set in the
  1414. * mask. So, the callback function must be prepared to handle such
  1415. * spurious invocations.
  1416. */
  1417. static void
  1418. smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap,
  1419. vm_offset_t addr1, vm_offset_t addr2, smp_invl_cb_t curcpu_cb)
  1420. {
  1421. cpuset_t other_cpus;
  1422. volatile uint32_t *p_cpudone;
  1423. uint32_t generation;
  1424. int cpu;
  1425. /*
  1426. * It is not necessary to signal other CPUs while booting or
  1427. * when in the debugger.
  1428. */
  1429. if (kdb_active || panicstr != NULL || !smp_started) {
  1430. curcpu_cb(pmap, addr1, addr2);
  1431. return;
  1432. }
  1433. sched_pin();
  1434. /*
  1435. * Check for other cpus. Return if none.
  1436. */
  1437. if (CPU_ISFULLSET(&mask)) {
  1438. if (mp_ncpus <= 1)
  1439. goto nospinexit;
  1440. } else {
  1441. CPU_CLR(PCPU_GET(cpuid), &mask);
  1442. if (CPU_EMPTY(&mask))
  1443. goto nospinexit;
  1444. }
  1445. if (!(read_eflags() & PSL_I))
  1446. panic("%s: interrupts disabled", __func__);
  1447. mtx_lock_spin(&smp_ipi_mtx);
  1448. smp_tlb_addr1 = addr1;
  1449. smp_tlb_addr2 = addr2;
  1450. smp_tlb_pmap = pmap;
  1451. generation = ++smp_tlb_generation;
  1452. if (CPU_ISFULLSET(&mask)) {
  1453. ipi_all_but_self(vector);
  1454. other_cpus = all_cpus;
  1455. CPU_CLR(PCPU_GET(cpuid), &other_cpus);
  1456. } else {
  1457. other_cpus = mask;
  1458. while ((cpu = CPU_FFS(&mask)) != 0) {
  1459. cpu--;
  1460. CPU_CLR(cpu, &mask);
  1461. CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__,
  1462. cpu, vector);
  1463. ipi_send_cpu(cpu, vector);
  1464. }
  1465. }
  1466. curcpu_cb(pmap, addr1, addr2);
  1467. while ((cpu = CPU_FFS(&other_cpus)) != 0) {
  1468. cpu--;
  1469. CPU_CLR(cpu, &other_cpus);
  1470. p_cpudone = &cpuid_to_pcpu[cpu]->pc_smp_tlb_done;
  1471. while (*p_cpudone != generation)
  1472. ia32_pause();
  1473. }
  1474. mtx_unlock_spin(&smp_ipi_mtx);
  1475. sched_unpin();
  1476. return;
  1477. nospinexit:
  1478. curcpu_cb(pmap, addr1, addr2);
  1479. sched_unpin();
  1480. }
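/*
 * Completion is tracked with a generation handshake: the initiator bumps
 * smp_tlb_generation under smp_ipi_mtx, each target publishes the value
 * it observed in its pc_smp_tlb_done once the local invalidation is
 * done, and the initiator spins until every signalled CPU has published
 * the current generation.
 */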
  1481. void
  1482. smp_masked_invltlb(cpuset_t mask, pmap_t pmap, smp_invl_cb_t curcpu_cb)
  1483. {
  1484. smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, pmap, 0, 0, curcpu_cb);
  1485. #ifdef COUNT_XINVLTLB_HITS
  1486. ipi_global++;
  1487. #endif
  1488. }
  1489. void
  1490. smp_masked_invlpg(cpuset_t mask, vm_offset_t addr, pmap_t pmap,
  1491. smp_invl_cb_t curcpu_cb)
  1492. {
  1493. smp_targeted_tlb_shootdown(mask, IPI_INVLPG, pmap, addr, 0, curcpu_cb);
  1494. #ifdef COUNT_XINVLTLB_HITS
  1495. ipi_page++;
  1496. #endif
  1497. }
  1498. void
  1499. smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2,
  1500. pmap_t pmap, smp_invl_cb_t curcpu_cb)
  1501. {
  1502. smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, pmap, addr1, addr2,
  1503. curcpu_cb);
  1504. #ifdef COUNT_XINVLTLB_HITS
  1505. ipi_range++;
  1506. ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
  1507. #endif
  1508. }
  1509. void
  1510. smp_cache_flush(smp_invl_cb_t curcpu_cb)
  1511. {
  1512. smp_targeted_tlb_shootdown(all_cpus, IPI_INVLCACHE, NULL, 0, 0,
  1513. curcpu_cb);
  1514. }
  1515. /*
  1516. * Handlers for TLB related IPIs
  1517. */
  1518. void
  1519. invltlb_handler(void)
  1520. {
  1521. uint32_t generation;
  1522. #ifdef COUNT_XINVLTLB_HITS
  1523. xhits_gbl[PCPU_GET(cpuid)]++;
  1524. #endif /* COUNT_XINVLTLB_HITS */
  1525. #ifdef COUNT_IPIS
  1526. (*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
  1527. #endif /* COUNT_IPIS */
  1528. /*
  1529. * Reading the generation here allows greater parallelism
  1530. * since invalidating the TLB is a serializing operation.
  1531. */
  1532. generation = smp_tlb_generation;
  1533. if (smp_tlb_pmap == kernel_pmap)
  1534. invltlb_glob();
  1535. #ifdef __amd64__
  1536. else
  1537. invltlb();
  1538. #endif
  1539. PCPU_SET(smp_tlb_done, generation);
  1540. }
  1541. void
  1542. invlpg_handler(void)
  1543. {
  1544. uint32_t generation;
  1545. #ifdef COUNT_XINVLTLB_HITS
  1546. xhits_pg[PCPU_GET(cpuid)]++;
  1547. #endif /* COUNT_XINVLTLB_HITS */
  1548. #ifdef COUNT_IPIS
  1549. (*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
  1550. #endif /* COUNT_IPIS */
  1551. generation = smp_tlb_generation; /* Overlap with serialization */
  1552. #ifdef __i386__
  1553. if (smp_tlb_pmap == kernel_pmap)
  1554. #endif
  1555. invlpg(smp_tlb_addr1);
  1556. PCPU_SET(smp_tlb_done, generation);
  1557. }
  1558. void
  1559. invlrng_handler(void)
  1560. {
  1561. vm_offset_t addr, addr2;
  1562. uint32_t generation;
  1563. #ifdef COUNT_XINVLTLB_HITS
  1564. xhits_rng[PCPU_GET(cpuid)]++;
  1565. #endif /* COUNT_XINVLTLB_HITS */
  1566. #ifdef COUNT_IPIS
  1567. (*ipi_invlrng_counts[PCPU_GET(cpuid)])++;
  1568. #endif /* COUNT_IPIS */
  1569. addr = smp_tlb_addr1;
  1570. addr2 = smp_tlb_addr2;
  1571. generation = smp_tlb_generation; /* Overlap with serialization */
  1572. #ifdef __i386__
  1573. if (smp_tlb_pmap == kernel_pmap)
  1574. #endif
  1575. do {
  1576. invlpg(addr);
  1577. addr += PAGE_SIZE;
  1578. } while (addr < addr2);
  1579. PCPU_SET(smp_tlb_done, generation);
  1580. }