/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989 Stephen Deering
 * Copyright (c) 1992, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Stephen Deering of Stanford University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ip_mroute.c	8.2 (Berkeley) 11/15/93
 */
/*
 * IP multicast forwarding procedures
 *
 * Written by David Waitzman, BBN Labs, August 1988.
 * Modified by Steve Deering, Stanford, February 1989.
 * Modified by Mark J. Steiglitz, Stanford, May, 1991
 * Modified by Van Jacobson, LBL, January 1993
 * Modified by Ajit Thyagarajan, PARC, August 1993
 * Modified by Bill Fenner, PARC, April 1995
 * Modified by Ahmed Helmy, SGI, June 1996
 * Modified by George Edmond Eddy (Rusty), ISI, February 1998
 * Modified by Pavlin Radoslavov, USC/ISI, May 1998, August 1999, October 2000
 * Modified by Hitoshi Asaeda, WIDE, August 2000
 * Modified by Pavlin Radoslavov, ICSI, October 2002
 *
 * MROUTING Revision: 3.5
 * and PIM-SMv2 and PIM-DM support, advanced API support,
 * bandwidth metering and signaling
 */
/*
 * TODO: Prefix functions with ipmf_.
 * TODO: Maintain a refcount on if_allmulti() in ifnet or in the protocol
 * domain attachment (if_afdata) so we can track consumers of that service.
 * TODO: Deprecate routing socket path for SIOCGETSGCNT and SIOCGETVIFCNT,
 * move it to socket options.
 * TODO: Cleanup LSRR removal further.
 * TODO: Push RSVP stubs into raw_ip.c.
 * TODO: Use bitstring.h for vif set.
 * TODO: Fix mrt6_ioctl dangling ref when dynamically loaded.
 * TODO: Sync ip6_mroute.c with this file.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_mrouting.h"

#define _PIM_VT 1

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/stddef.h>
#include <sys/eventhandler.h>
#include <sys/lock.h>
#include <sys/ktr.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/priv.h>
#include <sys/protosw.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockio.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/counter.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/netisr.h>
#include <net/route.h>
#include <net/vnet.h>

#include <netinet/in.h>
#include <netinet/igmp.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_encap.h>
#include <netinet/ip_mroute.h>
#include <netinet/ip_var.h>
#include <netinet/ip_options.h>
#include <netinet/pim.h>
#include <netinet/pim_var.h>
#include <netinet/udp.h>

#include <machine/in_cksum.h>

#ifndef KTR_IPMF
#define	KTR_IPMF KTR_INET
#endif
#define	VIFI_INVALID	((vifi_t) -1)

VNET_DEFINE_STATIC(uint32_t, last_tv_sec); /* last time we processed this */
#define	V_last_tv_sec	VNET(last_tv_sec)

static MALLOC_DEFINE(M_MRTABLE, "mroutetbl", "multicast forwarding cache");

/*
 * Locking. We use two locks: one for the virtual interface table and
 * one for the forwarding table. These locks may be nested in which case
 * the VIF lock must always be taken first. Note that each lock is used
 * to cover not only the specific data structure but also related data
 * structures.
 */
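/*
 * Illustrative sketch (exposition only, not part of the API): a path
 * that must modify both tables, such as add_mfc() below, nests the
 * locks in the documented order and releases them in reverse:
 *
 *	VIF_LOCK();
 *	MFC_LOCK();
 *	... update viftable and mfchashtbl ...
 *	MFC_UNLOCK();
 *	VIF_UNLOCK();
 */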
static struct mtx mrouter_mtx;
#define	MROUTER_LOCK()		mtx_lock(&mrouter_mtx)
#define	MROUTER_UNLOCK()	mtx_unlock(&mrouter_mtx)
#define	MROUTER_LOCK_ASSERT()	mtx_assert(&mrouter_mtx, MA_OWNED)
#define	MROUTER_LOCK_INIT()	\
	mtx_init(&mrouter_mtx, "IPv4 multicast forwarding", NULL, MTX_DEF)
#define	MROUTER_LOCK_DESTROY()	mtx_destroy(&mrouter_mtx)

static int ip_mrouter_cnt;	/* # of vnets with active mrouters */
static int ip_mrouter_unloading; /* Allow no more V_ip_mrouter sockets */

VNET_PCPUSTAT_DEFINE_STATIC(struct mrtstat, mrtstat);
VNET_PCPUSTAT_SYSINIT(mrtstat);
VNET_PCPUSTAT_SYSUNINIT(mrtstat);
SYSCTL_VNET_PCPUSTAT(_net_inet_ip, OID_AUTO, mrtstat, struct mrtstat,
    mrtstat, "IPv4 Multicast Forwarding Statistics (struct mrtstat, "
    "netinet/ip_mroute.h)");

VNET_DEFINE_STATIC(u_long, mfchash);
#define	V_mfchash		VNET(mfchash)
#define	MFCHASH(a, g)							\
	((((a).s_addr >> 20) ^ ((a).s_addr >> 10) ^ (a).s_addr ^	\
	((g).s_addr >> 20) ^ ((g).s_addr >> 10) ^ (g).s_addr) & V_mfchash)
#define	MFCHASHSIZE	256
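/*
 * (Exposition: MFCHASH() XOR-folds the upper bits of the origin and
 * group addresses onto their low bits and masks the result with
 * V_mfchash, which hashinit_flags() sets to the table size minus one;
 * with the default MFCHASHSIZE of 256 the mask is 0xff, giving a
 * bucket index in 0..255.)
 */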
static u_long mfchashsize;			/* Hash size */
VNET_DEFINE_STATIC(u_char *, nexpire);		/* 0..mfchashsize-1 */
#define	V_nexpire		VNET(nexpire)
VNET_DEFINE_STATIC(LIST_HEAD(mfchashhdr, mfc)*, mfchashtbl);
#define	V_mfchashtbl		VNET(mfchashtbl)

static struct mtx mfc_mtx;
#define	MFC_LOCK()		mtx_lock(&mfc_mtx)
#define	MFC_UNLOCK()		mtx_unlock(&mfc_mtx)
#define	MFC_LOCK_ASSERT()	mtx_assert(&mfc_mtx, MA_OWNED)
#define	MFC_LOCK_INIT()		\
	mtx_init(&mfc_mtx, "IPv4 multicast forwarding cache", NULL, MTX_DEF)
#define	MFC_LOCK_DESTROY()	mtx_destroy(&mfc_mtx)

VNET_DEFINE_STATIC(vifi_t, numvifs);
#define	V_numvifs		VNET(numvifs)
VNET_DEFINE_STATIC(struct vif *, viftable);
#define	V_viftable		VNET(viftable)
/*
 * No one should be able to "query" this before initialisation has
 * happened in vnet_mroute_init(), so we should still be fine.
 */
SYSCTL_OPAQUE(_net_inet_ip, OID_AUTO, viftable, CTLFLAG_VNET | CTLFLAG_RD,
    &VNET_NAME(viftable), sizeof(*V_viftable) * MAXVIFS, "S,vif[MAXVIFS]",
    "IPv4 Multicast Interfaces (struct vif[MAXVIFS], netinet/ip_mroute.h)");

static struct mtx vif_mtx;
#define	VIF_LOCK()		mtx_lock(&vif_mtx)
#define	VIF_UNLOCK()		mtx_unlock(&vif_mtx)
#define	VIF_LOCK_ASSERT()	mtx_assert(&vif_mtx, MA_OWNED)
#define	VIF_LOCK_INIT()		\
	mtx_init(&vif_mtx, "IPv4 multicast interfaces", NULL, MTX_DEF)
#define	VIF_LOCK_DESTROY()	mtx_destroy(&vif_mtx)

static eventhandler_tag if_detach_event_tag = NULL;

VNET_DEFINE_STATIC(struct callout, expire_upcalls_ch);
#define	V_expire_upcalls_ch	VNET(expire_upcalls_ch)

#define	EXPIRE_TIMEOUT	(hz / 4)	/* 4x / second		*/
#define	UPCALL_EXPIRE	6		/* number of timeouts	*/
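/*
 * (Exposition: together the two constants above bound the life of an
 * unanswered upcall entry: expire_upcalls() runs every hz/4 ticks and
 * decrements a per-entry counter that starts at 6, so a stalled entry
 * is reclaimed after roughly 1.5 seconds.)
 */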
/*
 * Bandwidth meter variables and constants
 */
static MALLOC_DEFINE(M_BWMETER, "bwmeter", "multicast upcall bw meters");

/*
 * Pending timeouts are stored in a hash table, the key being the
 * expiration time. Periodically, the entries are analysed and processed.
 */
#define	BW_METER_BUCKETS	1024
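/*
 * (Exposition: hashing on expiration time means each periodic pass
 * only needs to visit the buckets whose time slots have elapsed since
 * the previous pass, rather than every active meter.)
 */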
VNET_DEFINE_STATIC(struct bw_meter **, bw_meter_timers);
#define	V_bw_meter_timers	VNET(bw_meter_timers)
VNET_DEFINE_STATIC(struct callout, bw_meter_ch);
#define	V_bw_meter_ch		VNET(bw_meter_ch)
#define	BW_METER_PERIOD (hz)	/* periodical handling of bw meters */

/*
 * Pending upcalls are stored in a vector which is flushed when
 * full, or periodically
 */
VNET_DEFINE_STATIC(struct bw_upcall *, bw_upcalls);
#define	V_bw_upcalls		VNET(bw_upcalls)
VNET_DEFINE_STATIC(u_int, bw_upcalls_n);	/* # of pending upcalls */
#define	V_bw_upcalls_n		VNET(bw_upcalls_n)
VNET_DEFINE_STATIC(struct callout, bw_upcalls_ch);
#define	V_bw_upcalls_ch		VNET(bw_upcalls_ch)

#define	BW_UPCALLS_PERIOD (hz)	/* periodical flush of bw upcalls */

VNET_PCPUSTAT_DEFINE_STATIC(struct pimstat, pimstat);
VNET_PCPUSTAT_SYSINIT(pimstat);
VNET_PCPUSTAT_SYSUNINIT(pimstat);

SYSCTL_NODE(_net_inet, IPPROTO_PIM, pim, CTLFLAG_RW, 0, "PIM");
SYSCTL_VNET_PCPUSTAT(_net_inet_pim, PIMCTL_STATS, stats, struct pimstat,
    pimstat, "PIM Statistics (struct pimstat, netinet/pim_var.h)");

static u_long	pim_squelch_wholepkt = 0;
SYSCTL_ULONG(_net_inet_pim, OID_AUTO, squelch_wholepkt, CTLFLAG_RW,
    &pim_squelch_wholepkt, 0,
    "Disable IGMP_WHOLEPKT notifications if rendezvous point is unspecified");

static const struct encaptab *pim_encap_cookie;
static int pim_encapcheck(const struct mbuf *, int, int, void *);
static int pim_input(struct mbuf *, int, int, void *);

static const struct encap_config ipv4_encap_cfg = {
	.proto = IPPROTO_PIM,
	.min_length = sizeof(struct ip) + PIM_MINLEN,
	.exact_match = 8,
	.check = pim_encapcheck,
	.input = pim_input
};

/*
 * Note: the PIM Register encapsulation adds the following in front of a
 * data packet:
 *
 * struct pim_encap_hdr {
 *	struct ip ip;
 *	struct pim_encap_pimhdr pim;
 * }
 *
 */
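/*
 * (Exposition: struct pim is 4 bytes and the flags word another 4, so
 * together with the 20-byte outer IP header a PIM Register prepends
 * 28 bytes to the original data packet.)
 */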
struct pim_encap_pimhdr {
	struct pim pim;
	uint32_t   flags;
};
#define	PIM_ENCAP_TTL	64

static struct ip pim_encap_iphdr = {
#if BYTE_ORDER == LITTLE_ENDIAN
	sizeof(struct ip) >> 2,
	IPVERSION,
#else
	IPVERSION,
	sizeof(struct ip) >> 2,
#endif
	0,			/* tos */
	sizeof(struct ip),	/* total length */
	0,			/* id */
	0,			/* frag offset */
	PIM_ENCAP_TTL,
	IPPROTO_PIM,
	0,			/* checksum */
};

static struct pim_encap_pimhdr pim_encap_pimhdr = {
    {
	PIM_MAKE_VT(PIM_VERSION, PIM_REGISTER), /* PIM vers and message type */
	0,			/* reserved */
	0,			/* checksum */
    },
    0				/* flags */
};

VNET_DEFINE_STATIC(vifi_t, reg_vif_num) = VIFI_INVALID;
#define	V_reg_vif_num		VNET(reg_vif_num)
VNET_DEFINE_STATIC(struct ifnet, multicast_register_if);
#define	V_multicast_register_if	VNET(multicast_register_if)

/*
 * Private variables.
 */

static u_long	X_ip_mcast_src(int);
static int	X_ip_mforward(struct ip *, struct ifnet *, struct mbuf *,
		    struct ip_moptions *);
static int	X_ip_mrouter_done(void);
static int	X_ip_mrouter_get(struct socket *, struct sockopt *);
static int	X_ip_mrouter_set(struct socket *, struct sockopt *);
static int	X_legal_vif_num(int);
static int	X_mrt_ioctl(u_long, caddr_t, int);

static int	add_bw_upcall(struct bw_upcall *);
static int	add_mfc(struct mfcctl2 *);
static int	add_vif(struct vifctl *);
static void	bw_meter_prepare_upcall(struct bw_meter *, struct timeval *);
static void	bw_meter_process(void);
static void	bw_meter_receive_packet(struct bw_meter *, int,
		    struct timeval *);
static void	bw_upcalls_send(void);
static int	del_bw_upcall(struct bw_upcall *);
static int	del_mfc(struct mfcctl2 *);
static int	del_vif(vifi_t);
static int	del_vif_locked(vifi_t);
static void	expire_bw_meter_process(void *);
static void	expire_bw_upcalls_send(void *);
static void	expire_mfc(struct mfc *);
static void	expire_upcalls(void *);
static void	free_bw_list(struct bw_meter *);
static int	get_sg_cnt(struct sioc_sg_req *);
static int	get_vif_cnt(struct sioc_vif_req *);
static void	if_detached_event(void *, struct ifnet *);
static int	ip_mdq(struct mbuf *, struct ifnet *, struct mfc *, vifi_t);
static int	ip_mrouter_init(struct socket *, int);
static __inline struct mfc *
		mfc_find(struct in_addr *, struct in_addr *);
static void	phyint_send(struct ip *, struct vif *, struct mbuf *);
static struct mbuf *
		pim_register_prepare(struct ip *, struct mbuf *);
static int	pim_register_send(struct ip *, struct vif *,
		    struct mbuf *, struct mfc *);
static int	pim_register_send_rp(struct ip *, struct vif *,
		    struct mbuf *, struct mfc *);
static int	pim_register_send_upcall(struct ip *, struct vif *,
		    struct mbuf *, struct mfc *);
static void	schedule_bw_meter(struct bw_meter *, struct timeval *);
static void	send_packet(struct vif *, struct mbuf *);
static int	set_api_config(uint32_t *);
static int	set_assert(int);
static int	socket_send(struct socket *, struct mbuf *,
		    struct sockaddr_in *);
static void	unschedule_bw_meter(struct bw_meter *);

/*
 * Kernel multicast forwarding API capabilities and setup.
 * If more API capabilities are added to the kernel, they should be
 * recorded in `mrt_api_support'.
 */
#define	MRT_API_VERSION		0x0305

static const int mrt_api_version = MRT_API_VERSION;
static const uint32_t mrt_api_support = (MRT_MFC_FLAGS_DISABLE_WRONGVIF |
					 MRT_MFC_FLAGS_BORDER_VIF |
					 MRT_MFC_RP |
					 MRT_MFC_BW_UPCALL);
VNET_DEFINE_STATIC(uint32_t, mrt_api_config);
#define	V_mrt_api_config	VNET(mrt_api_config)
VNET_DEFINE_STATIC(int, pim_assert_enabled);
#define	V_pim_assert_enabled	VNET(pim_assert_enabled)
static struct timeval pim_assert_interval = { 3, 0 };	/* Rate limit */
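/*
 * (Exposition: pim_assert_interval is applied per MFC entry via
 * ratecheck() in ip_mdq(), so at most one IGMPMSG_WRONGVIF upcall is
 * delivered for a given (S,G) every 3 seconds.)
 */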
/*
 * Find a route for a given origin IP address and multicast group address.
 * Statistics must be updated by the caller.
 */
static __inline struct mfc *
mfc_find(struct in_addr *o, struct in_addr *g)
{
	struct mfc *rt;

	MFC_LOCK_ASSERT();

	LIST_FOREACH(rt, &V_mfchashtbl[MFCHASH(*o, *g)], mfc_hash) {
		if (in_hosteq(rt->mfc_origin, *o) &&
		    in_hosteq(rt->mfc_mcastgrp, *g) &&
		    TAILQ_EMPTY(&rt->mfc_stall))
			break;
	}

	return (rt);
}
/*
 * Handle MRT setsockopt commands to modify the multicast forwarding tables.
 */
static int
X_ip_mrouter_set(struct socket *so, struct sockopt *sopt)
{
	int error, optval;
	vifi_t vifi;
	struct vifctl vifc;
	struct mfcctl2 mfc;
	struct bw_upcall bw_upcall;
	uint32_t i;

	if (so != V_ip_mrouter && sopt->sopt_name != MRT_INIT)
		return EPERM;

	error = 0;
	switch (sopt->sopt_name) {
	case MRT_INIT:
		error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
		if (error)
			break;
		error = ip_mrouter_init(so, optval);
		break;
	case MRT_DONE:
		error = ip_mrouter_done();
		break;
	case MRT_ADD_VIF:
		error = sooptcopyin(sopt, &vifc, sizeof vifc, sizeof vifc);
		if (error)
			break;
		error = add_vif(&vifc);
		break;
	case MRT_DEL_VIF:
		error = sooptcopyin(sopt, &vifi, sizeof vifi, sizeof vifi);
		if (error)
			break;
		error = del_vif(vifi);
		break;
	case MRT_ADD_MFC:
	case MRT_DEL_MFC:
		/*
		 * Select data size depending on API version.
		 */
		if (sopt->sopt_name == MRT_ADD_MFC &&
		    V_mrt_api_config & MRT_API_FLAGS_ALL) {
			error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl2),
			    sizeof(struct mfcctl2));
		} else {
			error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl),
			    sizeof(struct mfcctl));
			bzero((caddr_t)&mfc + sizeof(struct mfcctl),
			    sizeof(mfc) - sizeof(struct mfcctl));
		}
		if (error)
			break;
		if (sopt->sopt_name == MRT_ADD_MFC)
			error = add_mfc(&mfc);
		else
			error = del_mfc(&mfc);
		break;
	case MRT_ASSERT:
		error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
		if (error)
			break;
		error = set_assert(optval);
		break;
	case MRT_API_CONFIG:
		error = sooptcopyin(sopt, &i, sizeof i, sizeof i);
		if (!error)
			error = set_api_config(&i);
		if (!error)
			error = sooptcopyout(sopt, &i, sizeof i);
		break;
	case MRT_ADD_BW_UPCALL:
	case MRT_DEL_BW_UPCALL:
		error = sooptcopyin(sopt, &bw_upcall, sizeof bw_upcall,
		    sizeof bw_upcall);
		if (error)
			break;
		if (sopt->sopt_name == MRT_ADD_BW_UPCALL)
			error = add_bw_upcall(&bw_upcall);
		else
			error = del_bw_upcall(&bw_upcall);
		break;
	default:
		error = EOPNOTSUPP;
		break;
	}
	return error;
}
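/*
 * Userland usage sketch (exposition only; a hypothetical daemon with
 * the necessary privilege, error handling elided).  MRT_INIT must be
 * issued on a raw IGMP socket before any other MRT_* option is
 * accepted, the API version must be 1, and the option level is
 * IPPROTO_IP:
 *
 *	int s = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
 *	int v = 1;
 *	setsockopt(s, IPPROTO_IP, MRT_INIT, &v, sizeof(v));
 *	...
 *	setsockopt(s, IPPROTO_IP, MRT_ADD_VIF, &vifc, sizeof(vifc));
 */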
/*
 * Handle MRT getsockopt commands
 */
static int
X_ip_mrouter_get(struct socket *so, struct sockopt *sopt)
{
	int error;

	switch (sopt->sopt_name) {
	case MRT_VERSION:
		error = sooptcopyout(sopt, &mrt_api_version, sizeof mrt_api_version);
		break;
	case MRT_ASSERT:
		error = sooptcopyout(sopt, &V_pim_assert_enabled,
		    sizeof V_pim_assert_enabled);
		break;
	case MRT_API_SUPPORT:
		error = sooptcopyout(sopt, &mrt_api_support, sizeof mrt_api_support);
		break;
	case MRT_API_CONFIG:
		error = sooptcopyout(sopt, &V_mrt_api_config, sizeof V_mrt_api_config);
		break;
	default:
		error = EOPNOTSUPP;
		break;
	}
	return error;
}
/*
 * Handle ioctl commands to obtain information from the cache
 */
static int
X_mrt_ioctl(u_long cmd, caddr_t data, int fibnum __unused)
{
	int error = 0;

	/*
	 * Currently the only function calling this ioctl routine is
	 * rtioctl_fib().  Typically, only root can create the raw socket
	 * needed to execute this ioctl method; however, the request might
	 * be coming from a prison.
	 */
	error = priv_check(curthread, PRIV_NETINET_MROUTE);
	if (error)
		return (error);

	switch (cmd) {
	case (SIOCGETVIFCNT):
		error = get_vif_cnt((struct sioc_vif_req *)data);
		break;
	case (SIOCGETSGCNT):
		error = get_sg_cnt((struct sioc_sg_req *)data);
		break;
	default:
		error = EINVAL;
		break;
	}
	return error;
}
/*
 * returns the packet, byte, rpf-failure count for the source group provided
 */
static int
get_sg_cnt(struct sioc_sg_req *req)
{
	struct mfc *rt;

	MFC_LOCK();
	rt = mfc_find(&req->src, &req->grp);
	if (rt == NULL) {
		MFC_UNLOCK();
		req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff;
		return EADDRNOTAVAIL;
	}
	req->pktcnt = rt->mfc_pkt_cnt;
	req->bytecnt = rt->mfc_byte_cnt;
	req->wrong_if = rt->mfc_wrong_if;
	MFC_UNLOCK();

	return 0;
}

/*
 * returns the input and output packet and byte counts on the vif provided
 */
static int
get_vif_cnt(struct sioc_vif_req *req)
{
	vifi_t vifi = req->vifi;

	VIF_LOCK();
	if (vifi >= V_numvifs) {
		VIF_UNLOCK();
		return EINVAL;
	}
	req->icount = V_viftable[vifi].v_pkt_in;
	req->ocount = V_viftable[vifi].v_pkt_out;
	req->ibytes = V_viftable[vifi].v_bytes_in;
	req->obytes = V_viftable[vifi].v_bytes_out;
	VIF_UNLOCK();

	return 0;
}
static void
if_detached_event(void *arg __unused, struct ifnet *ifp)
{
	vifi_t vifi;
	u_long i;

	MROUTER_LOCK();

	if (V_ip_mrouter == NULL) {
		MROUTER_UNLOCK();
		return;
	}

	VIF_LOCK();
	MFC_LOCK();

	/*
	 * Tear down multicast forwarder state associated with this ifnet.
	 * 1. Walk the vif list, matching vifs against this ifnet.
	 * 2. Walk the multicast forwarding cache (mfc) looking for
	 *    inner matches with this vif's index.
	 * 3. Expire any matching multicast forwarding cache entries.
	 * 4. Free vif state. This should disable ALLMULTI on the interface.
	 */
	for (vifi = 0; vifi < V_numvifs; vifi++) {
		if (V_viftable[vifi].v_ifp != ifp)
			continue;
		for (i = 0; i < mfchashsize; i++) {
			struct mfc *rt, *nrt;

			LIST_FOREACH_SAFE(rt, &V_mfchashtbl[i], mfc_hash, nrt) {
				if (rt->mfc_parent == vifi) {
					expire_mfc(rt);
				}
			}
		}
		del_vif_locked(vifi);
	}

	MFC_UNLOCK();
	VIF_UNLOCK();

	MROUTER_UNLOCK();
}
/*
 * Enable multicast forwarding.
 */
static int
ip_mrouter_init(struct socket *so, int version)
{

	CTR3(KTR_IPMF, "%s: so_type %d, pr_protocol %d", __func__,
	    so->so_type, so->so_proto->pr_protocol);

	if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_IGMP)
		return EOPNOTSUPP;

	if (version != 1)
		return ENOPROTOOPT;

	MROUTER_LOCK();

	if (ip_mrouter_unloading) {
		MROUTER_UNLOCK();
		return ENOPROTOOPT;
	}

	if (V_ip_mrouter != NULL) {
		MROUTER_UNLOCK();
		return EADDRINUSE;
	}

	V_mfchashtbl = hashinit_flags(mfchashsize, M_MRTABLE, &V_mfchash,
	    HASH_NOWAIT);

	callout_reset(&V_expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls,
	    curvnet);
	callout_reset(&V_bw_upcalls_ch, BW_UPCALLS_PERIOD, expire_bw_upcalls_send,
	    curvnet);
	callout_reset(&V_bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process,
	    curvnet);

	V_ip_mrouter = so;
	ip_mrouter_cnt++;

	MROUTER_UNLOCK();

	CTR1(KTR_IPMF, "%s: done", __func__);

	return 0;
}
/*
 * Disable multicast forwarding.
 */
static int
X_ip_mrouter_done(void)
{
	struct ifnet *ifp;
	u_long i;
	vifi_t vifi;

	MROUTER_LOCK();

	if (V_ip_mrouter == NULL) {
		MROUTER_UNLOCK();
		return EINVAL;
	}

	/*
	 * Detach/disable hooks to the rest of the system.
	 */
	V_ip_mrouter = NULL;
	ip_mrouter_cnt--;
	V_mrt_api_config = 0;

	VIF_LOCK();

	/*
	 * For each phyint in use, disable promiscuous reception of all IP
	 * multicasts.
	 */
	for (vifi = 0; vifi < V_numvifs; vifi++) {
		if (!in_nullhost(V_viftable[vifi].v_lcl_addr) &&
		    !(V_viftable[vifi].v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
			ifp = V_viftable[vifi].v_ifp;
			if_allmulti(ifp, 0);
		}
	}
	bzero((caddr_t)V_viftable, sizeof(*V_viftable) * MAXVIFS);
	V_numvifs = 0;
	V_pim_assert_enabled = 0;

	VIF_UNLOCK();

	callout_stop(&V_expire_upcalls_ch);
	callout_stop(&V_bw_upcalls_ch);
	callout_stop(&V_bw_meter_ch);

	MFC_LOCK();

	/*
	 * Free all multicast forwarding cache entries.
	 * Do not use hashdestroy(), as we must perform other cleanup.
	 */
	for (i = 0; i < mfchashsize; i++) {
		struct mfc *rt, *nrt;

		LIST_FOREACH_SAFE(rt, &V_mfchashtbl[i], mfc_hash, nrt) {
			expire_mfc(rt);
		}
	}
	free(V_mfchashtbl, M_MRTABLE);
	V_mfchashtbl = NULL;

	bzero(V_nexpire, sizeof(V_nexpire[0]) * mfchashsize);

	V_bw_upcalls_n = 0;
	bzero(V_bw_meter_timers, BW_METER_BUCKETS * sizeof(*V_bw_meter_timers));

	MFC_UNLOCK();

	V_reg_vif_num = VIFI_INVALID;

	MROUTER_UNLOCK();

	CTR1(KTR_IPMF, "%s: done", __func__);

	return 0;
}
/*
 * Set PIM assert processing global
 */
static int
set_assert(int i)
{
	if ((i != 1) && (i != 0))
		return EINVAL;

	V_pim_assert_enabled = i;

	return 0;
}

/*
 * Configure API capabilities
 */
int
set_api_config(uint32_t *apival)
{
	u_long i;

	/*
	 * We can set the API capabilities only if it is the first operation
	 * after MRT_INIT. I.e.:
	 *  - there are no vifs installed
	 *  - pim_assert is not enabled
	 *  - the MFC table is empty
	 */
	if (V_numvifs > 0) {
		*apival = 0;
		return EPERM;
	}
	if (V_pim_assert_enabled) {
		*apival = 0;
		return EPERM;
	}

	MFC_LOCK();

	for (i = 0; i < mfchashsize; i++) {
		if (LIST_FIRST(&V_mfchashtbl[i]) != NULL) {
			MFC_UNLOCK();
			*apival = 0;
			return EPERM;
		}
	}

	MFC_UNLOCK();

	V_mrt_api_config = *apival & mrt_api_support;
	*apival = V_mrt_api_config;

	return 0;
}
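/*
 * (Usage sketch: immediately after MRT_INIT the daemon proposes a
 * capability mask with MRT_API_CONFIG; the kernel intersects it with
 * mrt_api_support, stores the result, and X_ip_mrouter_set() copies
 * the granted mask back out, so the caller learns which capabilities
 * were actually enabled.)
 */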
/*
 * Add a vif to the vif table
 */
static int
add_vif(struct vifctl *vifcp)
{
	struct vif *vifp = V_viftable + vifcp->vifc_vifi;
	struct sockaddr_in sin = {sizeof sin, AF_INET};
	struct ifaddr *ifa;
	struct ifnet *ifp;
	int error;

	VIF_LOCK();
	if (vifcp->vifc_vifi >= MAXVIFS) {
		VIF_UNLOCK();
		return EINVAL;
	}
	/* rate limiting is no longer supported by this code */
	if (vifcp->vifc_rate_limit != 0) {
		log(LOG_ERR, "rate limiting is no longer supported\n");
		VIF_UNLOCK();
		return EINVAL;
	}
	if (!in_nullhost(vifp->v_lcl_addr)) {
		VIF_UNLOCK();
		return EADDRINUSE;
	}
	if (in_nullhost(vifcp->vifc_lcl_addr)) {
		VIF_UNLOCK();
		return EADDRNOTAVAIL;
	}

	/* Find the interface with an address in AF_INET family */
	if (vifcp->vifc_flags & VIFF_REGISTER) {
		/*
		 * XXX: Because VIFF_REGISTER does not really need a valid
		 * local interface (e.g. it could be 127.0.0.2), we don't
		 * check its address.
		 */
		ifp = NULL;
	} else {
		sin.sin_addr = vifcp->vifc_lcl_addr;
		NET_EPOCH_ENTER();
		ifa = ifa_ifwithaddr((struct sockaddr *)&sin);
		if (ifa == NULL) {
			NET_EPOCH_EXIT();
			VIF_UNLOCK();
			return EADDRNOTAVAIL;
		}
		ifp = ifa->ifa_ifp;
		NET_EPOCH_EXIT();
	}

	if ((vifcp->vifc_flags & VIFF_TUNNEL) != 0) {
		CTR1(KTR_IPMF, "%s: tunnels are no longer supported", __func__);
		VIF_UNLOCK();
		return EOPNOTSUPP;
	} else if (vifcp->vifc_flags & VIFF_REGISTER) {
		ifp = &V_multicast_register_if;
		CTR2(KTR_IPMF, "%s: add register vif for ifp %p", __func__, ifp);
		if (V_reg_vif_num == VIFI_INVALID) {
			if_initname(&V_multicast_register_if, "register_vif", 0);
			V_multicast_register_if.if_flags = IFF_LOOPBACK;
			V_reg_vif_num = vifcp->vifc_vifi;
		}
	} else {		/* Make sure the interface supports multicast */
		if ((ifp->if_flags & IFF_MULTICAST) == 0) {
			VIF_UNLOCK();
			return EOPNOTSUPP;
		}
		/* Enable promiscuous reception of all IP multicasts from the if */
		error = if_allmulti(ifp, 1);
		if (error) {
			VIF_UNLOCK();
			return error;
		}
	}

	vifp->v_flags = vifcp->vifc_flags;
	vifp->v_threshold = vifcp->vifc_threshold;
	vifp->v_lcl_addr = vifcp->vifc_lcl_addr;
	vifp->v_rmt_addr = vifcp->vifc_rmt_addr;
	vifp->v_ifp = ifp;
	/* initialize per vif pkt counters */
	vifp->v_pkt_in = 0;
	vifp->v_pkt_out = 0;
	vifp->v_bytes_in = 0;
	vifp->v_bytes_out = 0;

	/* Adjust numvifs up if the vifi is higher than numvifs */
	if (V_numvifs <= vifcp->vifc_vifi)
		V_numvifs = vifcp->vifc_vifi + 1;

	VIF_UNLOCK();

	CTR4(KTR_IPMF, "%s: add vif %d laddr 0x%08x thresh %x", __func__,
	    (int)vifcp->vifc_vifi, ntohl(vifcp->vifc_lcl_addr.s_addr),
	    (int)vifcp->vifc_threshold);

	return 0;
}
/*
 * Delete a vif from the vif table
 */
static int
del_vif_locked(vifi_t vifi)
{
	struct vif *vifp;

	VIF_LOCK_ASSERT();

	if (vifi >= V_numvifs) {
		return EINVAL;
	}
	vifp = &V_viftable[vifi];
	if (in_nullhost(vifp->v_lcl_addr)) {
		return EADDRNOTAVAIL;
	}

	if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER)))
		if_allmulti(vifp->v_ifp, 0);

	if (vifp->v_flags & VIFF_REGISTER)
		V_reg_vif_num = VIFI_INVALID;

	bzero((caddr_t)vifp, sizeof (*vifp));

	CTR2(KTR_IPMF, "%s: delete vif %d", __func__, (int)vifi);

	/* Adjust numvifs down */
	for (vifi = V_numvifs; vifi > 0; vifi--)
		if (!in_nullhost(V_viftable[vifi-1].v_lcl_addr))
			break;
	V_numvifs = vifi;

	return 0;
}

static int
del_vif(vifi_t vifi)
{
	int cc;

	VIF_LOCK();
	cc = del_vif_locked(vifi);
	VIF_UNLOCK();

	return cc;
}
/*
 * update an mfc entry without resetting counters and S,G addresses.
 */
static void
update_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp)
{
	int i;

	rt->mfc_parent = mfccp->mfcc_parent;
	for (i = 0; i < V_numvifs; i++) {
		rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
		rt->mfc_flags[i] = mfccp->mfcc_flags[i] & V_mrt_api_config &
		    MRT_MFC_FLAGS_ALL;
	}
	/* set the RP address */
	if (V_mrt_api_config & MRT_MFC_RP)
		rt->mfc_rp = mfccp->mfcc_rp;
	else
		rt->mfc_rp.s_addr = INADDR_ANY;
}

/*
 * fully initialize an mfc entry from the parameter.
 */
static void
init_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp)
{
	rt->mfc_origin = mfccp->mfcc_origin;
	rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp;

	update_mfc_params(rt, mfccp);

	/* initialize pkt counters per src-grp */
	rt->mfc_pkt_cnt = 0;
	rt->mfc_byte_cnt = 0;
	rt->mfc_wrong_if = 0;
	timevalclear(&rt->mfc_last_assert);
}

static void
expire_mfc(struct mfc *rt)
{
	struct rtdetq *rte, *nrte;

	MFC_LOCK_ASSERT();

	free_bw_list(rt->mfc_bw_meter);

	TAILQ_FOREACH_SAFE(rte, &rt->mfc_stall, rte_link, nrte) {
		m_freem(rte->m);
		TAILQ_REMOVE(&rt->mfc_stall, rte, rte_link);
		free(rte, M_MRTABLE);
	}

	LIST_REMOVE(rt, mfc_hash);
	free(rt, M_MRTABLE);
}
/*
 * Add an mfc entry
 */
static int
add_mfc(struct mfcctl2 *mfccp)
{
	struct mfc *rt;
	struct rtdetq *rte, *nrte;
	u_long hash = 0;
	u_short nstl;

	VIF_LOCK();
	MFC_LOCK();

	rt = mfc_find(&mfccp->mfcc_origin, &mfccp->mfcc_mcastgrp);

	/* If an entry already exists, just update the fields */
	if (rt) {
		CTR4(KTR_IPMF, "%s: update mfc orig 0x%08x group %lx parent %x",
		    __func__, ntohl(mfccp->mfcc_origin.s_addr),
		    (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr),
		    mfccp->mfcc_parent);
		update_mfc_params(rt, mfccp);
		MFC_UNLOCK();
		VIF_UNLOCK();
		return (0);
	}

	/*
	 * Find the entry for which the upcall was made and update it.
	 */
	nstl = 0;
	hash = MFCHASH(mfccp->mfcc_origin, mfccp->mfcc_mcastgrp);
	LIST_FOREACH(rt, &V_mfchashtbl[hash], mfc_hash) {
		if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) &&
		    in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp) &&
		    !TAILQ_EMPTY(&rt->mfc_stall)) {
			CTR5(KTR_IPMF,
			    "%s: add mfc orig 0x%08x group %lx parent %x qh %p",
			    __func__, ntohl(mfccp->mfcc_origin.s_addr),
			    (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr),
			    mfccp->mfcc_parent,
			    TAILQ_FIRST(&rt->mfc_stall));
			if (nstl++)
				CTR1(KTR_IPMF, "%s: multiple matches", __func__);

			init_mfc_params(rt, mfccp);
			rt->mfc_expire = 0;	/* Don't clean this guy up */
			V_nexpire[hash]--;

			/* Free queued packets, but attempt to forward them first. */
			TAILQ_FOREACH_SAFE(rte, &rt->mfc_stall, rte_link, nrte) {
				if (rte->ifp != NULL)
					ip_mdq(rte->m, rte->ifp, rt, -1);
				m_freem(rte->m);
				TAILQ_REMOVE(&rt->mfc_stall, rte, rte_link);
				rt->mfc_nstall--;
				free(rte, M_MRTABLE);
			}
		}
	}

	/*
	 * It is possible that an entry is being inserted without an upcall
	 */
	if (nstl == 0) {
		CTR1(KTR_IPMF, "%s: adding mfc w/o upcall", __func__);
		LIST_FOREACH(rt, &V_mfchashtbl[hash], mfc_hash) {
			if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) &&
			    in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp)) {
				init_mfc_params(rt, mfccp);
				if (rt->mfc_expire)
					V_nexpire[hash]--;
				rt->mfc_expire = 0;
				break; /* XXX */
			}
		}

		if (rt == NULL) {	/* no upcall, so make a new entry */
			rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT);
			if (rt == NULL) {
				MFC_UNLOCK();
				VIF_UNLOCK();
				return (ENOBUFS);
			}

			init_mfc_params(rt, mfccp);
			TAILQ_INIT(&rt->mfc_stall);
			rt->mfc_nstall = 0;

			rt->mfc_expire = 0;
			rt->mfc_bw_meter = NULL;

			/* insert new entry at head of hash chain */
			LIST_INSERT_HEAD(&V_mfchashtbl[hash], rt, mfc_hash);
		}
	}

	MFC_UNLOCK();
	VIF_UNLOCK();

	return (0);
}
/*
 * Delete an mfc entry
 */
static int
del_mfc(struct mfcctl2 *mfccp)
{
	struct in_addr origin;
	struct in_addr mcastgrp;
	struct mfc *rt;

	origin = mfccp->mfcc_origin;
	mcastgrp = mfccp->mfcc_mcastgrp;

	CTR3(KTR_IPMF, "%s: delete mfc orig 0x%08x group %lx", __func__,
	    ntohl(origin.s_addr), (u_long)ntohl(mcastgrp.s_addr));

	MFC_LOCK();

	rt = mfc_find(&origin, &mcastgrp);
	if (rt == NULL) {
		MFC_UNLOCK();
		return EADDRNOTAVAIL;
	}

	/*
	 * free the bw_meter entries
	 */
	free_bw_list(rt->mfc_bw_meter);
	rt->mfc_bw_meter = NULL;

	LIST_REMOVE(rt, mfc_hash);
	free(rt, M_MRTABLE);

	MFC_UNLOCK();

	return (0);
}
/*
 * Send a message to the routing daemon on the multicast routing socket.
 */
static int
socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in *src)
{

	if (s) {
		SOCKBUF_LOCK(&s->so_rcv);
		if (sbappendaddr_locked(&s->so_rcv, (struct sockaddr *)src, mm,
		    NULL) != 0) {
			sorwakeup_locked(s);
			return 0;
		}
		SOCKBUF_UNLOCK(&s->so_rcv);
	}
	m_freem(mm);
	return -1;
}
/*
 * IP multicast forwarding function. This function assumes that the packet
 * pointed to by "ip" has arrived on (or is about to be sent to) the interface
 * pointed to by "ifp", and the packet is to be relayed to other networks
 * that have members of the packet's destination IP multicast group.
 *
 * The packet is returned unscathed to the caller, unless it is
 * erroneous, in which case a non-zero return value tells the caller to
 * discard it.
 */
#define	TUNNEL_LEN 12	/* # bytes of IP option for tunnel encapsulation */
static int
X_ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m,
    struct ip_moptions *imo)
{
	struct mfc *rt;
	int error;
	vifi_t vifi;

	CTR3(KTR_IPMF, "ip_mforward: src 0x%08x group %lx ifp %p",
	    ntohl(ip->ip_src.s_addr), (u_long)ntohl(ip->ip_dst.s_addr), ifp);

	if (ip->ip_hl < (sizeof(struct ip) + TUNNEL_LEN) >> 2 ||
	    ((u_char *)(ip + 1))[1] != IPOPT_LSRR) {
		/*
		 * Packet arrived via a physical interface or
		 * an encapsulated tunnel or a register_vif.
		 */
	} else {
		/*
		 * Packet arrived through a source-route tunnel.
		 * Source-route tunnels are no longer supported.
		 */
		return (1);
	}

	VIF_LOCK();
	MFC_LOCK();
	if (imo && ((vifi = imo->imo_multicast_vif) < V_numvifs)) {
		if (ip->ip_ttl < MAXTTL)
			ip->ip_ttl++; /* compensate for -1 in *_send routines */
		error = ip_mdq(m, ifp, NULL, vifi);
		MFC_UNLOCK();
		VIF_UNLOCK();
		return error;
	}

	/*
	 * Don't forward a packet with time-to-live of zero or one,
	 * or a packet destined to a local-only group.
	 */
	if (ip->ip_ttl <= 1 || IN_LOCAL_GROUP(ntohl(ip->ip_dst.s_addr))) {
		MFC_UNLOCK();
		VIF_UNLOCK();
		return 0;
	}

	/*
	 * Determine forwarding vifs from the forwarding cache table
	 */
	MRTSTAT_INC(mrts_mfc_lookups);
	rt = mfc_find(&ip->ip_src, &ip->ip_dst);

	/* Entry exists, so forward if necessary */
	if (rt != NULL) {
		error = ip_mdq(m, ifp, rt, -1);
		MFC_UNLOCK();
		VIF_UNLOCK();
		return error;
	} else {
		/*
		 * If we don't have a route for the packet's origin,
		 * make a copy of the packet and send a message to the
		 * routing daemon.
		 */
		struct mbuf *mb0;
		struct rtdetq *rte;
		u_long hash;
		int hlen = ip->ip_hl << 2;

		MRTSTAT_INC(mrts_mfc_misses);
		MRTSTAT_INC(mrts_no_route);
		CTR2(KTR_IPMF, "ip_mforward: no mfc for (0x%08x,%lx)",
		    ntohl(ip->ip_src.s_addr), (u_long)ntohl(ip->ip_dst.s_addr));

		/*
		 * Allocate mbufs early so that we don't do extra work if we are
		 * just going to fail anyway. Make sure to pullup the header so
		 * that other people can't step on it.
		 */
		rte = (struct rtdetq *)malloc((sizeof *rte), M_MRTABLE,
		    M_NOWAIT|M_ZERO);
		if (rte == NULL) {
			MFC_UNLOCK();
			VIF_UNLOCK();
			return ENOBUFS;
		}

		mb0 = m_copypacket(m, M_NOWAIT);
		if (mb0 && (!M_WRITABLE(mb0) || mb0->m_len < hlen))
			mb0 = m_pullup(mb0, hlen);
		if (mb0 == NULL) {
			free(rte, M_MRTABLE);
			MFC_UNLOCK();
			VIF_UNLOCK();
			return ENOBUFS;
		}

		/* is there an upcall waiting for this flow? */
		hash = MFCHASH(ip->ip_src, ip->ip_dst);
		LIST_FOREACH(rt, &V_mfchashtbl[hash], mfc_hash) {
			if (in_hosteq(ip->ip_src, rt->mfc_origin) &&
			    in_hosteq(ip->ip_dst, rt->mfc_mcastgrp) &&
			    !TAILQ_EMPTY(&rt->mfc_stall))
				break;
		}

		if (rt == NULL) {
			int i;
			struct igmpmsg *im;
			struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
			struct mbuf *mm;

			/*
			 * Locate the vifi for the incoming interface for this packet.
			 * If none found, drop packet.
			 */
			for (vifi = 0; vifi < V_numvifs &&
			    V_viftable[vifi].v_ifp != ifp; vifi++)
				;
			if (vifi >= V_numvifs) /* vif not found, drop packet */
				goto non_fatal;

			/* no upcall, so make a new entry */
			rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT);
			if (rt == NULL)
				goto fail;

			/* Make a copy of the header to send to the user level process */
			mm = m_copym(mb0, 0, hlen, M_NOWAIT);
			if (mm == NULL)
				goto fail1;

			/*
			 * Send message to routing daemon to install
			 * a route into the kernel table
			 */
			im = mtod(mm, struct igmpmsg *);
			im->im_msgtype = IGMPMSG_NOCACHE;
			im->im_mbz = 0;
			im->im_vif = vifi;

			MRTSTAT_INC(mrts_upcalls);

			k_igmpsrc.sin_addr = ip->ip_src;
			if (socket_send(V_ip_mrouter, mm, &k_igmpsrc) < 0) {
				CTR0(KTR_IPMF, "ip_mforward: socket queue full");
				MRTSTAT_INC(mrts_upq_sockfull);
fail1:
				free(rt, M_MRTABLE);
fail:
				free(rte, M_MRTABLE);
				m_freem(mb0);
				MFC_UNLOCK();
				VIF_UNLOCK();
				return ENOBUFS;
			}

			/* insert new entry at head of hash chain */
			rt->mfc_origin.s_addr = ip->ip_src.s_addr;
			rt->mfc_mcastgrp.s_addr = ip->ip_dst.s_addr;
			rt->mfc_expire = UPCALL_EXPIRE;
			V_nexpire[hash]++;
			for (i = 0; i < V_numvifs; i++) {
				rt->mfc_ttls[i] = 0;
				rt->mfc_flags[i] = 0;
			}
			rt->mfc_parent = -1;

			/* clear the RP address */
			rt->mfc_rp.s_addr = INADDR_ANY;
			rt->mfc_bw_meter = NULL;

			/* initialize pkt counters per src-grp */
			rt->mfc_pkt_cnt = 0;
			rt->mfc_byte_cnt = 0;
			rt->mfc_wrong_if = 0;
			timevalclear(&rt->mfc_last_assert);

			TAILQ_INIT(&rt->mfc_stall);
			rt->mfc_nstall = 0;

			/* link into table */
			LIST_INSERT_HEAD(&V_mfchashtbl[hash], rt, mfc_hash);
			TAILQ_INSERT_HEAD(&rt->mfc_stall, rte, rte_link);
			rt->mfc_nstall++;
		} else {
			/* determine if queue has overflowed */
			if (rt->mfc_nstall > MAX_UPQ) {
				MRTSTAT_INC(mrts_upq_ovflw);
non_fatal:
				free(rte, M_MRTABLE);
				m_freem(mb0);
				MFC_UNLOCK();
				VIF_UNLOCK();
				return (0);
			}
			TAILQ_INSERT_TAIL(&rt->mfc_stall, rte, rte_link);
			rt->mfc_nstall++;
		}

		rte->m = mb0;
		rte->ifp = ifp;

		MFC_UNLOCK();
		VIF_UNLOCK();

		return 0;
	}
}
/*
 * Clean up the cache entry if upcall is not serviced
 */
static void
expire_upcalls(void *arg)
{
	u_long i;

	CURVNET_SET((struct vnet *) arg);

	MFC_LOCK();

	for (i = 0; i < mfchashsize; i++) {
		struct mfc *rt, *nrt;

		if (V_nexpire[i] == 0)
			continue;

		LIST_FOREACH_SAFE(rt, &V_mfchashtbl[i], mfc_hash, nrt) {
			if (TAILQ_EMPTY(&rt->mfc_stall))
				continue;

			if (rt->mfc_expire == 0 || --rt->mfc_expire > 0)
				continue;

			/*
			 * free the bw_meter entries
			 */
			while (rt->mfc_bw_meter != NULL) {
				struct bw_meter *x = rt->mfc_bw_meter;

				rt->mfc_bw_meter = x->bm_mfc_next;
				free(x, M_BWMETER);
			}

			MRTSTAT_INC(mrts_cache_cleanups);
			CTR3(KTR_IPMF, "%s: expire (%lx, %lx)", __func__,
			    (u_long)ntohl(rt->mfc_origin.s_addr),
			    (u_long)ntohl(rt->mfc_mcastgrp.s_addr));

			expire_mfc(rt);
		}
	}

	MFC_UNLOCK();

	callout_reset(&V_expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls,
	    curvnet);

	CURVNET_RESTORE();
}
/*
 * Packet forwarding routine once entry in the cache is made
 */
static int
ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt, vifi_t xmt_vif)
{
	struct ip *ip = mtod(m, struct ip *);
	vifi_t vifi;
	int plen = ntohs(ip->ip_len);

	VIF_LOCK_ASSERT();

	/*
	 * If xmt_vif is not -1, send on only the requested vif.
	 *
	 * (since vifi_t is u_short, -1 becomes MAXUSHORT, which is > numvifs.)
	 */
	if (xmt_vif < V_numvifs) {
		if (V_viftable[xmt_vif].v_flags & VIFF_REGISTER)
			pim_register_send(ip, V_viftable + xmt_vif, m, rt);
		else
			phyint_send(ip, V_viftable + xmt_vif, m);
		return 1;
	}

	/*
	 * Don't forward if it didn't arrive from the parent vif for its origin.
	 */
	vifi = rt->mfc_parent;
	if ((vifi >= V_numvifs) || (V_viftable[vifi].v_ifp != ifp)) {
		CTR4(KTR_IPMF, "%s: rx on wrong ifp %p (vifi %d, v_ifp %p)",
		    __func__, ifp, (int)vifi, V_viftable[vifi].v_ifp);
		MRTSTAT_INC(mrts_wrong_if);
		++rt->mfc_wrong_if;
		/*
		 * If we are doing PIM assert processing, send a message
		 * to the routing daemon.
		 *
		 * XXX: A PIM-SM router needs the WRONGVIF detection so it
		 * can complete the SPT switch, regardless of the type
		 * of the iif (broadcast media, GRE tunnel, etc).
		 */
		if (V_pim_assert_enabled && (vifi < V_numvifs) &&
		    V_viftable[vifi].v_ifp) {
			if (ifp == &V_multicast_register_if)
				PIMSTAT_INC(pims_rcv_registers_wrongiif);

			/* Get vifi for the incoming packet */
			for (vifi = 0; vifi < V_numvifs && V_viftable[vifi].v_ifp != ifp;
			    vifi++)
				;
			if (vifi >= V_numvifs)
				return 0;	/* The iif is not found: ignore the packet. */

			if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_DISABLE_WRONGVIF)
				return 0;	/* WRONGVIF disabled: ignore the packet */

			if (ratecheck(&rt->mfc_last_assert, &pim_assert_interval)) {
				struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
				struct igmpmsg *im;
				int hlen = ip->ip_hl << 2;
				struct mbuf *mm = m_copym(m, 0, hlen, M_NOWAIT);

				if (mm && (!M_WRITABLE(mm) || mm->m_len < hlen))
					mm = m_pullup(mm, hlen);
				if (mm == NULL)
					return ENOBUFS;

				im = mtod(mm, struct igmpmsg *);
				im->im_msgtype = IGMPMSG_WRONGVIF;
				im->im_mbz = 0;
				im->im_vif = vifi;

				MRTSTAT_INC(mrts_upcalls);

				k_igmpsrc.sin_addr = im->im_src;
				if (socket_send(V_ip_mrouter, mm, &k_igmpsrc) < 0) {
					CTR1(KTR_IPMF, "%s: socket queue full", __func__);
					MRTSTAT_INC(mrts_upq_sockfull);
					return ENOBUFS;
				}
			}
		}
		return 0;
	}

	/* If I sourced this packet, it counts as output, else it was input. */
	if (in_hosteq(ip->ip_src, V_viftable[vifi].v_lcl_addr)) {
		V_viftable[vifi].v_pkt_out++;
		V_viftable[vifi].v_bytes_out += plen;
	} else {
		V_viftable[vifi].v_pkt_in++;
		V_viftable[vifi].v_bytes_in += plen;
	}
	rt->mfc_pkt_cnt++;
	rt->mfc_byte_cnt += plen;

	/*
	 * For each vif, decide if a copy of the packet should be forwarded.
	 * Forward if:
	 *	- the ttl exceeds the vif's threshold
	 *	- there are group members downstream on interface
	 */
	for (vifi = 0; vifi < V_numvifs; vifi++)
		if ((rt->mfc_ttls[vifi] > 0) && (ip->ip_ttl > rt->mfc_ttls[vifi])) {
			V_viftable[vifi].v_pkt_out++;
			V_viftable[vifi].v_bytes_out += plen;
			if (V_viftable[vifi].v_flags & VIFF_REGISTER)
				pim_register_send(ip, V_viftable + vifi, m, rt);
			else
				phyint_send(ip, V_viftable + vifi, m);
		}

	/*
	 * Perform upcall-related bw measuring.
	 */
	if (rt->mfc_bw_meter != NULL) {
		struct bw_meter *x;
		struct timeval now;

		microtime(&now);
		MFC_LOCK_ASSERT();
		for (x = rt->mfc_bw_meter; x != NULL; x = x->bm_mfc_next)
			bw_meter_receive_packet(x, plen, &now);
	}

	return 0;
}
/*
 * Check if a vif number is legal/ok. This is used by in_mcast.c.
 */
static int
X_legal_vif_num(int vif)
{
	int ret;

	ret = 0;
	if (vif < 0)
		return (ret);

	VIF_LOCK();
	if (vif < V_numvifs)
		ret = 1;
	VIF_UNLOCK();

	return (ret);
}

/*
 * Return the local address used by this vif
 */
static u_long
X_ip_mcast_src(int vifi)
{
	in_addr_t addr;

	addr = INADDR_ANY;
	if (vifi < 0)
		return (addr);

	VIF_LOCK();
	if (vifi < V_numvifs)
		addr = V_viftable[vifi].v_lcl_addr.s_addr;
	VIF_UNLOCK();

	return (addr);
}
static void
phyint_send(struct ip *ip, struct vif *vifp, struct mbuf *m)
{
	struct mbuf *mb_copy;
	int hlen = ip->ip_hl << 2;

	VIF_LOCK_ASSERT();

	/*
	 * Make a new reference to the packet; make sure that
	 * the IP header is actually copied, not just referenced,
	 * so that ip_output() only scribbles on the copy.
	 */
	mb_copy = m_copypacket(m, M_NOWAIT);
	if (mb_copy && (!M_WRITABLE(mb_copy) || mb_copy->m_len < hlen))
		mb_copy = m_pullup(mb_copy, hlen);
	if (mb_copy == NULL)
		return;

	send_packet(vifp, mb_copy);
}

static void
send_packet(struct vif *vifp, struct mbuf *m)
{
	struct ip_moptions imo;
	int error __unused;

	VIF_LOCK_ASSERT();

	imo.imo_multicast_ifp = vifp->v_ifp;
	imo.imo_multicast_ttl = mtod(m, struct ip *)->ip_ttl - 1;
	imo.imo_multicast_loop = 1;
	imo.imo_multicast_vif = -1;
	STAILQ_INIT(&imo.imo_head);

	/*
	 * Re-entrancy should not be a problem here, because
	 * the packets that we send out and are looped back at us
	 * should get rejected because they appear to come from
	 * the loopback interface, thus preventing looping.
	 */
	error = ip_output(m, NULL, NULL, IP_FORWARDING, &imo, NULL);
	CTR3(KTR_IPMF, "%s: vif %td err %d", __func__,
	    (ptrdiff_t)(vifp - V_viftable), error);
}
/*
 * Stubs for old RSVP socket shim implementation.
 */
static int
X_ip_rsvp_vif(struct socket *so __unused, struct sockopt *sopt __unused)
{

	return (EOPNOTSUPP);
}

static void
X_ip_rsvp_force_done(struct socket *so __unused)
{

}

static int
X_rsvp_input(struct mbuf **mp, int *offp, int proto)
{
	struct mbuf *m;

	m = *mp;
	*mp = NULL;
	if (!V_rsvp_on)
		m_freem(m);
	return (IPPROTO_DONE);
}
/*
 * Code for bandwidth monitors
 */

/*
 * Define common interface for timeval-related methods
 */
#define	BW_TIMEVALCMP(tvp, uvp, cmp) timevalcmp((tvp), (uvp), cmp)
#define	BW_TIMEVALDECR(vvp, uvp) timevalsub((vvp), (uvp))
#define	BW_TIMEVALADD(vvp, uvp) timevaladd((vvp), (uvp))

static uint32_t
compute_bw_meter_flags(struct bw_upcall *req)
{
	uint32_t flags = 0;

	if (req->bu_flags & BW_UPCALL_UNIT_PACKETS)
		flags |= BW_METER_UNIT_PACKETS;
	if (req->bu_flags & BW_UPCALL_UNIT_BYTES)
		flags |= BW_METER_UNIT_BYTES;
	if (req->bu_flags & BW_UPCALL_GEQ)
		flags |= BW_METER_GEQ;
	if (req->bu_flags & BW_UPCALL_LEQ)
		flags |= BW_METER_LEQ;

	return flags;
}
/*
 * Add a bw_meter entry
 */
static int
add_bw_upcall(struct bw_upcall *req)
{
	struct mfc *mfc;
	struct timeval delta = { BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC,
	    BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC };
	struct timeval now;
	struct bw_meter *x;
	uint32_t flags;

	if (!(V_mrt_api_config & MRT_MFC_BW_UPCALL))
		return EOPNOTSUPP;

	/* Test if the flags are valid */
	if (!(req->bu_flags & (BW_UPCALL_UNIT_PACKETS | BW_UPCALL_UNIT_BYTES)))
		return EINVAL;
	if (!(req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ)))
		return EINVAL;
	if ((req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ))
	    == (BW_UPCALL_GEQ | BW_UPCALL_LEQ))
		return EINVAL;

	/* Test if the threshold time interval is valid */
	if (BW_TIMEVALCMP(&req->bu_threshold.b_time, &delta, <))
		return EINVAL;

	flags = compute_bw_meter_flags(req);

	/*
	 * Check if we already have an identical bw_meter entry
	 */
	MFC_LOCK();
	mfc = mfc_find(&req->bu_src, &req->bu_dst);
	if (mfc == NULL) {
		MFC_UNLOCK();
		return EADDRNOTAVAIL;
	}
	for (x = mfc->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) {
		if ((BW_TIMEVALCMP(&x->bm_threshold.b_time,
		    &req->bu_threshold.b_time, ==)) &&
		    (x->bm_threshold.b_packets == req->bu_threshold.b_packets) &&
		    (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) &&
		    (x->bm_flags & BW_METER_USER_FLAGS) == flags) {
			MFC_UNLOCK();
			return 0;	/* XXX Already installed */
		}
	}

	/* Allocate the new bw_meter entry */
	x = (struct bw_meter *)malloc(sizeof(*x), M_BWMETER, M_NOWAIT);
	if (x == NULL) {
		MFC_UNLOCK();
		return ENOBUFS;
	}

	/* Set the new bw_meter entry */
	x->bm_threshold.b_time = req->bu_threshold.b_time;
	microtime(&now);
	x->bm_start_time = now;
	x->bm_threshold.b_packets = req->bu_threshold.b_packets;
	x->bm_threshold.b_bytes = req->bu_threshold.b_bytes;
	x->bm_measured.b_packets = 0;
	x->bm_measured.b_bytes = 0;
	x->bm_flags = flags;
	x->bm_time_next = NULL;
	x->bm_time_hash = BW_METER_BUCKETS;

	/* Add the new bw_meter entry to the front of entries for this MFC */
	x->bm_mfc = mfc;
	x->bm_mfc_next = mfc->mfc_bw_meter;
	mfc->mfc_bw_meter = x;
	schedule_bw_meter(x, &now);
	MFC_UNLOCK();

	return 0;
}
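/*
 * Usage sketch (exposition only; fields as used above, error handling
 * elided): request an upcall when an (S,G) flow carries 100 or more
 * packets within any 5-second window:
 *
 *	struct bw_upcall req;
 *
 *	bzero(&req, sizeof(req));
 *	req.bu_src = src;
 *	req.bu_dst = grp;
 *	req.bu_flags = BW_UPCALL_UNIT_PACKETS | BW_UPCALL_GEQ;
 *	req.bu_threshold.b_time.tv_sec = 5;
 *	req.bu_threshold.b_packets = 100;
 *	setsockopt(s, IPPROTO_IP, MRT_ADD_BW_UPCALL, &req, sizeof(req));
 */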
static void
free_bw_list(struct bw_meter *list)
{
	while (list != NULL) {
		struct bw_meter *x = list;

		list = list->bm_mfc_next;
		unschedule_bw_meter(x);
		free(x, M_BWMETER);
	}
}

/*
 * Delete one or multiple bw_meter entries
 */
static int
del_bw_upcall(struct bw_upcall *req)
{
	struct mfc *mfc;
	struct bw_meter *x;

	if (!(V_mrt_api_config & MRT_MFC_BW_UPCALL))
		return EOPNOTSUPP;

	MFC_LOCK();

	/* Find the corresponding MFC entry */
	mfc = mfc_find(&req->bu_src, &req->bu_dst);
	if (mfc == NULL) {
		MFC_UNLOCK();
		return EADDRNOTAVAIL;
	} else if (req->bu_flags & BW_UPCALL_DELETE_ALL) {
		/*
		 * Delete all bw_meter entries for this mfc
		 */
		struct bw_meter *list;

		list = mfc->mfc_bw_meter;
		mfc->mfc_bw_meter = NULL;
		free_bw_list(list);
		MFC_UNLOCK();
		return 0;
	} else {		/* Delete a single bw_meter entry */
		struct bw_meter *prev;
		uint32_t flags = 0;

		flags = compute_bw_meter_flags(req);

		/* Find the bw_meter entry to delete */
		for (prev = NULL, x = mfc->mfc_bw_meter; x != NULL;
		    prev = x, x = x->bm_mfc_next) {
			if ((BW_TIMEVALCMP(&x->bm_threshold.b_time,
			    &req->bu_threshold.b_time, ==)) &&
			    (x->bm_threshold.b_packets == req->bu_threshold.b_packets) &&
			    (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) &&
			    (x->bm_flags & BW_METER_USER_FLAGS) == flags)
				break;
		}
		if (x != NULL) { /* Delete entry from the list for this MFC */
			if (prev != NULL)
				prev->bm_mfc_next = x->bm_mfc_next; /* remove from middle */
			else
				x->bm_mfc->mfc_bw_meter = x->bm_mfc_next; /* new head of list */

			unschedule_bw_meter(x);
			MFC_UNLOCK();
			/* Free the bw_meter entry */
			free(x, M_BWMETER);
			return 0;
		} else {
			MFC_UNLOCK();
			return EINVAL;
		}
	}
	/* NOTREACHED */
}

/*
 * Perform bandwidth measurement processing that may result in an upcall
 */
static void
bw_meter_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp)
{
    struct timeval delta;

    MFC_LOCK_ASSERT();

    delta = *nowp;
    BW_TIMEVALDECR(&delta, &x->bm_start_time);

    if (x->bm_flags & BW_METER_GEQ) {
        /*
         * Processing for ">=" type of bw_meter entry
         */
        if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) {
            /* Reset the bw_meter entry */
            x->bm_start_time = *nowp;
            x->bm_measured.b_packets = 0;
            x->bm_measured.b_bytes = 0;
            x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
        }

        /* Record that a packet is received */
        x->bm_measured.b_packets++;
        x->bm_measured.b_bytes += plen;

        /*
         * Test if we should deliver an upcall
         */
        if (!(x->bm_flags & BW_METER_UPCALL_DELIVERED)) {
            if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
                 (x->bm_measured.b_packets >= x->bm_threshold.b_packets)) ||
                ((x->bm_flags & BW_METER_UNIT_BYTES) &&
                 (x->bm_measured.b_bytes >= x->bm_threshold.b_bytes))) {
                /* Prepare an upcall for delivery */
                bw_meter_prepare_upcall(x, nowp);
                x->bm_flags |= BW_METER_UPCALL_DELIVERED;
            }
        }
    } else if (x->bm_flags & BW_METER_LEQ) {
        /*
         * Processing for "<=" type of bw_meter entry
         */
        if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) {
            /*
             * We are behind time with the multicast forwarding table
             * scanning for "<=" type of bw_meter entries, so test now
             * if we should deliver an upcall.
             */
            if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
                 (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) ||
                ((x->bm_flags & BW_METER_UNIT_BYTES) &&
                 (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) {
                /* Prepare an upcall for delivery */
                bw_meter_prepare_upcall(x, nowp);
            }
            /* Reschedule the bw_meter entry */
            unschedule_bw_meter(x);
            schedule_bw_meter(x, nowp);
        }

        /* Record that a packet is received */
        x->bm_measured.b_packets++;
        x->bm_measured.b_bytes += plen;

        /*
         * Test if we should restart the measuring interval
         */
        if ((x->bm_flags & BW_METER_UNIT_PACKETS &&
             x->bm_measured.b_packets <= x->bm_threshold.b_packets) ||
            (x->bm_flags & BW_METER_UNIT_BYTES &&
             x->bm_measured.b_bytes <= x->bm_threshold.b_bytes)) {
            /* Don't restart the measuring interval */
        } else {
            /* Do restart the measuring interval */
            /*
             * XXX: note that we don't unschedule and schedule, because this
             * might be too much overhead per packet. Instead, when we process
             * all entries for a given timer hash bin, we check whether it is
             * really a timeout. If not, we reschedule at that time.
             */
            x->bm_start_time = *nowp;
            x->bm_measured.b_packets = 0;
            x->bm_measured.b_bytes = 0;
            x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
        }
    }
}

/*
 * Prepare a bandwidth-related upcall
 */
static void
bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp)
{
    struct timeval delta;
    struct bw_upcall *u;

    MFC_LOCK_ASSERT();

    /*
     * Compute the measured time interval
     */
    delta = *nowp;
    BW_TIMEVALDECR(&delta, &x->bm_start_time);

    /*
     * If there are too many pending upcalls, deliver them now
     */
    if (V_bw_upcalls_n >= BW_UPCALLS_MAX)
        bw_upcalls_send();

    /*
     * Set the bw_upcall entry
     */
    u = &V_bw_upcalls[V_bw_upcalls_n++];
    u->bu_src = x->bm_mfc->mfc_origin;
    u->bu_dst = x->bm_mfc->mfc_mcastgrp;
    u->bu_threshold.b_time = x->bm_threshold.b_time;
    u->bu_threshold.b_packets = x->bm_threshold.b_packets;
    u->bu_threshold.b_bytes = x->bm_threshold.b_bytes;
    u->bu_measured.b_time = delta;
    u->bu_measured.b_packets = x->bm_measured.b_packets;
    u->bu_measured.b_bytes = x->bm_measured.b_bytes;
    u->bu_flags = 0;
    if (x->bm_flags & BW_METER_UNIT_PACKETS)
        u->bu_flags |= BW_UPCALL_UNIT_PACKETS;
    if (x->bm_flags & BW_METER_UNIT_BYTES)
        u->bu_flags |= BW_UPCALL_UNIT_BYTES;
    if (x->bm_flags & BW_METER_GEQ)
        u->bu_flags |= BW_UPCALL_GEQ;
    if (x->bm_flags & BW_METER_LEQ)
        u->bu_flags |= BW_UPCALL_LEQ;
}

/*
 * Send the pending bandwidth-related upcalls
 */
static void
bw_upcalls_send(void)
{
    struct mbuf *m;
    int len = V_bw_upcalls_n * sizeof(V_bw_upcalls[0]);
    struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
    static struct igmpmsg igmpmsg = { 0,                /* unused1 */
                                      0,                /* unused2 */
                                      IGMPMSG_BW_UPCALL,/* im_msgtype */
                                      0,                /* im_mbz */
                                      0,                /* im_vif */
                                      0,                /* unused3 */
                                      { 0 },            /* im_src */
                                      { 0 } };          /* im_dst */

    MFC_LOCK_ASSERT();

    if (V_bw_upcalls_n == 0)
        return;                 /* No pending upcalls */

    V_bw_upcalls_n = 0;
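
    /*
     * Note: the pending-upcall count is consumed here, so if the mbuf
     * allocation below fails, the queued upcalls are dropped.
     */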
    /*
     * Allocate a new mbuf, initialize it with the header and
     * the payload for the pending calls.
     */
    m = m_gethdr(M_NOWAIT, MT_DATA);
    if (m == NULL) {
        log(LOG_WARNING, "bw_upcalls_send: cannot allocate mbuf\n");
        return;
    }

    m_copyback(m, 0, sizeof(struct igmpmsg), (caddr_t)&igmpmsg);
    m_copyback(m, sizeof(struct igmpmsg), len, (caddr_t)&V_bw_upcalls[0]);

    /*
     * Send the upcalls
     * XXX do we need to set the address in k_igmpsrc ?
     */
    MRTSTAT_INC(mrts_upcalls);
    if (socket_send(V_ip_mrouter, m, &k_igmpsrc) < 0) {
        log(LOG_WARNING, "bw_upcalls_send: ip_mrouter socket queue full\n");
        MRTSTAT_INC(mrts_upq_sockfull);
    }
}

/*
 * Compute the timeout hash value for the bw_meter entries
 */
#define BW_METER_TIMEHASH(bw_meter, hash)                               \
    do {                                                                \
        struct timeval next_timeval = (bw_meter)->bm_start_time;        \
                                                                        \
        BW_TIMEVALADD(&next_timeval, &(bw_meter)->bm_threshold.b_time); \
        (hash) = next_timeval.tv_sec;                                   \
        if (next_timeval.tv_usec)                                       \
            (hash)++;   /* XXX: make sure we don't timeout early */     \
        (hash) %= BW_METER_BUCKETS;                                     \
    } while (0)

/*
 * Schedule a timer to periodically process "<=" type bw_meter entries,
 * by linking the entry into the proper hash bucket.
 */
static void
schedule_bw_meter(struct bw_meter *x, struct timeval *nowp)
{
    int time_hash;

    MFC_LOCK_ASSERT();

    if (!(x->bm_flags & BW_METER_LEQ))
        return;         /* XXX: we schedule timers only for "<=" entries */

    /*
     * Reset the bw_meter entry
     */
    x->bm_start_time = *nowp;
    x->bm_measured.b_packets = 0;
    x->bm_measured.b_bytes = 0;
    x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;

    /*
     * Compute the timeout hash value and insert the entry
     */
    BW_METER_TIMEHASH(x, time_hash);
    x->bm_time_next = V_bw_meter_timers[time_hash];
    V_bw_meter_timers[time_hash] = x;
    x->bm_time_hash = time_hash;
}

/*
 * Unschedule the periodic timer that processes "<=" type bw_meter entries,
 * by removing the entry from the proper hash bucket.
 */
static void
unschedule_bw_meter(struct bw_meter *x)
{
    int time_hash;
    struct bw_meter *prev, *tmp;

    MFC_LOCK_ASSERT();

    if (!(x->bm_flags & BW_METER_LEQ))
        return;         /* XXX: we schedule timers only for "<=" entries */

    /*
     * Compute the timeout hash value and delete the entry
     */
    time_hash = x->bm_time_hash;
    if (time_hash >= BW_METER_BUCKETS)
        return;         /* Entry was not scheduled */

    for (prev = NULL, tmp = V_bw_meter_timers[time_hash];
         tmp != NULL; prev = tmp, tmp = tmp->bm_time_next)
        if (tmp == x)
            break;

    if (tmp == NULL)
        panic("unschedule_bw_meter: bw_meter entry not found");

    if (prev != NULL)
        prev->bm_time_next = x->bm_time_next;
    else
        V_bw_meter_timers[time_hash] = x->bm_time_next;

    x->bm_time_next = NULL;
    x->bm_time_hash = BW_METER_BUCKETS;
}

/*
 * Process all "<=" type bw_meter entries that are due now, and for each
 * entry prepare an upcall if necessary. Each processed entry is
 * rescheduled again for the (periodic) processing.
 *
 * This is run periodically (once per second normally). On each round,
 * all the potentially matching entries are in the hash slot that we are
 * looking at.
 */
static void
bw_meter_process(void)
{
    uint32_t loops;
    int i;
    struct timeval now, process_endtime;

    microtime(&now);
    if (V_last_tv_sec == now.tv_sec)
        return;                 /* nothing to do */

    loops = now.tv_sec - V_last_tv_sec;
    V_last_tv_sec = now.tv_sec;
    if (loops > BW_METER_BUCKETS)
        loops = BW_METER_BUCKETS;

    MFC_LOCK();
    /*
     * Process all bins of bw_meter entries from the one after the last
     * processed to the current one. On entry, i points to the last bucket
     * visited, so we need to increment i at the beginning of the loop.
     */
    for (i = (now.tv_sec - loops) % BW_METER_BUCKETS; loops > 0; loops--) {
        struct bw_meter *x, *tmp_list;

        if (++i >= BW_METER_BUCKETS)
            i = 0;

        /* Disconnect the list of bw_meter entries from the bin */
        tmp_list = V_bw_meter_timers[i];
        V_bw_meter_timers[i] = NULL;

        /* Process the list of bw_meter entries */
        while (tmp_list != NULL) {
            x = tmp_list;
            tmp_list = tmp_list->bm_time_next;

            /* Test if the time interval is over */
            process_endtime = x->bm_start_time;
            BW_TIMEVALADD(&process_endtime, &x->bm_threshold.b_time);
            if (BW_TIMEVALCMP(&process_endtime, &now, >)) {
                /* Not yet: reschedule, but don't reset */
                int time_hash;

                BW_METER_TIMEHASH(x, time_hash);
                if (time_hash == i && process_endtime.tv_sec == now.tv_sec) {
                    /*
                     * XXX: somehow the bin processing is a bit ahead of time.
                     * Put the entry in the next bin.
                     */
                    if (++time_hash >= BW_METER_BUCKETS)
                        time_hash = 0;
                }
                x->bm_time_next = V_bw_meter_timers[time_hash];
                V_bw_meter_timers[time_hash] = x;
                x->bm_time_hash = time_hash;
                continue;
            }

            /*
             * Test if we should deliver an upcall
             */
            if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
                 (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) ||
                ((x->bm_flags & BW_METER_UNIT_BYTES) &&
                 (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) {
                /* Prepare an upcall for delivery */
                bw_meter_prepare_upcall(x, &now);
            }

            /*
             * Reschedule for next processing
             */
            schedule_bw_meter(x, &now);
        }
    }

    /* Send all upcalls that are pending delivery */
    bw_upcalls_send();

    MFC_UNLOCK();
}

/*
 * A periodic function for sending all upcalls that are pending delivery
 */
static void
expire_bw_upcalls_send(void *arg)
{
    CURVNET_SET((struct vnet *) arg);

    MFC_LOCK();
    bw_upcalls_send();
    MFC_UNLOCK();

    callout_reset(&V_bw_upcalls_ch, BW_UPCALLS_PERIOD, expire_bw_upcalls_send,
        curvnet);
    CURVNET_RESTORE();
}

/*
 * A periodic function for scanning the multicast forwarding table and
 * processing all "<=" bw_meter entries.
 */
static void
expire_bw_meter_process(void *arg)
{
    CURVNET_SET((struct vnet *) arg);

    if (V_mrt_api_config & MRT_MFC_BW_UPCALL)
        bw_meter_process();

    callout_reset(&V_bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process,
        curvnet);
    CURVNET_RESTORE();
}

/*
 * End of bandwidth monitoring code
 */

/*
 * Send the packet up to the user-level daemon, or do the kernel-level
 * PIM Register encapsulation if appropriate.
 */
static int
pim_register_send(struct ip *ip, struct vif *vifp, struct mbuf *m,
    struct mfc *rt)
{
    struct mbuf *mb_copy, *mm;

    /*
     * Do not send IGMP_WHOLEPKT notifications to userland if the
     * rendezvous point was unspecified and we were told not to.
     */
    if (pim_squelch_wholepkt != 0 && (V_mrt_api_config & MRT_MFC_RP) &&
        in_nullhost(rt->mfc_rp))
        return 0;

    mb_copy = pim_register_prepare(ip, m);
    if (mb_copy == NULL)
        return ENOBUFS;

    /*
     * Send all the fragments. Note that the mbuf for each fragment
     * is freed by the sending machinery.
     */
    for (mm = mb_copy; mm; mm = mb_copy) {
        mb_copy = mm->m_nextpkt;
        mm->m_nextpkt = 0;
        mm = m_pullup(mm, sizeof(struct ip));
        if (mm != NULL) {
            ip = mtod(mm, struct ip *);
            if ((V_mrt_api_config & MRT_MFC_RP) && !in_nullhost(rt->mfc_rp)) {
                pim_register_send_rp(ip, vifp, mm, rt);
            } else {
                pim_register_send_upcall(ip, vifp, mm, rt);
            }
        }
    }

    return 0;
}

/*
 * Return a copy of the data packet that is ready for PIM Register
 * encapsulation.
 * XXX: Note that in the returned copy the IP header is a valid one.
 */
static struct mbuf *
pim_register_prepare(struct ip *ip, struct mbuf *m)
{
    struct mbuf *mb_copy = NULL;
    int mtu;

    /* Take care of delayed checksums */
    if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
        in_delayed_cksum(m);
        m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
    }

    /*
     * Copy the old packet & pullup its IP header into the
     * new mbuf so we can modify it.
     */
    mb_copy = m_copypacket(m, M_NOWAIT);
    if (mb_copy == NULL)
        return NULL;
    mb_copy = m_pullup(mb_copy, ip->ip_hl << 2);
    if (mb_copy == NULL)
        return NULL;

    /* take care of the TTL */
    ip = mtod(mb_copy, struct ip *);
    --ip->ip_ttl;

    /* Compute the MTU after the PIM Register encapsulation */
    mtu = 0xffff - sizeof(pim_encap_iphdr) - sizeof(pim_encap_pimhdr);

    if (ntohs(ip->ip_len) <= mtu) {
        /* Turn the IP header into a valid one */
        ip->ip_sum = 0;
        ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2);
    } else {
        /* Fragment the packet */
        mb_copy->m_pkthdr.csum_flags |= CSUM_IP;
        if (ip_fragment(ip, &mb_copy, mtu, 0) != 0) {
            m_freem(mb_copy);
            return NULL;
        }
    }
    return mb_copy;
}

/*
 * Send an upcall with the data packet to the user-level process.
 */
static int
pim_register_send_upcall(struct ip *ip, struct vif *vifp,
    struct mbuf *mb_copy, struct mfc *rt)
{
    struct mbuf *mb_first;
    int len = ntohs(ip->ip_len);
    struct igmpmsg *im;
    struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };

    VIF_LOCK_ASSERT();

    /*
     * Add a new mbuf with an upcall header
     */
    mb_first = m_gethdr(M_NOWAIT, MT_DATA);
    if (mb_first == NULL) {
        m_freem(mb_copy);
        return ENOBUFS;
    }
    mb_first->m_data += max_linkhdr;
    mb_first->m_pkthdr.len = len + sizeof(struct igmpmsg);
    mb_first->m_len = sizeof(struct igmpmsg);
    mb_first->m_next = mb_copy;

    /* Send message to routing daemon */
    im = mtod(mb_first, struct igmpmsg *);
    im->im_msgtype = IGMPMSG_WHOLEPKT;
    im->im_mbz = 0;
    im->im_vif = vifp - V_viftable;
    im->im_src = ip->ip_src;
    im->im_dst = ip->ip_dst;

    k_igmpsrc.sin_addr = ip->ip_src;

    MRTSTAT_INC(mrts_upcalls);

    if (socket_send(V_ip_mrouter, mb_first, &k_igmpsrc) < 0) {
        CTR1(KTR_IPMF, "%s: socket queue full", __func__);
        MRTSTAT_INC(mrts_upq_sockfull);
        return ENOBUFS;
    }

    /* Keep statistics */
    PIMSTAT_INC(pims_snd_registers_msgs);
    PIMSTAT_ADD(pims_snd_registers_bytes, len);

    return 0;
}

/*
 * Encapsulate the data packet in PIM Register message and send it to the RP.
 */
static int
pim_register_send_rp(struct ip *ip, struct vif *vifp, struct mbuf *mb_copy,
    struct mfc *rt)
{
    struct mbuf *mb_first;
    struct ip *ip_outer;
    struct pim_encap_pimhdr *pimhdr;
    int len = ntohs(ip->ip_len);
    vifi_t vifi = rt->mfc_parent;

    VIF_LOCK_ASSERT();

    if ((vifi >= V_numvifs) || in_nullhost(V_viftable[vifi].v_lcl_addr)) {
        m_freem(mb_copy);
        return EADDRNOTAVAIL;   /* The iif vif is invalid */
    }

    /*
     * Add a new mbuf with the encapsulating header
     */
    mb_first = m_gethdr(M_NOWAIT, MT_DATA);
    if (mb_first == NULL) {
        m_freem(mb_copy);
        return ENOBUFS;
    }
    mb_first->m_data += max_linkhdr;
    mb_first->m_len = sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr);
    mb_first->m_next = mb_copy;

    mb_first->m_pkthdr.len = len + mb_first->m_len;

    /*
     * Fill in the encapsulating IP and PIM header
     */
    ip_outer = mtod(mb_first, struct ip *);
    *ip_outer = pim_encap_iphdr;
    ip_outer->ip_len = htons(len + sizeof(pim_encap_iphdr) +
        sizeof(pim_encap_pimhdr));
    ip_outer->ip_src = V_viftable[vifi].v_lcl_addr;
    ip_outer->ip_dst = rt->mfc_rp;
    /*
     * Copy the inner header TOS to the outer header, and take care of the
     * IP_DF bit.
     */
    ip_outer->ip_tos = ip->ip_tos;
    if (ip->ip_off & htons(IP_DF))
        ip_outer->ip_off |= htons(IP_DF);
    ip_fillid(ip_outer);
    pimhdr = (struct pim_encap_pimhdr *)((caddr_t)ip_outer
        + sizeof(pim_encap_iphdr));
    *pimhdr = pim_encap_pimhdr;
    /* If the iif crosses a border, set the Border-bit */
    if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_BORDER_VIF & V_mrt_api_config)
        pimhdr->flags |= htonl(PIM_BORDER_REGISTER);
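
    /*
     * Compute the PIM checksum over the PIM header only: temporarily
     * advance m_data past the encapsulating IP header, then restore it.
     */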
    mb_first->m_data += sizeof(pim_encap_iphdr);
    pimhdr->pim.pim_cksum = in_cksum(mb_first, sizeof(pim_encap_pimhdr));
    mb_first->m_data -= sizeof(pim_encap_iphdr);

    send_packet(vifp, mb_first);

    /* Keep statistics */
    PIMSTAT_INC(pims_snd_registers_msgs);
    PIMSTAT_ADD(pims_snd_registers_bytes, len);

    return 0;
}

/*
 * pim_encapcheck() is called by the encap4_input() path at runtime to
 * determine whether a packet is for PIM; this allows PIM to be dynamically
 * loaded into the kernel.
 */
static int
pim_encapcheck(const struct mbuf *m __unused, int off __unused,
    int proto __unused, void *arg __unused)
{
    KASSERT(proto == IPPROTO_PIM, ("not for IPPROTO_PIM"));
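    /*
     * A positive return value claims the datagram for pim_input(); when
     * several encap handlers match, the one returning the larger value wins.
     */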
    return (8);                 /* claim the datagram. */
}

/*
 * PIM-SMv2 and PIM-DM message processing.
 * Receives and verifies the PIM control messages, and passes them
 * up to the listening socket, using rip_input().
 * The only message with special processing is the PIM_REGISTER message
 * (used by PIM-SM): the PIM header is stripped off, and the inner packet
 * is passed to if_simloop().
 */
static int
pim_input(struct mbuf *m, int off, int proto, void *arg __unused)
{
    struct ip *ip = mtod(m, struct ip *);
    struct pim *pim;
    int iphlen = off;
    int minlen;
    int datalen = ntohs(ip->ip_len) - iphlen;
    int ip_tos;

    /* Keep statistics */
    PIMSTAT_INC(pims_rcv_total_msgs);
    PIMSTAT_ADD(pims_rcv_total_bytes, datalen);

    /*
     * Validate lengths
     */
    if (datalen < PIM_MINLEN) {
        PIMSTAT_INC(pims_rcv_tooshort);
        CTR3(KTR_IPMF, "%s: short packet (%d) from 0x%08x",
            __func__, datalen, ntohl(ip->ip_src.s_addr));
        m_freem(m);
        return (IPPROTO_DONE);
    }

    /*
     * If the packet is at least as big as a REGISTER, go ahead
     * and grab the PIM REGISTER header size, to avoid another
     * possible m_pullup() later.
     *
     * PIM_MINLEN == pimhdr + u_int32_t == 4 + 4 = 8
     * PIM_REG_MINLEN == pimhdr + reghdr + encap_iphdr == 4 + 4 + 20 = 28
     */
    minlen = iphlen + (datalen >= PIM_REG_MINLEN ? PIM_REG_MINLEN : PIM_MINLEN);

    /*
     * Get the IP and PIM headers in contiguous memory, and
     * possibly the PIM REGISTER header.
     */
    if (m->m_len < minlen && (m = m_pullup(m, minlen)) == NULL) {
        CTR1(KTR_IPMF, "%s: m_pullup() failed", __func__);
        return (IPPROTO_DONE);
    }

    /* m_pullup() may have given us a new mbuf so reset ip. */
    ip = mtod(m, struct ip *);
    ip_tos = ip->ip_tos;

    /* adjust mbuf to point to the PIM header */
    m->m_data += iphlen;
    m->m_len -= iphlen;
    pim = mtod(m, struct pim *);

    /*
     * Validate checksum. If PIM REGISTER, exclude the data packet.
     *
     * XXX: some older PIMv2 implementations don't make this distinction,
     * so for compatibility reasons perform the checksum over part of the
     * message, and if error, then over the whole message.
     */
    if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER && in_cksum(m, PIM_MINLEN) == 0) {
        /* do nothing, checksum okay */
    } else if (in_cksum(m, datalen)) {
        PIMSTAT_INC(pims_rcv_badsum);
        CTR1(KTR_IPMF, "%s: invalid checksum", __func__);
        m_freem(m);
        return (IPPROTO_DONE);
    }

    /* PIM version check */
    if (PIM_VT_V(pim->pim_vt) < PIM_VERSION) {
        PIMSTAT_INC(pims_rcv_badversion);
        CTR3(KTR_IPMF, "%s: bad version %d expect %d", __func__,
            (int)PIM_VT_V(pim->pim_vt), PIM_VERSION);
        m_freem(m);
        return (IPPROTO_DONE);
    }

    /* restore mbuf back to the outer IP */
    m->m_data -= iphlen;
    m->m_len += iphlen;

    if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER) {
        /*
         * Since this is a REGISTER, we'll make a copy of the register
         * headers ip + pim + u_int32 + encap_ip, to be passed up to the
         * routing daemon.
         */
        struct sockaddr_in dst = { sizeof(dst), AF_INET };
        struct mbuf *mcp;
        struct ip *encap_ip;
        u_int32_t *reghdr;
        struct ifnet *vifp;

        VIF_LOCK();
        if ((V_reg_vif_num >= V_numvifs) || (V_reg_vif_num == VIFI_INVALID)) {
            VIF_UNLOCK();
            CTR2(KTR_IPMF, "%s: register vif not set: %d", __func__,
                (int)V_reg_vif_num);
            m_freem(m);
            return (IPPROTO_DONE);
        }
        /* XXX need refcnt? */
        vifp = V_viftable[V_reg_vif_num].v_ifp;
        VIF_UNLOCK();

        /*
         * Validate length
         */
        if (datalen < PIM_REG_MINLEN) {
            PIMSTAT_INC(pims_rcv_tooshort);
            PIMSTAT_INC(pims_rcv_badregisters);
            CTR1(KTR_IPMF, "%s: register packet size too small", __func__);
            m_freem(m);
            return (IPPROTO_DONE);
        }

        reghdr = (u_int32_t *)(pim + 1);
        encap_ip = (struct ip *)(reghdr + 1);

        CTR3(KTR_IPMF, "%s: register: encap ip src 0x%08x len %d",
            __func__, ntohl(encap_ip->ip_src.s_addr),
            ntohs(encap_ip->ip_len));

        /* verify the version number of the inner packet */
        if (encap_ip->ip_v != IPVERSION) {
            PIMSTAT_INC(pims_rcv_badregisters);
            CTR1(KTR_IPMF, "%s: bad encap ip version", __func__);
            m_freem(m);
            return (IPPROTO_DONE);
        }

        /* verify the inner packet is destined to a mcast group */
        if (!IN_MULTICAST(ntohl(encap_ip->ip_dst.s_addr))) {
            PIMSTAT_INC(pims_rcv_badregisters);
            CTR2(KTR_IPMF, "%s: bad encap ip dest 0x%08x", __func__,
                ntohl(encap_ip->ip_dst.s_addr));
            m_freem(m);
            return (IPPROTO_DONE);
        }

        /* If a NULL_REGISTER, pass it to the daemon */
        if ((ntohl(*reghdr) & PIM_NULL_REGISTER))
            goto pim_input_to_daemon;

        /*
         * Copy the TOS from the outer IP header to the inner IP header.
         */
        if (encap_ip->ip_tos != ip_tos) {
            /* Outer TOS -> inner TOS */
            encap_ip->ip_tos = ip_tos;
            /* Recompute the inner header checksum. Sigh... */
            /* adjust mbuf to point to the inner IP header */
            m->m_data += (iphlen + PIM_MINLEN);
            m->m_len -= (iphlen + PIM_MINLEN);
            encap_ip->ip_sum = 0;
            encap_ip->ip_sum = in_cksum(m, encap_ip->ip_hl << 2);
            /* restore mbuf to point back to the outer IP header */
            m->m_data -= (iphlen + PIM_MINLEN);
            m->m_len += (iphlen + PIM_MINLEN);
        }

        /*
         * Decapsulate the inner IP packet and loopback to forward it
         * as a normal multicast packet. Also, make a copy of the
         *     outer_iphdr + pimhdr + reghdr + encap_iphdr
         * to pass to the daemon later, so it can take the appropriate
         * actions (e.g., send back PIM_REGISTER_STOP).
         * XXX: here m->m_data points to the outer IP header.
         */
        mcp = m_copym(m, 0, iphlen + PIM_REG_MINLEN, M_NOWAIT);
        if (mcp == NULL) {
            CTR1(KTR_IPMF, "%s: m_copym() failed", __func__);
            m_freem(m);
            return (IPPROTO_DONE);
        }

        /* Keep statistics */
        /* XXX: registers_bytes include only the encap. mcast pkt */
        PIMSTAT_INC(pims_rcv_registers_msgs);
        PIMSTAT_ADD(pims_rcv_registers_bytes, ntohs(encap_ip->ip_len));

        /*
         * forward the inner ip packet; point m_data at the inner ip.
         */
        m_adj(m, iphlen + PIM_MINLEN);

        CTR4(KTR_IPMF,
            "%s: forward decap'd REGISTER: src %lx dst %lx vif %d",
            __func__,
            (u_long)ntohl(encap_ip->ip_src.s_addr),
            (u_long)ntohl(encap_ip->ip_dst.s_addr),
            (int)V_reg_vif_num);

        /* NB: vifp was collected above; can it change on us? */
        if_simloop(vifp, m, dst.sin_family, 0);

        /* prepare the register head to send to the mrouting daemon */
        m = mcp;
    }

pim_input_to_daemon:
    /*
     * Pass the PIM message up to the daemon; if it is a Register message,
     * pass the 'head' only up to the daemon. This includes the
     * outer IP header, PIM header, PIM-Register header and the
     * inner IP header.
     * XXX: the outer IP header pkt size of a Register is not adjusted to
     * reflect the fact that the inner multicast data is truncated.
     */
    return (rip_input(&m, &off, proto));
}
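
/*
 * Read-only sysctl handler that exports the IPv4 multicast forwarding
 * table (one struct mfc per entry) to userland.
 */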
static int
sysctl_mfctable(SYSCTL_HANDLER_ARGS)
{
    struct mfc *rt;
    int error, i;

    if (req->newptr)
        return (EPERM);
    if (V_mfchashtbl == NULL)   /* XXX unlocked */
        return (0);
    error = sysctl_wire_old_buffer(req, 0);
    if (error)
        return (error);

    MFC_LOCK();
    for (i = 0; i < mfchashsize; i++) {
        LIST_FOREACH(rt, &V_mfchashtbl[i], mfc_hash) {
            error = SYSCTL_OUT(req, rt, sizeof(struct mfc));
            if (error)
                goto out_locked;
        }
    }
out_locked:
    MFC_UNLOCK();
    return (error);
}

static SYSCTL_NODE(_net_inet_ip, OID_AUTO, mfctable, CTLFLAG_RD,
    sysctl_mfctable, "IPv4 Multicast Forwarding Table "
    "(struct *mfc[mfchashsize], netinet/ip_mroute.h)");
static void
vnet_mroute_init(const void *unused __unused)
{
    V_nexpire = malloc(mfchashsize, M_MRTABLE, M_WAITOK|M_ZERO);
    V_viftable = mallocarray(MAXVIFS, sizeof(*V_viftable),
        M_MRTABLE, M_WAITOK|M_ZERO);
    V_bw_meter_timers = mallocarray(BW_METER_BUCKETS,
        sizeof(*V_bw_meter_timers), M_MRTABLE, M_WAITOK|M_ZERO);
    V_bw_upcalls = mallocarray(BW_UPCALLS_MAX, sizeof(*V_bw_upcalls),
        M_MRTABLE, M_WAITOK|M_ZERO);

    callout_init(&V_expire_upcalls_ch, 1);
    callout_init(&V_bw_upcalls_ch, 1);
    callout_init(&V_bw_meter_ch, 1);
}

VNET_SYSINIT(vnet_mroute_init, SI_SUB_PROTO_MC, SI_ORDER_ANY, vnet_mroute_init,
    NULL);
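
/*
 * Release the per-vnet state allocated by vnet_mroute_init().
 */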
static void
vnet_mroute_uninit(const void *unused __unused)
{
    free(V_bw_upcalls, M_MRTABLE);
    free(V_bw_meter_timers, M_MRTABLE);
    free(V_viftable, M_MRTABLE);
    free(V_nexpire, M_MRTABLE);
    V_nexpire = NULL;
}

VNET_SYSUNINIT(vnet_mroute_uninit, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE,
    vnet_mroute_uninit, NULL);
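
/*
 * Module event handler: on load, register the PIM encapsulation and hook
 * the multicast forwarding entry points into the IP stack; on unload,
 * tear them down again.
 */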
static int
ip_mroute_modevent(module_t mod, int type, void *unused)
{
    switch (type) {
    case MOD_LOAD:
        MROUTER_LOCK_INIT();

        if_detach_event_tag = EVENTHANDLER_REGISTER(ifnet_departure_event,
            if_detached_event, NULL, EVENTHANDLER_PRI_ANY);
        if (if_detach_event_tag == NULL) {
            printf("ip_mroute: unable to register "
                "ifnet_departure_event handler\n");
            MROUTER_LOCK_DESTROY();
            return (EINVAL);
        }

        MFC_LOCK_INIT();
        VIF_LOCK_INIT();

        mfchashsize = MFCHASHSIZE;
        if (TUNABLE_ULONG_FETCH("net.inet.ip.mfchashsize", &mfchashsize) &&
            !powerof2(mfchashsize)) {
            printf("WARNING: %s not a power of 2; using default\n",
                "net.inet.ip.mfchashsize");
            mfchashsize = MFCHASHSIZE;
        }

        pim_squelch_wholepkt = 0;
        TUNABLE_ULONG_FETCH("net.inet.pim.squelch_wholepkt",
            &pim_squelch_wholepkt);

        pim_encap_cookie = ip_encap_attach(&ipv4_encap_cfg, NULL, M_WAITOK);
        if (pim_encap_cookie == NULL) {
            printf("ip_mroute: unable to attach pim encap\n");
            VIF_LOCK_DESTROY();
            MFC_LOCK_DESTROY();
            MROUTER_LOCK_DESTROY();
            return (EINVAL);
        }
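
        /* Hook the multicast forwarding entry points into the IP stack. */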
        ip_mcast_src = X_ip_mcast_src;
        ip_mforward = X_ip_mforward;
        ip_mrouter_done = X_ip_mrouter_done;
        ip_mrouter_get = X_ip_mrouter_get;
        ip_mrouter_set = X_ip_mrouter_set;
        ip_rsvp_force_done = X_ip_rsvp_force_done;
        ip_rsvp_vif = X_ip_rsvp_vif;
        legal_vif_num = X_legal_vif_num;
        mrt_ioctl = X_mrt_ioctl;
        rsvp_input_p = X_rsvp_input;
        break;

    case MOD_UNLOAD:
        /*
         * Typically module unload happens after the user-level
         * process has shut down the kernel services (the check
         * below ensures someone can't just yank the module out
         * from under a running process). But if the module is
         * just loaded and then unloaded w/o starting up a user
         * process we still need to clean up.
         */
        MROUTER_LOCK();
        if (ip_mrouter_cnt != 0) {
            MROUTER_UNLOCK();
            return (EINVAL);
        }
        ip_mrouter_unloading = 1;
        MROUTER_UNLOCK();

        EVENTHANDLER_DEREGISTER(ifnet_departure_event, if_detach_event_tag);

        if (pim_encap_cookie) {
            ip_encap_detach(pim_encap_cookie);
            pim_encap_cookie = NULL;
        }

        ip_mcast_src = NULL;
        ip_mforward = NULL;
        ip_mrouter_done = NULL;
        ip_mrouter_get = NULL;
        ip_mrouter_set = NULL;
        ip_rsvp_force_done = NULL;
        ip_rsvp_vif = NULL;
        legal_vif_num = NULL;
        mrt_ioctl = NULL;
        rsvp_input_p = NULL;

        VIF_LOCK_DESTROY();
        MFC_LOCK_DESTROY();
        MROUTER_LOCK_DESTROY();
        break;

    default:
        return EOPNOTSUPP;
    }

    return 0;
}

static moduledata_t ip_mroutemod = {
    "ip_mroute",
    ip_mroute_modevent,
    0
};

DECLARE_MODULE(ip_mroute, ip_mroutemod, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE);