--- /u/marko/p4/head/src/sys/amd64/amd64/dump_machdep.c 2008-02-27 18:27:04.000000000 +0100 +++ src/sys/amd64/amd64/dump_machdep.c 2008-02-27 11:38:49.000000000 +0100 @@ -27,6 +27,8 @@ #include __FBSDID("$FreeBSD: src/sys/amd64/amd64/dump_machdep.c,v 1.14 2008/02/15 06:26:25 scottl Exp $"); +#include "opt_vimage.h" + #include #include #include @@ -34,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -109,6 +112,7 @@ mkdumpheader(struct kerneldumpheader *kdh, uint32_t archver, uint64_t dumplen, uint32_t blksz) { + INIT_VPROCG(TD_TO_VPROCG(curthread)); /* XXX */ bzero(kdh, sizeof(*kdh)); strncpy(kdh->magic, KERNELDUMPMAGIC, sizeof(kdh->magic)); @@ -118,7 +122,7 @@ kdh->dumplength = htod64(dumplen); kdh->dumptime = htod64(time_second); kdh->blocksize = htod32(blksz); - strncpy(kdh->hostname, hostname, sizeof(kdh->hostname)); + strncpy(kdh->hostname, V_hostname, sizeof(kdh->hostname)); strncpy(kdh->versionstring, version, sizeof(kdh->versionstring)); if (panicstr != NULL) strncpy(kdh->panicstring, panicstr, sizeof(kdh->panicstring)); --- /u/marko/p4/head/src/sys/amd64/amd64/minidump_machdep.c 2008-02-27 18:27:04.000000000 +0100 +++ src/sys/amd64/amd64/minidump_machdep.c 2008-02-27 11:38:52.000000000 +0100 @@ -27,6 +27,8 @@ #include __FBSDID("$FreeBSD: src/sys/amd64/amd64/minidump_machdep.c,v 1.4 2008/02/15 06:26:25 scottl Exp $"); +#include "opt_vimage.h" + #include #include #include @@ -34,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -85,6 +88,7 @@ mkdumpheader(struct kerneldumpheader *kdh, uint32_t archver, uint64_t dumplen, uint32_t blksz) { + INIT_VPROCG(TD_TO_VPROCG(curthread)); /* XXX */ bzero(kdh, sizeof(*kdh)); strncpy(kdh->magic, KERNELDUMPMAGIC, sizeof(kdh->magic)); @@ -94,7 +98,7 @@ kdh->dumplength = htod64(dumplen); kdh->dumptime = htod64(time_second); kdh->blocksize = htod32(blksz); - strncpy(kdh->hostname, hostname, sizeof(kdh->hostname)); + strncpy(kdh->hostname, V_hostname, 
sizeof(kdh->hostname)); strncpy(kdh->versionstring, version, sizeof(kdh->versionstring)); if (panicstr != NULL) strncpy(kdh->panicstring, panicstr, sizeof(kdh->panicstring)); --- /u/marko/p4/head/src/sys/compat/linprocfs/linprocfs.c 2008-01-15 17:45:36.000000000 +0100 +++ src/sys/compat/linprocfs/linprocfs.c 2008-02-27 11:39:51.000000000 +0100 @@ -42,6 +42,9 @@ #include __FBSDID("$FreeBSD: src/sys/compat/linprocfs/linprocfs.c,v 1.118 2008/01/10 01:10:41 attilio Exp $"); +#include "opt_compat.h" +#include "opt_vimage.h" + #include #include #include @@ -70,7 +73,9 @@ #include #include #include +#include +#include #include #include @@ -87,7 +92,6 @@ #include #endif /* __i386__ || __amd64__ */ -#include "opt_compat.h" #ifdef COMPAT_LINUX32 /* XXX */ #include #else @@ -507,15 +511,16 @@ static int linprocfs_doloadavg(PFS_FILL_ARGS) { + INIT_VPROCG(TD_TO_VPROCG(curthread)); sbuf_printf(sb, "%d.%02d %d.%02d %d.%02d %d/%d %d\n", - (int)(averunnable.ldavg[0] / averunnable.fscale), - (int)(averunnable.ldavg[0] * 100 / averunnable.fscale % 100), - (int)(averunnable.ldavg[1] / averunnable.fscale), - (int)(averunnable.ldavg[1] * 100 / averunnable.fscale % 100), - (int)(averunnable.ldavg[2] / averunnable.fscale), - (int)(averunnable.ldavg[2] * 100 / averunnable.fscale % 100), + (int)(V_averunnable.ldavg[0] / V_averunnable.fscale), + (int)(V_averunnable.ldavg[0] * 100 / V_averunnable.fscale % 100), + (int)(V_averunnable.ldavg[1] / V_averunnable.fscale), + (int)(V_averunnable.ldavg[1] * 100 / V_averunnable.fscale % 100), + (int)(V_averunnable.ldavg[2] / V_averunnable.fscale), + (int)(V_averunnable.ldavg[2] * 100 / V_averunnable.fscale % 100), 1, /* number of running tasks */ nprocs, /* number of tasks */ lastpid /* the last pid */ @@ -998,6 +1003,7 @@ static int linprocfs_donetdev(PFS_FILL_ARGS) { + INIT_VNET_NET(TD_TO_VNET(curthread)); char ifname[16]; /* XXX LINUX_IFNAMSIZ */ struct ifnet *ifp; @@ -1007,7 +1013,7 @@ "bytes packets errs drop fifo frame compressed"); 
IFNET_RLOCK(); - TAILQ_FOREACH(ifp, &ifnet, if_link) { + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { linux_ifname(ifp, ifname, sizeof ifname); sbuf_printf(sb, "%6.6s:", ifname); sbuf_printf(sb, "%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu ", --- /u/marko/p4/head/src/sys/compat/linux/linux_ioctl.c 2007-11-13 02:48:15.000000000 +0100 +++ src/sys/compat/linux/linux_ioctl.c 2007-12-10 11:25:48.000000000 +0100 @@ -29,6 +29,9 @@ #include __FBSDID("$FreeBSD: src/sys/compat/linux/linux_ioctl.c,v 1.139 2007/11/07 16:42:52 kib Exp $"); +#include "opt_vimage.h" +#include "opt_compat.h" + #include #include #include @@ -56,12 +59,13 @@ #include #include #include +#include + +#include #include #include #include -#include "opt_compat.h" - #ifdef COMPAT_LINUX32 #include #include @@ -2037,6 +2041,7 @@ int linux_ifname(struct ifnet *ifp, char *buffer, size_t buflen) { + INIT_VNET_NET(ifp->if_vnet); struct ifnet *ifscan; int ethno; @@ -2047,7 +2052,7 @@ /* Determine the (relative) unit number for ethernet interfaces */ ethno = 0; IFNET_RLOCK(); - TAILQ_FOREACH(ifscan, &ifnet, if_link) { + TAILQ_FOREACH(ifscan, &V_ifnet, if_link) { if (ifscan == ifp) { IFNET_RUNLOCK(); return (snprintf(buffer, buflen, "eth%d", ethno)); @@ -2070,6 +2075,7 @@ static struct ifnet * ifname_linux_to_bsd(const char *lxname, char *bsdname) { + INIT_VNET_NET(TD_TO_VNET(curthread)); struct ifnet *ifp; int len, unit; char *ep; @@ -2086,7 +2092,7 @@ index = 0; is_eth = (len == 3 && !strncmp(lxname, "eth", len)) ? 1 : 0; IFNET_RLOCK(); - TAILQ_FOREACH(ifp, &ifnet, if_link) { + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { /* * Allow Linux programs to use FreeBSD names. 
Don't presume * we never have an interface named "eth", so don't make @@ -2110,6 +2116,7 @@ static int linux_ifconf(struct thread *td, struct ifconf *uifc) { + INIT_VNET_NET(TD_TO_VNET(td)); #ifdef COMPAT_LINUX32 struct l_ifconf ifc; #else @@ -2130,7 +2137,7 @@ /* handle the 'request buffer size' case */ if (ifc.ifc_buf == PTROUT(NULL)) { ifc.ifc_len = 0; - TAILQ_FOREACH(ifp, &ifnet, if_link) { + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa = ifa->ifa_addr; if (sa->sa_family == AF_INET) @@ -2157,7 +2164,7 @@ /* Return all AF_INET addresses of all interfaces */ IFNET_RLOCK(); /* could sleep XXX */ - TAILQ_FOREACH(ifp, &ifnet, if_link) { + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { int addrs = 0; bzero(&ifr, sizeof(ifr)); --- /u/marko/p4/head/src/sys/compat/linux/linux_misc.c 2008-02-27 18:27:10.000000000 +0100 +++ src/sys/compat/linux/linux_misc.c 2008-02-27 11:39:56.000000000 +0100 @@ -32,6 +32,7 @@ #include "opt_compat.h" #include "opt_mac.h" +#include "opt_vimage.h" #include #include @@ -63,6 +64,7 @@ #include #include #include +#include #include @@ -123,6 +125,7 @@ int linux_sysinfo(struct thread *td, struct linux_sysinfo_args *args) { + INIT_VPROCG(TD_TO_VPROCG(td)); struct l_sysinfo sysinfo; vm_object_t object; int i, j; @@ -135,8 +138,8 @@ /* Use the information from the mib to get our load averages */ for (i = 0; i < 3; i++) - sysinfo.loads[i] = averunnable.ldavg[i] * - LINUX_SYSINFO_LOADS_SCALE / averunnable.fscale; + sysinfo.loads[i] = V_averunnable.ldavg[i] * + LINUX_SYSINFO_LOADS_SCALE / V_averunnable.fscale; sysinfo.totalram = physmem * PAGE_SIZE; sysinfo.freeram = sysinfo.totalram - cnt.v_wire_count * PAGE_SIZE; @@ -709,6 +712,7 @@ int linux_newuname(struct thread *td, struct linux_newuname_args *args) { + INIT_VPROCG(TD_TO_VPROCG(td)); struct l_new_utsname utsname; char osname[LINUX_MAX_UTSNAME]; char osrelease[LINUX_MAX_UTSNAME]; @@ -760,7 +764,7 @@ #else /* something other than i386 or 
amd64 - assume we and Linux agree */ strlcpy(utsname.machine, machine, LINUX_MAX_UTSNAME); #endif /* __i386__ */ - strlcpy(utsname.domainname, domainname, LINUX_MAX_UTSNAME); + strlcpy(utsname.domainname, V_domainname, LINUX_MAX_UTSNAME); return (copyout(&utsname, args->buf, sizeof(utsname))); } --- /u/marko/p4/head/src/sys/conf/files 2008-02-27 18:27:14.000000000 +0100 +++ src/sys/conf/files 2008-02-27 11:40:17.000000000 +0100 @@ -1475,6 +1475,7 @@ kern/kern_timeout.c standard kern/kern_umtx.c standard kern/kern_uuid.c standard +kern/kern_vimage.c optional vimage kern/kern_xxx.c standard kern/link_elf.c standard kern/linker_if.m standard @@ -1838,6 +1839,7 @@ netgraph/ng_nat.c optional netgraph_nat netgraph/ng_one2many.c optional netgraph_one2many netgraph/ng_parse.c optional netgraph +netgraph/ng_pipe.c optional netgraph_pipe netgraph/ng_ppp.c optional netgraph_ppp netgraph/ng_pppoe.c optional netgraph_pppoe netgraph/ng_pptpgre.c optional netgraph_pptpgre @@ -1851,6 +1853,7 @@ netgraph/ng_tee.c optional netgraph_tee netgraph/ng_tty.c optional netgraph_tty netgraph/ng_vjc.c optional netgraph_vjc +netgraph/ng_wormhole.c optional netgraph_wormhole vimage netinet/accf_data.c optional accept_filter_data netinet/accf_http.c optional accept_filter_http netinet/if_atm.c optional atm --- /u/marko/p4/head/src/sys/conf/options 2008-02-27 18:27:14.000000000 +0100 +++ src/sys/conf/options 2008-02-27 11:40:25.000000000 +0100 @@ -457,6 +457,7 @@ NETGRAPH_NAT opt_netgraph.h NETGRAPH_NETFLOW opt_netgraph.h NETGRAPH_ONE2MANY opt_netgraph.h +NETGRAPH_PIPE opt_netgraph.h NETGRAPH_PPP opt_netgraph.h NETGRAPH_PPPOE opt_netgraph.h NETGRAPH_PPTPGRE opt_netgraph.h @@ -471,6 +472,7 @@ NETGRAPH_TTY opt_netgraph.h NETGRAPH_UI opt_netgraph.h NETGRAPH_VJC opt_netgraph.h +NETGRAPH_WORMHOLE opt_netgraph.h # NgATM options NGATM_ATM opt_netgraph.h @@ -760,3 +762,6 @@ #Disable code to dispatch tcp offloading TCP_OFFLOAD_DISABLE opt_inet.h + +# Virtualize the network stack +VIMAGE opt_vimage.h --- 
/u/marko/p4/head/src/sys/contrib/pf/net/pfvar.h 2007-08-31 03:45:08.000000000 +0200 +++ src/sys/contrib/pf/net/pfvar.h 2007-10-05 12:23:37.000000000 +0200 @@ -1848,5 +1848,22 @@ struct pf_os_fingerprint * pf_osfp_validate(void); +/* + * Stack virtualization support. + */ +#ifdef VIMAGE +struct vnet_pf { + struct vnet *parent_vnet; + +}; +#endif + +/* + * Symbol translation macros + */ +#define INIT_VNET_PF(vnet) \ + INIT_FROM_VNET(vnet, VNET_MOD_PF, struct vnet_pf, vnet_pf) + +#define VNET_PF(sym) VSYM(vnet_pf, sym) #endif /* _NET_PFVAR_H_ */ --- /u/marko/p4/head/src/sys/ddb/db_command.c 2007-12-27 19:27:46.000000000 +0100 +++ src/sys/ddb/db_command.c 2008-01-14 19:22:48.000000000 +0100 @@ -268,24 +268,40 @@ return (result); } +/* + * Print out a sorted command table. + */ static void db_cmd_list(table) struct command_table *table; { - register struct command *cmd; - register struct command **aux_cmdp; + struct command *cmd; + struct command **aux_cmdp; + char *last; + char *next = ""; - for (cmd = table->table; cmd->name != 0; cmd++) { - db_printf("%-12s", cmd->name); - db_end_line(12); - } - if (table->aux_tablep == NULL) - return; - for (aux_cmdp = table->aux_tablep; aux_cmdp < table->aux_tablep_end; - aux_cmdp++) { - db_printf("%-12s", (*aux_cmdp)->name); - db_end_line(12); - } + do { + last = next; + for (cmd = table->table; cmd->name != 0; cmd++) { + if (strcmp(cmd->name, last) > 0 && + (last == next || strcmp(cmd->name, next) < 0)) + next = cmd->name; + } + if (table->aux_tablep != NULL) { + for (aux_cmdp = table->aux_tablep; + aux_cmdp < table->aux_tablep_end; aux_cmdp++) { + cmd = *aux_cmdp; + if (strcmp(cmd->name, last) > 0 && + (last == next || + strcmp(cmd->name, next) < 0)) + next = cmd->name; + } + } + if (next != last) { + db_printf("%-12s", next); + db_end_line(12); + } + } while (next != last); } static void --- /u/marko/p4/head/src/sys/ddb/db_textdump.c 2008-02-03 08:15:52.000000000 +0100 +++ src/sys/ddb/db_textdump.c 2008-02-27 11:40:43.000000000
+0100 @@ -60,6 +60,7 @@ __FBSDID("$FreeBSD: src/sys/ddb/db_textdump.c,v 1.3 2008/01/31 16:22:14 rwatson Exp $"); #include "opt_config.h" +#include "opt_vimage.h" #include #include @@ -68,6 +69,7 @@ #include #include #include +#include #include #include @@ -183,6 +185,7 @@ mkdumpheader(struct kerneldumpheader *kdh, uint32_t archver, uint64_t dumplen, uint32_t blksz) { + INIT_VPROCG(TD_TO_VPROCG(&thread0)); bzero(kdh, sizeof(*kdh)); strncpy(kdh->magic, TEXTDUMPMAGIC, sizeof(kdh->magic)); @@ -192,7 +195,7 @@ kdh->dumplength = htod64(dumplen); kdh->dumptime = htod64(time_second); kdh->blocksize = htod32(blksz); - strncpy(kdh->hostname, hostname, sizeof(kdh->hostname)); + strncpy(kdh->hostname, V_hostname, sizeof(kdh->hostname)); strncpy(kdh->versionstring, version, sizeof(kdh->versionstring)); if (panicstr != NULL) strncpy(kdh->panicstring, panicstr, sizeof(kdh->panicstring)); --- /u/marko/p4/head/src/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c 2008-02-27 18:27:46.000000000 +0100 +++ src/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c 2008-02-27 11:41:57.000000000 +0100 @@ -136,10 +136,6 @@ #define VALIDATE_SOCK(so) #define DEBUG_WR 0 -extern int tcp_do_autorcvbuf; -extern int tcp_do_autosndbuf; -extern int tcp_autorcvbuf_max; -extern int tcp_autosndbuf_max; static void t3_send_reset(struct toepcb *toep); static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status); --- /u/marko/p4/head/src/sys/dev/firewire/firewire.c 2007-10-21 13:56:14.000000000 +0200 +++ src/sys/dev/firewire/firewire.c 2007-12-10 11:25:55.000000000 +0100 @@ -35,6 +35,8 @@ * */ +#include "opt_vimage.h" + #include #include #include @@ -45,6 +47,7 @@ #include #include #include +#include #include @@ -674,6 +677,7 @@ static void fw_reset_crom(struct firewire_comm *fc) { + INIT_VPROCG(TD_TO_VPROCG(curthread)); /* XXX */ struct crom_src_buf *buf; struct crom_src *src; struct crom_chunk *root; @@ -699,7 +703,7 @@ crom_add_simple_text(src, root, &buf->vendor, "FreeBSD Project"); crom_add_entry(root, CSRKEY_HW, 
__FreeBSD_version); #endif - crom_add_simple_text(src, root, &buf->hw, hostname); + crom_add_simple_text(src, root, &buf->hw, V_hostname); } /* --- /u/marko/p4/head/src/sys/dev/iwi/if_iwi.c 2007-12-27 19:30:53.000000000 +0100 +++ src/sys/dev/iwi/if_iwi.c 2008-01-14 19:23:18.000000000 +0100 @@ -35,6 +35,8 @@ * http://www.intel.com/network/connectivity/products/wireless/prowireless_mobile.htm */ +#include "opt_vimage.h" + #include #include #include @@ -55,6 +57,7 @@ #include #include #include +#include #include #include @@ -189,6 +192,9 @@ static void iwi_sysctlattach(struct iwi_softc *); static void iwi_led_event(struct iwi_softc *, int); static void iwi_ledattach(struct iwi_softc *); +#ifdef VIMAGE +static void iwi_reassign(struct ifnet *, struct vnet *, char *); +#endif static int iwi_probe(device_t); static int iwi_attach(device_t); @@ -407,6 +413,9 @@ ieee80211_ifattach(ic); ic->ic_bmissthreshold = 10; /* override default */ /* override default methods */ +#ifdef VIMAGE + ifp->if_reassign = iwi_reassign; +#endif ic->ic_node_alloc = iwi_node_alloc; sc->sc_node_free = ic->ic_node_free; ic->ic_node_free = iwi_node_free; @@ -505,6 +514,28 @@ return 0; } +#ifdef VIMAGE +static void +iwi_reassign(struct ifnet *ifp, struct vnet *vnet, char *dname) +{ + struct iwi_softc *sc = ifp->if_softc; + struct ieee80211com *ic = &sc->sc_ic; + IWI_LOCK_DECL; + + IWI_LOCK(sc); + bpfdetach(ifp); + sc->sc_drvbpf = NULL; + ieee80211_reassign(ic, vnet, dname); + + CURVNET_SET_QUIET(vnet); + bpfattach2(ifp, DLT_IEEE802_11_RADIO, + sizeof (struct ieee80211_frame) + sizeof (sc->sc_txtap), + &sc->sc_drvbpf); + CURVNET_RESTORE(); + IWI_UNLOCK(sc); +} +#endif + static void iwi_dma_map_addr(void *arg, bus_dma_segment_t *segs, int nseg, int error) { --- /u/marko/p4/head/src/sys/fs/cd9660/cd9660_rrip.c 2007-08-31 03:46:55.000000000 +0200 +++ src/sys/fs/cd9660/cd9660_rrip.c 2007-10-22 18:06:27.000000000 +0200 @@ -34,6 +34,8 @@ * @(#)cd9660_rrip.c 8.6 (Berkeley) 12/5/94 */ +#include "opt_vimage.h" 
+ #include __FBSDID("$FreeBSD: src/sys/fs/cd9660/cd9660_rrip.c,v 1.30 2007/02/11 13:54:25 rodrigc Exp $"); @@ -44,6 +46,7 @@ #include #include #include +#include #include #include @@ -113,6 +116,7 @@ ISO_RRIP_SLINK *p; ISO_RRIP_ANALYZE *ana; { + INIT_VPROCG(TD_TO_VPROCG(curthread)); ISO_RRIP_SLINK_COMPONENT *pcomp; ISO_RRIP_SLINK_COMPONENT *pcompe; int len, wlen, cont; @@ -171,8 +175,8 @@ case ISO_SUSP_CFLAG_HOST: /* Inserting hostname i.e. "kurt.tools.de" */ - inbuf = hostname; - wlen = strlen(hostname); + inbuf = V_hostname; + wlen = strlen(V_hostname); break; case ISO_SUSP_CFLAG_CONTINUE: @@ -222,6 +226,7 @@ ISO_RRIP_ALTNAME *p; ISO_RRIP_ANALYZE *ana; { + INIT_VPROCG(TD_TO_VPROCG(curthread)); char *inbuf; int wlen; int cont; @@ -243,8 +248,8 @@ case ISO_SUSP_CFLAG_HOST: /* Inserting hostname i.e. "kurt.tools.de" */ - inbuf = hostname; - wlen = strlen(hostname); + inbuf = V_hostname; + wlen = strlen(V_hostname); break; case ISO_SUSP_CFLAG_CONTINUE: --- /u/marko/p4/head/src/sys/i386/conf/.cvsignore 2007-08-31 03:47:17.000000000 +0200 +++ src/sys/i386/conf/.cvsignore 2007-10-05 12:26:12.000000000 +0200 @@ -1 +0,0 @@ -[A-Za-z0-9]* --- /u/marko/p4/head/src/sys/i386/conf/NOTES 2008-02-03 08:16:00.000000000 +0100 +++ src/sys/i386/conf/NOTES 2008-02-27 11:46:14.000000000 +0100 @@ -248,8 +248,14 @@ # # Not all device drivers support this mode of operation at the time of # this writing. See polling(4) for more details. +# +# VIMAGE adds support for maintaining multiple independent network stack +# state instances in the kernel. This feature is still in early +# experimental phase, and needs more thought, testing, and documentation. +# options DEVICE_POLLING +options VIMAGE ##################################################################### --- /dev/null 2008-02-27 21:11:00.000000000 +0100 +++ src/sys/i386/conf/VIMAGE 2007-10-05 12:26:12.000000000 +0200 @@ -0,0 +1,16 @@ +# +# VIMAGE - sample kernel configuration file with a virtualized network stack +# configure. 
+# +# $FreeBSD$ +# +include GENERIC +ident VIMAGE + +options VIMAGE + +# +# Some kernel subsystems and functions don't yet compile with VIMAGE. Remove +# from the configuration for now. +# +nooptions SCTP --- /dev/null 2008-02-27 21:11:00.000000000 +0100 +++ src/sys/i386/conf/TPX32V 2007-12-01 12:34:54.000000000 +0100 @@ -0,0 +1,22 @@ +include TPX32C +ident TPX32V + +options VIMAGE + +options NETGRAPH +options NETGRAPH_PIPE +options NETGRAPH_ETHER +options NETGRAPH_EIFACE +options NETGRAPH_IFACE +options NETGRAPH_BRIDGE +options NETGRAPH_SOCKET +options NETGRAPH_KSOCKET + +device wlan # 802.11 support +device wlan_wep # 802.11 WEP support +device wlan_ccmp # 802.11 CCMP support +device wlan_tkip # 802.11 TKIP support +device wlan_amrr # AMRR transmit rate control algorithm +device wlan_scan_ap # 802.11 AP mode scanning +device wlan_scan_sta # 802.11 STA mode scanning +device iwi --- /dev/null 2008-02-27 21:11:00.000000000 +0100 +++ src/sys/i386/conf/TPX32V_NODEBUG 2007-10-05 12:30:14.000000000 +0200 @@ -0,0 +1,4 @@ +include TPX32C_NODEBUG +ident TPX32V_NODEBUG + +options VIMAGE --- /dev/null 2008-02-27 21:11:00.000000000 +0100 +++ src/sys/i386/conf/TPX32C_NODEBUG 2008-01-04 14:27:24.000000000 +0100 @@ -0,0 +1,117 @@ +cpu I686_CPU +ident TPX32C + +# To statically compile in device wiring instead of /boot/device.hints +#hints "GENERIC.hints" # Default places to look for devices. 
+ +makeoptions DEBUG=-g # Build kernel with gdb(1) debug symbols + +options SCHED_4BSD # 4BSD scheduler +options PREEMPTION # Enable kernel thread preemption +options INET # InterNETworking +options INET6 # IPv6 communications protocols +options SCTP # Stream Transmission Control Protocol +options FFS # Berkeley Fast Filesystem +options SOFTUPDATES # Enable FFS soft updates support +options UFS_ACL # Support for access control lists +options UFS_DIRHASH # Improve performance on big directories +options UFS_GJOURNAL # Enable gjournal-based UFS journaling +options MSDOSFS # MSDOS Filesystem +options CD9660 # ISO 9660 Filesystem +options PROCFS # Process filesystem (requires PSEUDOFS) +options PSEUDOFS # Pseudo-filesystem framework +options GEOM_PART_GPT # GUID Partition Tables. +options GEOM_LABEL # Provides labelization +options COMPAT_43TTY # BSD 4.3 TTY compat [KEEP THIS!] +options COMPAT_FREEBSD4 # Compatible with FreeBSD4 +options COMPAT_FREEBSD5 # Compatible with FreeBSD5 +options COMPAT_FREEBSD6 # Compatible with FreeBSD6 +options KTRACE # ktrace(1) support +options SYSVSHM # SYSV-style shared memory +options SYSVMSG # SYSV-style message queues +options SYSVSEM # SYSV-style semaphores +options _KPOSIX_PRIORITY_SCHEDULING # POSIX P1003_1B real-time extensions +options KBD_INSTALL_CDEV # install a CDEV entry in /dev +options STOP_NMI # Stop CPUS using NMI instead of IPI +options AUDIT # Security event auditing + +# Debugging for use in -current +#options KDB # Enable kernel debugger support. +#options DDB # Support DDB. +#options INVARIANTS # Enable calls of extra sanity checking +#options INVARIANT_SUPPORT # Extra sanity checks of internal structures, required by INVARIANTS +#options WITNESS # Enable checks to detect deadlocks and cycles +#options WITNESS_SKIPSPIN # Don't run witness on spinlocks for speed + +# Bus support. 
+device pci + +# ATA and ATAPI devices +device ata +device atadisk # ATA disk drives +device atapicd # ATAPI CDROM drives +options ATA_STATIC_ID # Static device numbering + +# SCSI peripherals +device scbus # SCSI bus (required for SCSI) +device da # Direct Access (disks) +device pass # Passthrough device (direct SCSI access) + +# atkbdc0 controls both the keyboard and the PS/2 mouse +device atkbdc # AT keyboard controller +device atkbd # AT keyboard +device psm # PS/2 mouse + +device kbdmux # keyboard multiplexer + +device vga # VGA video card driver + +# syscons is the default console driver, resembling an SCO console +device sc + +device agp # support several AGP chipsets + +# Power management support (see NOTES for more options) +device apm +# Add suspend/resume support for the i8254. +device pmtimer + +# PCCARD (PCMCIA) support +# PCMCIA and cardbus bridge support +device cbb # cardbus (yenta) bridge +device pccard # PC Card (16-bit) bus +device cardbus # CardBus (32-bit) bus + +# Serial (COM) ports +device sio # 8250, 16[45]50 based serial ports +device uart # Generic UART driver + +# Parallel port +device ppc +device ppbus # Parallel port bus (required) +device lpt # Printer +device ppi # Parallel port interface device + +# Pseudo devices. +device loop # Network loopback +device random # Entropy device +device ether # Ethernet support +device tun # Packet tunnel. +device pty # Pseudo-ttys (telnet etc) +device md # Memory "disks" +device firmware # firmware assist module + +# The `bpf' device enables the Berkeley Packet Filter. +# Be aware of the administrative consequences of enabling this! +# Note that 'bpf' is required for DHCP. 
+device bpf # Berkeley packet filter + +options HZ=200 + +nooptions SCTP + +options IPSEC +device enc +device crypto + +options ALT_BREAK_TO_DEBUGGER --- /dev/null 2008-02-27 21:11:00.000000000 +0100 +++ src/sys/i386/conf/TPX32C 2007-12-10 10:26:48.000000000 +0100 @@ -0,0 +1,118 @@ +cpu I686_CPU +ident TPX32C + +# To statically compile in device wiring instead of /boot/device.hints +#hints "GENERIC.hints" # Default places to look for devices. + +makeoptions DEBUG=-g # Build kernel with gdb(1) debug symbols + +options SCHED_ULE +options PREEMPTION # Enable kernel thread preemption +options INET # InterNETworking +options INET6 # IPv6 communications protocols +options SCTP # Stream Transmission Control Protocol +options FFS # Berkeley Fast Filesystem +options SOFTUPDATES # Enable FFS soft updates support +options UFS_ACL # Support for access control lists +options UFS_DIRHASH # Improve performance on big directories +options UFS_GJOURNAL # Enable gjournal-based UFS journaling +options MSDOSFS # MSDOS Filesystem +options CD9660 # ISO 9660 Filesystem +options PROCFS # Process filesystem (requires PSEUDOFS) +options PSEUDOFS # Pseudo-filesystem framework +options NFSCLIENT # Network Filesystem Client +options NFS_ROOT # NFS usable as /, requires NFSCLIENT +options MD_ROOT # MD is a potential root device +options GEOM_PART_GPT # GUID Partition Tables. +options GEOM_LABEL # Provides labelization +options COMPAT_43TTY # BSD 4.3 TTY compat [KEEP THIS!] 
+options COMPAT_FREEBSD4 # Compatible with FreeBSD4 +options COMPAT_FREEBSD5 # Compatible with FreeBSD5 +options COMPAT_FREEBSD6 # Compatible with FreeBSD6 +options KTRACE # ktrace(1) support +options SYSVSHM # SYSV-style shared memory +options SYSVMSG # SYSV-style message queues +options SYSVSEM # SYSV-style semaphores +options _KPOSIX_PRIORITY_SCHEDULING # POSIX P1003_1B real-time extensions +options KBD_INSTALL_CDEV # install a CDEV entry in /dev +options STOP_NMI # Stop CPUS using NMI instead of IPI +options AUDIT # Security event auditing + +# Debugging for use in -current +options KDB # Enable kernel debugger support. +options DDB # Support DDB. +options INVARIANTS # Enable calls of extra sanity checking +options INVARIANT_SUPPORT # Extra sanity checks of internal structures, required by INVARIANTS +options WITNESS # Enable checks to detect deadlocks and cycles +options WITNESS_SKIPSPIN # Don't run witness on spinlocks for speed + +# Bus support. +device pci + +# ATA and ATAPI devices +device ata +device atadisk # ATA disk drives +device atapicd # ATAPI CDROM drives +options ATA_STATIC_ID # Static device numbering + +# SCSI peripherals +device scbus # SCSI bus (required for SCSI) +device da # Direct Access (disks) +device pass # Passthrough device (direct SCSI access) + +# atkbdc0 controls both the keyboard and the PS/2 mouse +device atkbdc # AT keyboard controller +device atkbd # AT keyboard +device psm # PS/2 mouse + +device kbdmux # keyboard multiplexer + +device vga # VGA video card driver + +# syscons is the default console driver, resembling an SCO console +device sc + +device agp # support several AGP chipsets + +# Power management support (see NOTES for more options) +device apm +# Add suspend/resume support for the i8254. 
+device pmtimer + +# PCCARD (PCMCIA) support +# PCMCIA and cardbus bridge support +device cbb # cardbus (yenta) bridge +device pccard # PC Card (16-bit) bus +device cardbus # CardBus (32-bit) bus + +# Serial (COM) ports +device sio # 8250, 16[45]50 based serial ports +device uart # Generic UART driver + +# Parallel port +device ppc +device ppbus # Parallel port bus (required) +device lpt # Printer +device ppi # Parallel port interface device + +# Pseudo devices. +device loop # Network loopback +device random # Entropy device +device ether # Ethernet support +device tun # Packet tunnel. +device pty # Pseudo-ttys (telnet etc) +device md # Memory "disks" +device firmware # firmware assist module + +# The `bpf' device enables the Berkeley Packet Filter. +# Be aware of the administrative consequences of enabling this! +# Note that 'bpf' is required for DHCP. +device bpf # Berkeley packet filter + +options CONSPEED=115200 # Speed for serial console +options HZ=200 +options MAC + +nooptions SCTP +options IPSEC +device crypto --- /u/marko/p4/head/src/sys/i386/i386/dump_machdep.c 2008-02-27 18:28:47.000000000 +0100 +++ src/sys/i386/i386/dump_machdep.c 2008-02-27 11:46:17.000000000 +0100 @@ -24,6 +24,8 @@ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +#include "opt_vimage.h" + #include __FBSDID("$FreeBSD: src/sys/i386/i386/dump_machdep.c,v 1.14 2008/02/15 06:26:25 scottl Exp $"); @@ -34,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -109,6 +112,7 @@ mkdumpheader(struct kerneldumpheader *kdh, uint32_t archver, uint64_t dumplen, uint32_t blksz) { + INIT_VPROCG(TD_TO_VPROCG(curthread)); /* XXX */ bzero(kdh, sizeof(*kdh)); strncpy(kdh->magic, KERNELDUMPMAGIC, sizeof(kdh->magic)); @@ -118,7 +122,7 @@ kdh->dumplength = htod64(dumplen); kdh->dumptime = htod64(time_second); kdh->blocksize = htod32(blksz); - strncpy(kdh->hostname, hostname, sizeof(kdh->hostname)); + strncpy(kdh->hostname, V_hostname, sizeof(kdh->hostname)); strncpy(kdh->versionstring, version, sizeof(kdh->versionstring)); if (panicstr != NULL) strncpy(kdh->panicstring, panicstr, sizeof(kdh->panicstring)); --- /u/marko/p4/head/src/sys/i386/i386/minidump_machdep.c 2008-02-27 18:28:48.000000000 +0100 +++ src/sys/i386/i386/minidump_machdep.c 2008-02-27 11:46:23.000000000 +0100 @@ -24,6 +24,8 @@ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +#include "opt_vimage.h" + #include __FBSDID("$FreeBSD: src/sys/i386/i386/minidump_machdep.c,v 1.5 2008/02/15 06:26:25 scottl Exp $"); @@ -34,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -83,6 +86,7 @@ mkdumpheader(struct kerneldumpheader *kdh, uint32_t archver, uint64_t dumplen, uint32_t blksz) { + INIT_VPROCG(TD_TO_VPROCG(curthread)); bzero(kdh, sizeof(*kdh)); strncpy(kdh->magic, KERNELDUMPMAGIC, sizeof(kdh->magic)); @@ -92,7 +96,7 @@ kdh->dumplength = htod64(dumplen); kdh->dumptime = htod64(time_second); kdh->blocksize = htod32(blksz); - strncpy(kdh->hostname, hostname, sizeof(kdh->hostname)); + strncpy(kdh->hostname, V_hostname, sizeof(kdh->hostname)); strncpy(kdh->versionstring, version, sizeof(kdh->versionstring)); if (panicstr != NULL) strncpy(kdh->panicstring, panicstr, sizeof(kdh->panicstring)); --- /u/marko/p4/head/src/sys/kern/init_main.c 2008-01-15 17:58:03.000000000 +0100 +++ src/sys/kern/init_main.c 2008-02-27 11:46:50.000000000 +0100 @@ -47,6 +47,7 @@ #include "opt_ddb.h" #include "opt_init_path.h" #include "opt_mac.h" +#include "opt_vimage.h" #include #include @@ -66,6 +67,7 @@ #include #include #include +#include #include #include #include @@ -73,6 +75,7 @@ #include #include #include +#include #include @@ -447,6 +450,11 @@ p->p_ucred->cr_uidinfo = uifind(0); p->p_ucred->cr_ruidinfo = uifind(0); p->p_ucred->cr_prison = NULL; /* Don't jail it. 
*/ +#ifdef VIMAGE + P_TO_VIMAGE(p) = LIST_FIRST(&vimage_head); + refcount_acquire(&P_TO_VIMAGE(p)->vi_ucredrefc); + LIST_FIRST(&vprocg_head)->nprocs++; +#endif #ifdef AUDIT audit_cred_kproc0(p->p_ucred); #endif --- /u/marko/p4/head/src/sys/kern/kern_clock.c 2007-12-27 19:31:49.000000000 +0100 +++ src/sys/kern/kern_clock.c 2008-01-14 19:23:33.000000000 +0100 @@ -42,6 +42,7 @@ #include "opt_hwpmc_hooks.h" #include "opt_ntp.h" #include "opt_watchdog.h" +#include "opt_vimage.h" #include #include @@ -65,6 +66,7 @@ #include #include #include +#include #ifdef GPROF #include @@ -87,6 +89,8 @@ static int sysctl_kern_cp_time(SYSCTL_HANDLER_ARGS) { + INIT_VPROCG(TD_TO_VPROCG(curthread)); + int error; long cp_time[CPUSTATES]; #ifdef SCTL_MASK32 @@ -100,14 +104,14 @@ if (!req->oldptr) return SYSCTL_OUT(req, 0, sizeof(cp_time32)); for (i = 0; i < CPUSTATES; i++) - cp_time32[i] = (unsigned int)cp_time[i]; + cp_time32[i] = (unsigned int)V_cp_time[i]; error = SYSCTL_OUT(req, cp_time32, sizeof(cp_time32)); } else #endif { if (!req->oldptr) - return SYSCTL_OUT(req, 0, sizeof(cp_time)); - error = SYSCTL_OUT(req, cp_time, sizeof(cp_time)); + return SYSCTL_OUT(req, 0, sizeof(V_cp_time)); + error = SYSCTL_OUT(req, V_cp_time, sizeof(V_cp_time)); } return error; } @@ -223,6 +227,11 @@ int ticks; int psratio; +#ifdef VIMAGE +u_int tot_acc_statcalls; +int last_acc_ticks; +#endif + /* * Initialize clock frequencies and start both clocks running. 
*/ @@ -468,9 +477,15 @@ struct proc *p; long rss; long *cp_time; + int sel; td = curthread; p = td->td_proc; +#ifdef VIMAGE + INIT_VPROCG(TD_TO_VPROCG(td)); + INIT_VCPU(TD_TO_VCPU(td)); + struct vprocg *vprocg_iter; +#endif cp_time = (long *)PCPU_PTR(cp_time); if (usermode) { @@ -483,9 +498,9 @@ #endif td->td_uticks++; if (p->p_nice > NZERO) - cp_time[CP_NICE]++; + sel = CP_NICE; else - cp_time[CP_USER]++; + sel = CP_USER; } else { /* * Came from kernel mode, so we were: @@ -502,7 +517,7 @@ if ((td->td_pflags & TDP_ITHREAD) || td->td_intr_nesting_level >= 2) { td->td_iticks++; - cp_time[CP_INTR]++; + sel = CP_INTR; } else { #ifdef KSE if (p->p_flag & P_SA) @@ -511,11 +526,50 @@ td->td_pticks++; td->td_sticks++; if (!TD_IS_IDLETHREAD(td)) - cp_time[CP_SYS]++; + sel = CP_SYS; else - cp_time[CP_IDLE]++; + sel = CP_IDLE; + } + } + atomic_add_long(&V_cp_time[sel], 1); /* XXX remove atomic! */ +#ifdef VIMAGE + if (sel != CP_INTR) + sel = CP_IDLE; + /* XXX list locking? */ + LIST_FOREACH(vprocg_iter, &vprocg_head, vprocg_le) + if (vprocg != vprocg_iter) + atomic_add_long(&vprocg_iter->_cp_time[sel], 1); + + /* Per-vcpu average accounting */ + mtx_lock_spin(&vcpu_list_mtx); + tot_acc_statcalls++; + if (!TD_IS_IDLETHREAD(td)) + V_acc_statcalls++; + + /* Decay processing every 1/16 second */ + if (last_acc_ticks + (hz >> 4) <= ticks) { + u_int weight_fixp; + u_int avg0; + + last_acc_ticks = ticks; + /* + * avg0, avg1 and avg2 are stored in 16.16 fixed point format. + * weight_fixp is in 1.31 format for better accuracy. + * + * avg1 loses half of its value in roughly 150 ms. + * avg2 loses half of its value in roughly 1350 ms.
+ */ + weight_fixp = 0x80000000 / tot_acc_statcalls; + LIST_FOREACH(vcpu, &vcpu_head, vcpu_le) { + avg0 = (weight_fixp * V_acc_statcalls) >> 15; + V_avg1_fixp = (3 * V_avg1_fixp + avg0) >> 2; + V_avg2_fixp = (31 * V_avg2_fixp + avg0) >> 5; + V_acc_statcalls = 0; } + tot_acc_statcalls = 0; } + mtx_unlock_spin(&vcpu_list_mtx); +#endif /* Update resource usage integrals and maximums. */ MPASS(p->p_vmspace != NULL); --- /u/marko/p4/head/src/sys/kern/kern_exit.c 2008-02-27 18:28:48.000000000 +0100 +++ src/sys/kern/kern_exit.c 2008-02-27 11:47:03.000000000 +0100 @@ -40,6 +40,8 @@ #include "opt_compat.h" #include "opt_ktrace.h" #include "opt_mac.h" +#include "opt_sched.h" +#include "opt_vimage.h" #include #include @@ -67,6 +69,7 @@ #include #include #include +#include #ifdef KTRACE #include #endif @@ -165,6 +168,7 @@ } KASSERT(p->p_numthreads == 1, ("exit1: proc %p exiting with %d threads", p, p->p_numthreads)); + /* * Wakeup anyone in procfs' PIOCWAIT. They should have a hold * on our vmspace, so we should block below until they have @@ -404,6 +408,10 @@ LIST_REMOVE(p, p_list); LIST_INSERT_HEAD(&zombproc, p, p_list); LIST_REMOVE(p, p_hash); +#if defined(VIMAGE) && defined(SCHED_4BSD) + if (P_TO_VPROCG(p) != P_TO_VPROCG(p->p_pptr)) + sched_load_reassign(P_TO_VPROCG(p), P_TO_VPROCG(p->p_pptr)); +#endif sx_xunlock(&allproc_lock); /* @@ -661,6 +669,7 @@ AUDIT_ARG(pid, pid); q = td->td_proc; + if (pid == 0) { PROC_LOCK(q); pid = -q->p_pgid; @@ -705,6 +714,7 @@ nfound++; PROC_SLOCK(p); if (p->p_state == PRS_ZOMBIE) { + INIT_VPROCG(P_TO_VPROCG(p)); if (rusage) { *rusage = p->p_ru; calcru(p, &rusage->ru_utime, &rusage->ru_stime); @@ -792,6 +802,9 @@ uma_zfree(proc_zone, p); sx_xlock(&allproc_lock); nprocs--; +#ifdef VIMAGE + vprocg->nprocs--; +#endif sx_xunlock(&allproc_lock); return (0); } --- /u/marko/p4/head/src/sys/kern/kern_fork.c 2007-11-16 18:15:01.000000000 +0100 +++ src/sys/kern/kern_fork.c 2007-12-10 11:26:04.000000000 +0100 @@ -39,6 +39,7 @@ #include "opt_ktrace.h" 
#include "opt_mac.h" +#include "opt_vimage.h" #include #include @@ -65,6 +66,7 @@ #include #include #include +#include #include #include @@ -75,7 +77,6 @@ #include #include - #ifndef _SYS_SYSPROTO_H_ struct fork_args { int dummy; @@ -332,6 +333,9 @@ * are hard-limits as to the number of processes that can run. */ nprocs++; +#ifdef VIMAGE + P_TO_VPROCG(p1)->nprocs++; +#endif /* * Find an unused process ID. We remember a range of unused IDs @@ -500,6 +504,9 @@ td2->td_sigmask = td->td_sigmask; td2->td_flags = TDF_INMEM; + td2->td_vnet = NULL; /* XXX */ + td2->td_vnet_lpush = NULL; /* XXX */ + /* * Duplicate sub-structures as needed. * Increase reference counts on shared objects. --- /u/marko/p4/head/src/sys/kern/kern_jail.c 2008-01-28 23:53:46.000000000 +0100 +++ src/sys/kern/kern_jail.c 2008-02-27 11:47:04.000000000 +0100 @@ -11,6 +11,7 @@ __FBSDID("$FreeBSD: src/sys/kern/kern_jail.c,v 1.75 2008/01/24 08:25:58 bz Exp $"); #include "opt_mac.h" +#include "opt_vimage.h" #include #include @@ -33,6 +34,8 @@ #include #include #include +#include + #include #include @@ -449,6 +452,10 @@ if (cred2->cr_prison != cred1->cr_prison) return (ESRCH); } +#ifdef VIMAGE + if (cred2->cr_vimage->v_procg != cred1->cr_vimage->v_procg) + return (ESRCH); +#endif return (0); } @@ -469,13 +476,14 @@ void getcredhostname(struct ucred *cred, char *buf, size_t size) { + INIT_VPROCG(cred->cr_vimage->v_procg); if (jailed(cred)) { mtx_lock(&cred->cr_prison->pr_mtx); strlcpy(buf, cred->cr_prison->pr_host, size); mtx_unlock(&cred->cr_prison->pr_mtx); } else - strlcpy(buf, hostname, size); + strlcpy(buf, V_hostname, size); } /* --- /u/marko/p4/head/src/sys/kern/kern_kse.c 2007-11-16 18:15:01.000000000 +0100 +++ src/sys/kern/kern_kse.c 2007-12-10 11:26:04.000000000 +0100 @@ -1043,7 +1043,10 @@ */ bcopy(&td->td_startcopy, &td2->td_startcopy, __rangeof(struct thread, td_startcopy, td_endcopy)); + td2->td_vnet = NULL; + td2->td_vnet_lpush = NULL; sched_fork_thread(td, td2); + thread_link(td2, 
ku->ku_proc); bcopy(ku->ku_proc->p_comm, td2->td_name, sizeof(td2->td_name)); /* inherit parts of blocked thread's context as a good template */ --- /u/marko/p4/head/src/sys/kern/kern_linker.c 2008-01-15 18:00:08.000000000 +0100 +++ src/sys/kern/kern_linker.c 2008-02-27 11:47:07.000000000 +0100 @@ -30,6 +30,7 @@ #include "opt_ddb.h" #include "opt_hwpmc_hooks.h" #include "opt_mac.h" +#include "opt_vimage.h" #include #include @@ -51,6 +52,9 @@ #include #include #include +#include + +#include #include @@ -950,6 +954,13 @@ if ((error = priv_check(td, PRIV_KLD_LOAD)) != 0) return (error); +#ifdef VIMAGE + if (!IS_DEFAULT_VIMAGE(TD_TO_VIMAGE(td))) + return (EPERM); + + CURVNET_SET(TD_TO_VNET(td)); +#endif + /* * If file does not contain a qualified name or any dot in it * (kldname.ko, or kldname.ver.ko) treat it as an interface @@ -977,6 +988,7 @@ *fileid = lf->id; unlock: KLD_UNLOCK(); + CURVNET_RESTORE(); return (error); } @@ -1014,6 +1026,11 @@ if ((error = priv_check(td, PRIV_KLD_UNLOAD)) != 0) return (error); + /* XXX should suser catch this for us? 
*/ + VNET_ASSERT(IS_DEFAULT_VIMAGE(TD_TO_VIMAGE(td))); + + CURVNET_SET(TD_TO_VNET(td)); + KLD_LOCK(); lf = linker_find_file_by_id(fileid); if (lf) { @@ -1050,6 +1067,7 @@ PMC_CALL_HOOK(td, PMC_FN_KLD_UNLOAD, (void *) &pkm); #endif KLD_UNLOCK(); + CURVNET_RESTORE(); return (error); } @@ -1267,12 +1285,24 @@ lookup.symvalue = (uintptr_t)symval.value; lookup.symsize = symval.size; error = copyout(&lookup, uap->data, - sizeof(lookup)); + sizeof(lookup)); break; } } +#ifdef VIMAGE + if (lf == NULL) { + CURVNET_SET(TD_TO_VNET(td)); + error = vi_symlookup(&lookup, symstr); + CURVNET_RESTORE(); + if (error == 0) { + error = copyout(&lookup, uap->data, + sizeof(lookup)); + } + } +#else if (lf == NULL) error = ENOENT; +#endif } KLD_UNLOCK(); out: --- /u/marko/p4/head/src/sys/kern/kern_mib.c 2008-02-27 18:28:48.000000000 +0100 +++ src/sys/kern/kern_mib.c 2008-02-27 11:47:10.000000000 +0100 @@ -41,6 +41,7 @@ #include "opt_compat.h" #include "opt_posix.h" #include "opt_config.h" +#include "opt_vimage.h" #include #include @@ -53,6 +54,7 @@ #include #include #include +#include SYSCTL_NODE(, 0, sysctl, CTLFLAG_RW, 0, "Sysctl internal magic"); @@ -206,11 +208,14 @@ SYSCTL_STRING(_hw, HW_MACHINE_ARCH, machine_arch, CTLFLAG_RD, machine_arch, 0, "System architecture"); +#ifndef VIMAGE char hostname[MAXHOSTNAMELEN]; +#endif static int sysctl_hostname(SYSCTL_HANDLER_ARGS) { + INIT_VPROCG(TD_TO_VPROCG(req->td)); struct prison *pr; char tmphostname[MAXHOSTNAMELEN]; int error; @@ -242,7 +247,7 @@ } } else error = sysctl_handle_string(oidp, - hostname, sizeof hostname, req); + V_hostname, sizeof V_hostname, req); return (error); } @@ -328,9 +333,12 @@ 0, 0, sysctl_kern_config, "", "Kernel configuration file"); #endif +#ifndef VIMAGE char domainname[MAXHOSTNAMELEN]; -SYSCTL_STRING(_kern, KERN_NISDOMAINNAME, domainname, CTLFLAG_RW, - &domainname, sizeof(domainname), "Name of the current YP/NIS domain"); +#endif +SYSCTL_V_STRING(V_PROCG, vprocg, _kern, KERN_NISDOMAINNAME, domainname, + 
CTLFLAG_RW, domainname, MAXHOSTNAMELEN, + "Name of the current YP/NIS domain"); u_long hostid; SYSCTL_ULONG(_kern, KERN_HOSTID, hostid, CTLFLAG_RW, &hostid, 0, "Host ID"); --- /u/marko/p4/head/src/sys/kern/kern_prot.c 2007-10-29 17:17:39.000000000 +0100 +++ src/sys/kern/kern_prot.c 2007-12-10 11:26:04.000000000 +0100 @@ -46,6 +46,7 @@ #include "opt_compat.h" #include "opt_mac.h" +#include "opt_vimage.h" #include #include @@ -67,6 +68,7 @@ #include #include #include +#include #include #include @@ -1720,6 +1722,9 @@ KASSERT(td == curthread, ("%s: td not curthread", __func__)); PROC_LOCK_ASSERT(p, MA_OWNED); +#ifdef VIMAGE + if (!vi_child_of(TD_TO_VIMAGE(td), P_TO_VIMAGE(p))) +#endif if ((error = prison_check(td->td_ucred, p->p_ucred))) return (error); #ifdef MAC @@ -1789,6 +1794,10 @@ */ if (jailed(cr)) prison_free(cr->cr_prison); +#ifdef VIMAGE + if (cr->cr_vimage != NULL) + refcount_release(&cr->cr_vimage->vi_ucredrefc); +#endif #ifdef AUDIT audit_cred_destroy(cr); #endif @@ -1824,6 +1833,10 @@ uihold(dest->cr_ruidinfo); if (jailed(dest)) prison_hold(dest->cr_prison); +#ifdef VIMAGE + KASSERT(src->cr_vimage != NULL, ("cr_vimage == NULL")); + refcount_acquire(&dest->cr_vimage->vi_ucredrefc); +#endif #ifdef AUDIT audit_cred_copy(src, dest); #endif --- /u/marko/p4/head/src/sys/kern/kern_switch.c 2007-11-14 19:35:22.000000000 +0100 +++ src/sys/kern/kern_switch.c 2007-12-10 11:26:05.000000000 +0100 @@ -529,6 +529,7 @@ return (NULL); } + /* * Remove the thread from the queue specified by its priority, and clear the * corresponding status bit if the queue becomes empty. 
--- /u/marko/p4/head/src/sys/kern/kern_synch.c 2008-01-15 18:00:09.000000000 +0100 +++ src/sys/kern/kern_synch.c 2008-02-27 11:47:20.000000000 +0100 @@ -38,6 +38,7 @@ __FBSDID("$FreeBSD: src/sys/kern/kern_synch.c,v 1.305 2008/01/10 22:11:20 rwatson Exp $"); #include "opt_ktrace.h" +#include "opt_vimage.h" #include #include @@ -61,9 +62,12 @@ #include #include #endif +#include #include +#include + static void synch_setup(void *dummy); SYSINIT(synch_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, synch_setup, NULL) @@ -74,8 +78,11 @@ static struct callout loadav_callout; static struct callout lbolt_callout; +#ifndef VIMAGE struct loadavg averunnable = { {0, 0, 0}, FSCALE }; /* load average, of runnable procs */ +#endif + /* * Constants for averages over 1, 5, and 15 minutes * when sampling at 5 second intervals. @@ -513,12 +520,19 @@ int i, nrun; struct loadavg *avg; + VPROCG_ITERLOOP_BEGIN(); + INIT_VPROCG(vprocg_iter); +#ifdef VIMAGE + nrun = sched_load(vprocg_iter); +#else nrun = sched_load(); - avg = &averunnable; +#endif + avg = &V_averunnable; for (i = 0; i < 3; i++) avg->ldavg[i] = (cexp[i] * avg->ldavg[i] + nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT; + VPROCG_ITERLOOP_END(); /* * Schedule the next update to occur after 5 seconds, but add a --- /u/marko/p4/head/src/sys/kern/kern_sysctl.c 2007-12-03 11:00:00.000000000 +0100 +++ src/sys/kern/kern_sysctl.c 2007-12-10 11:26:05.000000000 +0100 @@ -40,6 +40,7 @@ #include "opt_compat.h" #include "opt_mac.h" +#include "opt_vimage.h" #include #include @@ -52,6 +53,7 @@ #include #include #include +#include #include @@ -845,6 +847,32 @@ } +#ifdef VIMAGE +int +sysctl_handle_v_int(SYSCTL_HANDLER_V_ARGS) +{ + int tmpout, error = 0; + + SYSCTL_RESOLVE_V_ARG1(); + + /* + * Attempt to get a coherent snapshot by making a copy of the data. 
+ */ + tmpout = *(int *)arg1; + error = SYSCTL_OUT(req, &tmpout, sizeof(int)); + + if (error || !req->newptr) + return (error); + + if (!arg1) + error = EPERM; + else + error = SYSCTL_IN(req, arg1, sizeof(int)); + return (error); +} +#endif + + /* * Based on on sysctl_handle_int() convert milliseconds into ticks. */ @@ -979,6 +1007,48 @@ return (error); } +#ifdef VIMAGE +int +sysctl_handle_v_string(SYSCTL_HANDLER_V_ARGS) +{ + int error=0; + char *tmparg; + size_t outlen; + + SYSCTL_RESOLVE_V_ARG1(); + + /* + * Attempt to get a coherent snapshot by copying to a + * temporary kernel buffer. + */ +retry: + outlen = strlen((char *)arg1)+1; + tmparg = malloc(outlen, M_SYSCTLTMP, M_WAITOK); + + if (strlcpy(tmparg, (char *)arg1, outlen) >= outlen) { + free(tmparg, M_SYSCTLTMP); + goto retry; + } + + error = SYSCTL_OUT(req, tmparg, outlen); + free(tmparg, M_SYSCTLTMP); + + if (error || !req->newptr) + return (error); + + if ((req->newlen - req->newidx) >= arg2) { + error = EINVAL; + } else { + arg2 = (req->newlen - req->newidx); + error = SYSCTL_IN(req, arg1, arg2); + ((char *)arg1)[arg2] = '\0'; + } + + return (error); +} +#endif + + /* * Handle any kind of opaque data. * arg1 points to it, arg2 is the size. @@ -1016,6 +1086,35 @@ return (error); } +#ifdef VIMAGE +int +sysctl_handle_v_opaque(SYSCTL_HANDLER_V_ARGS) +{ + int error, tries; + u_int generation; + struct sysctl_req req2; + + SYSCTL_RESOLVE_V_ARG1(); + + tries = 0; + req2 = *req; +retry: + generation = curthread->td_generation; + error = SYSCTL_OUT(req, arg1, arg2); + if (error) + return (error); + tries++; + if (generation != curthread->td_generation && tries < 3) { + *req = req2; + goto retry; + } + + error = SYSCTL_IN(req, arg1, arg2); + + return (error); +} +#endif + /* * Transfer functions to/from kernel space. 
* XXX: rather untested at this point @@ -1322,7 +1421,17 @@ if (error != 0) return (error); #endif +#ifndef VIMAGE error = oid->oid_handler(oid, arg1, arg2, req); +#else + if (oid->oid_v_subs) { + struct sysctl_v_oid *v_oid = (struct sysctl_v_oid *) oid; + error = v_oid->oid_handler(oid, arg1, arg2, + req, oid->oid_v_subs, + oid->oid_v_mod); + } else + error = oid->oid_handler(oid, arg1, arg2, req); +#endif return (error); } @@ -1413,6 +1522,7 @@ req.lock = REQ_LOCKED; SYSCTL_LOCK(); + CURVNET_SET(TD_TO_VNET(curthread)); do { req.oldidx = 0; @@ -1423,6 +1533,7 @@ if (req.lock == REQ_WIRED && req.validlen > 0) vsunlock(req.oldptr, req.validlen); + CURVNET_RESTORE(); SYSCTL_UNLOCK(); if (error && error != ENOMEM) --- /u/marko/p4/head/src/sys/kern/kern_thread.c 2007-12-27 19:31:56.000000000 +0100 +++ src/sys/kern/kern_thread.c 2008-01-14 19:23:37.000000000 +0100 @@ -26,6 +26,8 @@ * DAMAGE. */ +#include "opt_vimage.h" + #include __FBSDID("$FreeBSD: src/sys/kern/kern_thread.c,v 1.265 2007/12/22 04:56:48 julian Exp $"); @@ -44,6 +46,7 @@ #include #include #include +#include #include --- /u/marko/p4/head/src/sys/kern/kern_timeout.c 2008-02-27 18:28:49.000000000 +0100 +++ src/sys/kern/kern_timeout.c 2008-02-27 11:47:24.000000000 +0100 @@ -73,6 +73,9 @@ struct callout_tailq *callwheel; int softticks; /* Like ticks, but for softclock(). */ struct mtx callout_lock; +#ifdef INVARIANTS +static int callwheel_initialized = 0; +#endif static struct callout *nextsoftcheck; /* Next callout to be checked. */ @@ -143,6 +146,9 @@ TAILQ_INIT(&callwheel[i]); } mtx_init(&callout_lock, "callout", NULL, MTX_SPIN | MTX_RECURSE); +#ifdef INVARIANTS + callwheel_initialized = 1; +#endif } /* @@ -612,11 +618,36 @@ return (1); } +#ifdef INVARIANTS +/* + * Examine the entire callwheel before initializing a new handle, + * and panic if the handle was already linked in.
+ */ +#define CALLWHEEL_CHECK(c) \ + if (callwheel_initialized) { \ + int callwheel_iter; \ + struct callout *c_iter; \ + \ + mtx_lock_spin(&callout_lock); \ + for (callwheel_iter = 0; callwheel_iter <= callwheelmask; \ + callwheel_iter++) \ + TAILQ_FOREACH(c_iter, &callwheel[callwheel_iter], \ + c_links.tqe) \ + if (c_iter == c) \ + panic("%s() for active handle!", \ + __FUNCTION__); \ + mtx_unlock_spin(&callout_lock); \ + } +#else +#define CALLWHEEL_CHECK(c) +#endif /* INVARIANTS */ + void callout_init(c, mpsafe) struct callout *c; int mpsafe; { + CALLWHEEL_CHECK(c); bzero(c, sizeof *c); if (mpsafe) { c->c_lock = NULL; @@ -633,6 +664,7 @@ struct lock_object *lock; int flags; { + CALLWHEEL_CHECK(c); bzero(c, sizeof *c); c->c_lock = lock; KASSERT((flags & ~(CALLOUT_RETURNUNLOCKED | CALLOUT_SHAREDLOCK)) == 0, --- /u/marko/p4/head/src/sys/kern/kern_uuid.c 2007-08-31 03:47:34.000000000 +0200 +++ src/sys/kern/kern_uuid.c 2007-10-22 18:06:31.000000000 +0200 @@ -27,6 +27,8 @@ #include __FBSDID("$FreeBSD: src/sys/kern/kern_uuid.c,v 1.13 2007/04/23 12:53:00 pjd Exp $"); +#include "opt_vimage.h" + #include #include #include @@ -37,7 +39,9 @@ #include #include #include +#include +#include #include #include #include @@ -87,13 +91,14 @@ static void uuid_node(uint16_t *node) { + INIT_VNET_NET(curvnet); struct ifnet *ifp; struct ifaddr *ifa; struct sockaddr_dl *sdl; int i; IFNET_RLOCK(); - TAILQ_FOREACH(ifp, &ifnet, if_link) { + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { /* Walk the address list */ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sdl = (struct sockaddr_dl*)ifa->ifa_addr; --- /dev/null 2008-02-27 21:11:00.000000000 +0100 +++ src/sys/kern/kern_vimage.c 2008-02-27 18:30:30.000000000 +0100 @@ -0,0 +1,1011 @@ +/*- + * Copyright (c) 2004, 2005, 2006 University of Zagreb + * Copyright (c) 2006 FreeBSD Foundation + * + * This software was developed by the University of Zagreb and the + * FreeBSD Foundation under sponsorship by the Stichting NLnet and the + * FreeBSD 
Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * XXX RCS tag goes here + */ + +#include "opt_ddb.h" +#include "opt_vimage.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef DDB +#include +#endif + +#include +#include +#include +#include +#include + +//#define DEBUG_ORDERING + +MALLOC_DEFINE(M_VIMAGE, "vimage", "virtual image resource container"); +MALLOC_DEFINE(M_VNET, "vnet", "network stack control block"); +MALLOC_DEFINE(M_VPROCG, "vprocg", "process group control block"); +MALLOC_DEFINE(M_VCPU, "vcpu", "cpu resource control block"); + +static struct vimage *vi_alloc(struct vimage *, char *); +static int vi_destroy(struct vimage *); +static void vnet_mod_complete_registration(struct vnet_modlink *); +static int vnet_mod_constructor(struct vnet_modlink *); +static int vnet_mod_destructor(struct vnet_modlink *); + +#ifdef VI_PREALLOC_SIZE +/* + * A private memory allocator can be enabled by setting VI_PREALLOC_SIZE + * to the amount of memory (in bytes) to be reserved for the allocator at + * boot time. This pool is guaranteed to reside on a 4M superpage(s) on + * i386 and amd64, thus potentially reducing TLB thrashing. + * + * So far I couldn't observe any significant performance impact of using + * this allocator vs. the standard malloc(), whereas in FreeBSD 4.11 + * days I recall using "uninitialized data" storage vs. malloc() would + * be an instant win... Is it possible that these days all malloc'ed + * kernel storage is automagically placed on 4M superpages, so that this + * effort is redundant? Who knows... Therefore this code is disabled by + * default, so vi_malloc() and vi_free() simply resolve to standard + * malloc() and free().
+ */ + +static void *vi_malloc(unsigned long, struct malloc_type *, int); +static void vi_free(void *, struct malloc_type *); + +struct vi_mtrack { + LIST_ENTRY(vi_mtrack) vmt_le; + char *vmt_addr; + size_t vmt_size; + int vmt_flags; +}; + +static char vi_mpool[VI_PREALLOC_SIZE]; +static struct uma_zone *vi_mtrack_zone; +static LIST_HEAD(, vi_mtrack) vi_mem_free_head; +static LIST_HEAD(, vi_mtrack) vi_mem_alloc_head; +static int vi_mpool_fail_cnt = 0; +#else +#define vi_malloc(addr, type, flags) malloc((addr), (type), (flags)) +#define vi_free(addr, type) free((addr), (type)) +#endif /* VI_PREALLOC_SIZE */ + +struct vimage_list_head vimage_head; +struct vnet_list_head vnet_head; +struct vprocg_list_head vprocg_head; +struct vcpu_list_head vcpu_head; + +struct cv vnet_list_condvar; +struct mtx vnet_list_refc_mtx; +int vnet_list_refc = 0; + +struct mtx vcpu_list_mtx; + +#define VNET_LIST_LOCK() \ + mtx_lock(&vnet_list_refc_mtx); \ + while (vnet_list_refc != 0) \ + cv_wait(&vnet_list_condvar, &vnet_list_refc_mtx); + +#define VNET_LIST_UNLOCK() \ + mtx_unlock(&vnet_list_refc_mtx); + +static u_int last_vi_id = 0; +static u_int last_vnet_id = 0; +static u_int last_vprocg_id = 0; +static u_int last_vcpu_id = 0; + +static TAILQ_HEAD(vnet_modlink_head, vnet_modlink) vnet_modlink_head; +static TAILQ_HEAD(vnet_modpending_head, vnet_modlink) vnet_modpending_head; + +void vnet_mod_register(vmi) + const struct vnet_modinfo *vmi; +{ + vnet_mod_register_multi(vmi, NULL, NULL); +} + +void vnet_mod_register_multi(vmi, iarg, iname) + const struct vnet_modinfo *vmi; + const void *iarg; + const char *iname; +{ + struct vnet_modlink *vml, *vml_iter; + + /* Do not register the same module instance more than once */ + TAILQ_FOREACH(vml_iter, &vnet_modlink_head, vml_mod_le) + if (vml_iter->vml_modinfo == vmi && vml_iter->vml_iarg == iarg) + break; + if (vml_iter != NULL) + panic("attempt to register an already registered vnet module"); + vml = vi_malloc(sizeof(struct vnet_modlink), 
M_VIMAGE, M_NOWAIT); + + /* + * XXX we support only statically assigned module IDs at the time. + * In principle modules should be able to get a dynamically + * assigned ID at registration time. + */ + VNET_ASSERT(vmi->vmi_id > 0 || vmi->vmi_id < VNET_MOD_MAX); + VNET_ASSERT(!((iarg == NULL) ^ (iname == NULL))); + + vml->vml_modinfo = vmi; + vml->vml_iarg = iarg; + vml->vml_iname = iname; + + /* Check whether the module we depend on is already registered */ + if (vmi->vmi_dependson != VNET_MOD_NONE) { + TAILQ_FOREACH(vml_iter, &vnet_modlink_head, vml_mod_le) + if (vml_iter->vml_modinfo->vmi_id == + vmi->vmi_dependson) + break; /* Depencency found, we are done */ + if (vml_iter == NULL) { +#ifdef DEBUG_ORDERING + printf("dependency %d missing for vnet mod %s," + "postponing registration\n", + vmi->vmi_dependson, vmi->vmi_name); +#endif /* DEBUG_ORDERING */ + TAILQ_INSERT_TAIL(&vnet_modpending_head, vml, + vml_mod_le); + return; + } + } + + vnet_mod_complete_registration(vml); +} + +void vnet_mod_complete_registration(vml) +struct vnet_modlink *vml; +{ + struct vnet_modlink *vml_iter; + + TAILQ_INSERT_TAIL(&vnet_modlink_head, vml, vml_mod_le); + + VNET_ITERLOOP_BEGIN_QUIET(); + vnet_mod_constructor(vml); + VNET_ITERLOOP_END(); + + /* Check for pending modules depending on us */ + do { + TAILQ_FOREACH(vml_iter, &vnet_modpending_head, vml_mod_le) + if (vml_iter->vml_modinfo->vmi_dependson == + vml->vml_modinfo->vmi_id) + break; + if (vml_iter != NULL) { +#ifdef DEBUG_ORDERING + printf("vnet mod %s now registering," + "dependency %d loaded\n", + vml_iter->vml_modinfo->vmi_name, + vml->vml_modinfo->vmi_id); +#endif /* DEBUG_ORDERING */ + TAILQ_REMOVE(&vnet_modpending_head, vml_iter, + vml_mod_le); + vnet_mod_complete_registration(vml_iter); + } + } while (vml_iter != NULL); +} + +void vnet_mod_deregister(vmi) + const struct vnet_modinfo *vmi; +{ + vnet_mod_deregister_multi(vmi, NULL, NULL); +} + +void vnet_mod_deregister_multi(vmi, iarg, iname) + const struct 
vnet_modinfo *vmi; + const void *iarg; + const char *iname; +{ + struct vnet_modlink *vml; + + TAILQ_FOREACH(vml, &vnet_modlink_head, vml_mod_le) + if (vml->vml_modinfo == vmi && vml->vml_iarg == iarg) + break; + if (vml == NULL) + panic("cannot deregister unregistered vnet module %s", + vmi->vmi_name); + + VNET_ITERLOOP_BEGIN_QUIET(); + vnet_mod_destructor(vml); + VNET_ITERLOOP_END(); + + TAILQ_REMOVE(&vnet_modlink_head, vml, vml_mod_le); + vi_free(vml, M_VIMAGE); +} + +struct vimage *vnet2vimage(vnet) + struct vnet *vnet; +{ + struct vimage *vip; + + LIST_FOREACH(vip, &vimage_head, vi_le) + if (vip->v_net == vnet) + return(vip); + + panic("vnet2vimage"); /* must never happen */ +} + +char *vnet_name(vnet) + struct vnet *vnet; +{ + return(vnet2vimage(vnet)->vi_name); +} + + +int +vi_child_of(parent, child) + struct vimage *parent, *child; +{ + if (child == parent) + return (0); + for (; child; child = child->vi_parent) + if (child == parent) + return (1); + return (0); +} + +/* + * if_reassign_common() should be called by all device specific + * ifnet reassignment routines after the interface is detached from + * current vnet and before the interface gets attached to the target + * vnet. This routine attempts to shrink if_index in current vnet, + * find an unused if_index in target vnet and calls if_grow() if + * necessary, and finally finds an unused if_xname for the target + * vnet. + * + * XXX this routine should hold a lock over if_index and return with + * such a lock held, and the caller should release that lock + * after ifattach completes! 
+ */ +void +if_reassign_common(struct ifnet *ifp, struct vnet *new_vnet, const char *dname) +{ + /* do/while construct needed to confine scope of INIT_VNET_NET() */ + do { + INIT_VNET_NET(curvnet); + + ifnet_byindex(ifp->if_index) = NULL; + /* XXX: should be locked with if_findindex() */ + while (V_if_index > 0 && ifnet_byindex(V_if_index) == NULL) + V_if_index--; + } while (0); + + CURVNET_SET_QUIET(new_vnet); + INIT_VNET_NET(new_vnet); + /* + * Try to find an empty slot below if_index. If we fail, take + * the next slot. + * + * XXX: should be locked! + */ + for (ifp->if_index = 1; ifp->if_index <= V_if_index; ifp->if_index++) { + if (ifnet_byindex(ifp->if_index) == NULL) + break; + } + /* Catch if_index overflow. */ + if (ifp->if_index < 1) + panic("vi_if_move: if_index overflow"); + + if (ifp->if_index > V_if_index) + V_if_index = ifp->if_index; + if (V_if_index >= V_if_indexlim) + if_grow(); + ifnet_byindex(ifp->if_index) = ifp; + + /* Rename the ifnet */ + if (new_vnet == ifp->if_home_vnet) { + /* always restore the original name on return to home vnet */ + snprintf(ifp->if_xname, IFNAMSIZ, "%s%d", ifp->if_dname, + ifp->if_dunit); + } else { + int unit = 0; + struct ifnet *iter; + + do { + snprintf(ifp->if_xname, IFNAMSIZ, "%s%d", dname, unit); + TAILQ_FOREACH(iter, &V_ifnet, if_link) + if (strcmp(ifp->if_xname, iter->if_xname) == 0) + break; + unit++; + } while (iter); + } + CURVNET_RESTORE(); +} + +/* + * Move the interface to another vnet. The interface can be specified either + * by ifp argument, or by name contained in vi_req->vi_chroot if NULL is + * passed as ifp. The interface will be renamed to vi_req->vi_if_xname + * if vi_req->vi_if_xname is not an empty string (uff ugly ugly)... + * Similarly, the target vnet can be specified either by vnet argument or + * by name. If vnet name equals ".." or vi_req is set to NULL the + * interface is moved to the parent vnet.
+ */ +int +vi_if_move(vi_req, ifp, vip) + struct vi_req *vi_req; + struct ifnet *ifp; + struct vimage *vip; +{ + struct vimage *new_vip; + struct vnet *new_vnet = NULL; + + if (vi_req == NULL || strcmp(vi_req->vi_name, "..") == 0) { + if (IS_DEFAULT_VIMAGE(vip)) + return (ENXIO); + new_vnet = vip->vi_parent->v_net; + } else { + new_vip = vimage_by_name(vip, vi_req->vi_name); + if (new_vip == NULL) + return (ENXIO); + new_vnet = new_vip->v_net; + } + + if (ifp == NULL) + ifp = ifunit(vi_req->vi_chroot); + if (ifp == NULL) + return (ENXIO); + + /* Abort if driver did not provide a if_reassign() method */ + if (ifp->if_reassign == NULL) + return (ENODEV); + + if (vi_req != NULL) { + struct ifnet *t_ifp; + + CURVNET_SET_QUIET(new_vnet); + t_ifp = ifunit(vi_req->vi_if_xname); + CURVNET_RESTORE(); + if (t_ifp != NULL) + return (EEXIST); + } + + if (vi_req && strlen(vi_req->vi_if_xname) > 0) + ifp->if_reassign(ifp, new_vnet, vi_req->vi_if_xname); + else + ifp->if_reassign(ifp, new_vnet, NULL); + getmicrotime(&ifp->if_lastchange); + + /* Report the new if_xname back to the userland */ + if (vi_req != NULL) + sprintf(vi_req->vi_chroot, "%s", ifp->if_xname); + + return (0); +} + + +struct vimage * +vimage_by_name(struct vimage *top, char *name) +{ + struct vimage *vip; + char *next_name; + int namelen; + + next_name = strchr(name, '.'); + if (next_name != NULL) { + namelen = next_name - name; + next_name++; + if (namelen == 0) { + if (strlen(next_name) == 0) + return(top); /* '.' 
== this vimage */ + else + return(NULL); + } + } else + namelen = strlen(name); + if (namelen == 0) + return(NULL); + LIST_FOREACH(vip, &top->vi_child_head, vi_sibling) + if (strlen(vip->vi_name) == namelen && + strncmp(name, vip->vi_name, namelen) == 0) { + if (next_name != NULL) + return(vimage_by_name(vip, next_name)); + else + return(vip); + } + return(NULL); +} + + +static void +vimage_relative_name(struct vimage *top, struct vimage *where, + char *buffer, int bufflen) +{ + int used = 1; + + if (where == top) { + sprintf(buffer, "."); + return; + } else + *buffer = 0; + + do { + int namelen = strlen(where->vi_name); + + if (namelen + used + 1 >= bufflen) + panic("buffer overflow"); + + if (used > 1) { + bcopy(buffer, &buffer[namelen + 1], used); + buffer[namelen] = '.'; + used++; + } else + bcopy(buffer, &buffer[namelen], used); + bcopy(where->vi_name, buffer, namelen); + used += namelen; + where = where->vi_parent; + } while (where != top); +} + + +static struct vimage * +vimage_get_next(struct vimage *top, struct vimage *where, int recurse) +{ + struct vimage *next; + + if (recurse) { + /* Try to go deeper in the hierarchy */ + next = LIST_FIRST(&where->vi_child_head); + if (next != NULL) + return(next); + } + + do { + /* Try to find next sibling */ + next = LIST_NEXT(where, vi_sibling); + if (!recurse || next != NULL) + return(next); + + /* Nothing left on this level, go one level up */ + where = where->vi_parent; + } while (where != top->vi_parent); + + /* Nothing left to be visited, we are done */ + return(NULL); +} + + +int +vi_td_ioctl(cmd, vi_req, td) + u_long cmd; + struct vi_req *vi_req; + struct thread *td; +{ + int error; + struct vimage *vip = TD_TO_VIMAGE(td); + struct vimage *vip_r = NULL; + + error = suser(td); /* XXX replace with priv(9) */ + if (error) + return (error); + + vip_r = vimage_by_name(vip, vi_req->vi_name); + if (vip_r == NULL && !(vi_req->req_action & VI_CREATE)) + return (ESRCH); + if (vip_r != NULL && vi_req->req_action & 
VI_CREATE) + return (EADDRINUSE); + if (vi_req->req_action == VI_GETNEXT) { + vip_r = vimage_get_next(vip, vip_r, 0); + if (vip_r == NULL) + return (ESRCH); + } + if (vi_req->req_action == VI_GETNEXT_RECURSE) { + vip_r = vimage_get_next(vip, vip_r, 1); + if (vip_r == NULL) + return (ESRCH); + } + + if (vip_r && !vi_child_of(vip, vip_r) && /* XXX delete the rest? */ + vi_req->req_action != VI_GET && vi_req->req_action != VI_GETNEXT) + return (EPERM); + + switch (cmd) { + + case SIOCGPVIMAGE: + vimage_relative_name(vip, vip_r, vi_req->vi_name, + sizeof (vi_req->vi_name)); + bcopy(&vip_r->v_procg->_averunnable, &vi_req->averunnable, + sizeof (vi_req->averunnable)); + vi_req->vi_proc_count = vip_r->v_procg->nprocs; + vi_req->vi_if_count = vip_r->v_net->ifccnt; + vi_req->vi_sock_count = vip_r->v_net->sockcnt; + vi_req->cp_time_avg = vip_r->v_cpu->_avg2_fixp; + break; + + case SIOCSPVIMAGE: + if (vi_req->req_action == VI_DESTROY) { + error = vi_destroy(vip_r); + break; + } + + if (vi_req->req_action == VI_SWITCHTO) { + struct proc *p = td->td_proc; + struct ucred *oldcred, *newcred; + + /* + * XXX priv_check()? + * XXX allow only a single td per proc here? 
+ */ + newcred = crget(); + PROC_LOCK(p); + oldcred = p->p_ucred; + setsugid(p); + crcopy(newcred, oldcred); + refcount_release(&newcred->cr_vimage->vi_ucredrefc); + newcred->cr_vimage = vip_r; + refcount_acquire(&newcred->cr_vimage->vi_ucredrefc); + p->p_ucred = newcred; + PROC_UNLOCK(p); + sx_xlock(&allproc_lock); + oldcred->cr_vimage->v_procg->nprocs--; + refcount_release(&oldcred->cr_vimage->vi_ucredrefc); + P_TO_VPROCG(p)->nprocs++; + sched_load_reassign(oldcred->cr_vimage->v_procg, + newcred->cr_vimage->v_procg); + sx_xunlock(&allproc_lock); + crfree(oldcred); + break; + } + + if (vi_req->req_action & VI_CREATE) { + char *dotpos; + + dotpos = strrchr(vi_req->vi_name, '.'); + if (dotpos != NULL) { + *dotpos = 0; + vip = vimage_by_name(vip, vi_req->vi_name); + if (vip == NULL) + return (ESRCH); + dotpos++; + vip_r = vi_alloc(vip, dotpos); + } else + vip_r = vi_alloc(vip, vi_req->vi_name); + if (vip_r == NULL) + return (ENOMEM); + } + + /* XXX What the hell is this doing here? */ + if (vip == vip_r && !IS_DEFAULT_VIMAGE(vip)) + return (EPERM); + } + + return (error); +} + + +int +vi_symlookup(lookup, symstr) + struct kld_sym_lookup *lookup; + char *symstr; +{ + struct vnet_modlink *vml; + + TAILQ_FOREACH(vml, &vnet_modlink_head, vml_mod_le) { + struct vnet_symmap *mapentry; + + if (vml->vml_modinfo->vmi_symmap == NULL) + continue; + + for (mapentry = vml->vml_modinfo->vmi_symmap; + mapentry->name != NULL; mapentry++) { + if (strcmp(symstr, mapentry->name) == 0) { + lookup->symvalue = + (u_long) curvnet->mod_data[vml->vml_modinfo->vmi_id]; + lookup->symvalue += mapentry->offset; + lookup->symsize = mapentry->size; + return 0; + } + } + } + + return ENOENT; +} + + +struct vimage * +vi_alloc(struct vimage *parent, char *name) +{ + struct vimage *vip; + struct vnet *vnet; + struct vprocg *vprocg; + struct vcpu *vcpu; + struct vnet_modlink *vml; + + /* + * XXX don't forget the locking + */ + + /* A brute force check whether there's enough mem for a new vimage */ + 
vip = malloc(512*1024, M_VIMAGE, M_NOWAIT); /* XXX aaaargh... */ + if (vip == NULL) + goto vi_alloc_done; + free(vip, M_VIMAGE); + + vip = vi_malloc(sizeof(struct vimage), M_VIMAGE, M_NOWAIT | M_ZERO); + if (vip == NULL) + panic("vi_alloc: malloc failed for vimage \"%s\"\n", name); + vip->vi_id = last_vi_id++; + LIST_INIT(&vip->vi_child_head); + sprintf(vip->vi_name, "%s", name); + vip->vi_parent = parent; + /* XXX locking */ + if (parent != NULL) + LIST_INSERT_HEAD(&parent->vi_child_head, vip, vi_sibling); + else if (!LIST_EMPTY(&vimage_head)) + panic("there can be only one default vimage!"); + LIST_INSERT_HEAD(&vimage_head, vip, vi_le); + + vnet = vi_malloc(sizeof(struct vnet), M_VNET, M_NOWAIT | M_ZERO); + if (vnet == NULL) + panic("vi_alloc: malloc failed for vnet \"%s\"\n", name); + vip->v_net = vnet; + vnet->vnet_id = last_vnet_id++; + vnet->vnet_magic_n = VNET_MAGIC_N; + + vprocg = vi_malloc(sizeof(struct vprocg), M_VPROCG, M_NOWAIT | M_ZERO); + if (vprocg == NULL) + panic("vi_alloc: malloc failed for vprocg \"%s\"\n", name); + vip->v_procg = vprocg; + vprocg->vprocg_id = last_vprocg_id++; + + vcpu = vi_malloc(sizeof(struct vcpu), M_VCPU, M_NOWAIT | M_ZERO); + if (vcpu == NULL) + panic ("vi_alloc: malloc failed for vcpu \"%s\"\n", name); + vip->v_cpu = vcpu; + vcpu->vcpu_id = last_vcpu_id++; + + /* Struct vprocg initialization - perhaps move to another place? */ + V_averunnable.fscale = FSCALE; + + /* Initialize / attach vnet module instances.
*/ + CURVNET_SET_QUIET(vnet); + TAILQ_FOREACH(vml, &vnet_modlink_head, vml_mod_le) + vnet_mod_constructor(vml); + CURVNET_RESTORE(); + + VNET_LIST_LOCK(); + LIST_INSERT_HEAD(&vnet_head, vnet, vnet_le); + VNET_LIST_UNLOCK(); + + /* XXX locking */ + LIST_INSERT_HEAD(&vprocg_head, vprocg, vprocg_le); + + mtx_lock_spin(&vcpu_list_mtx); + LIST_INSERT_HEAD(&vcpu_head, vcpu, vcpu_le); + mtx_unlock_spin(&vcpu_list_mtx); + +vi_alloc_done: + return (vip); +} + + +/* + * Destroy a vnet - unlink all linked lists, free all the memory, stop all + * the timers... How can one ever be sure to have done *all* the necessary + * steps? + */ +static int +vi_destroy(struct vimage *vip) +{ + struct vnet *vnet = vip->v_net; + struct vprocg *vprocg = vip->v_procg; + struct vcpu *vcpu = vip->v_cpu; + struct ifnet *ifp, *nifp; + struct vnet_modlink *vml; + + /* XXX Beware of races -> more locking to be done... */ + if (!LIST_EMPTY(&vip->vi_child_head)) + return (EBUSY); + + if (vprocg->nprocs != 0) + return (EBUSY); + + if (vnet->sockcnt != 0) + return (EBUSY); + + if (vip->vi_ucredrefc != 0) + printf("vi_destroy: %s ucredrefc %d\n", + vip->vi_name, vip->vi_ucredrefc); + + /* Point with no return - cleanup MUST succeed! */ + /* XXX locking */ + LIST_REMOVE(vip, vi_le); + LIST_REMOVE(vip, vi_sibling); + + /* XXX locking */ + LIST_REMOVE(vprocg, vprocg_le); + + mtx_lock_spin(&vcpu_list_mtx); + LIST_REMOVE(vcpu, vcpu_le); + mtx_unlock_spin(&vcpu_list_mtx); + + VNET_LIST_LOCK(); + LIST_REMOVE(vnet, vnet_le); + VNET_LIST_UNLOCK(); + + CURVNET_SET_QUIET(vnet); + INIT_VNET_NET(vnet); + + /* + * Return all inherited interfaces to their parent vnets, + * alternatively attempt to kill cloning ifnets. + */ + TAILQ_FOREACH_SAFE(ifp, &V_ifnet, if_link, nifp) { + if (ifp->if_home_vnet != ifp->if_vnet) + vi_if_move(NULL, ifp, vip); + else + if_clone_destroy(ifp->if_xname); + } + + /* Detach / free per-module state instances. 
*/ + TAILQ_FOREACH_REVERSE(vml, &vnet_modlink_head, + vnet_modlink_head, vml_mod_le) + vnet_mod_destructor(vml); + +#if 0 + free((caddr_t)vnet->ifnet_addrs, M_IFADDR); + free((caddr_t)vnet->ifindex2ifnet, M_IFADDR); +#endif + + CURVNET_RESTORE(); + + /* hopefully, we are finally OK to free the vnet container itself! */ + vnet->vnet_magic_n = 0xdeadbeef; + vi_free(vnet, M_VNET); + vi_free(vprocg, M_VPROCG); + vi_free(vcpu, M_VCPU); + vi_free(vip, M_VIMAGE); + + return (0); +} + +static int vnet_mod_constructor(vml) +struct vnet_modlink *vml; +{ + const struct vnet_modinfo *vmi = vml->vml_modinfo; + +#ifdef DEBUG_ORDERING + printf("instatiating vnet_%s", vmi->vmi_name); + if (vml->vml_iarg) + printf("/%s", vml->vml_iname); + printf(": "); + if (vmi->vmi_struct_size) + printf("malloc(%d); ", vmi->vmi_struct_size); + if (vmi->vmi_iattach != NULL) + printf("iattach()"); + printf("\n"); +#endif + + if (vmi->vmi_struct_size) { + void *mem = vi_malloc(vmi->vmi_struct_size, M_VNET, + M_NOWAIT | M_ZERO); + if (mem == NULL) /* XXX should return error, not panic */ + panic("vi_alloc: malloc for %s\n", vmi->vmi_name); + curvnet->mod_data[vmi->vmi_id] = mem; + } + + if (vmi->vmi_iattach != NULL) + vmi->vmi_iattach(vml->vml_iarg); + + return 0; +} + +static int vnet_mod_destructor(vml) +struct vnet_modlink *vml; +{ + const struct vnet_modinfo *vmi = vml->vml_modinfo; + +#ifdef DEBUG_ORDERING + printf("destroying vnet_%s", vmi->vmi_name); + if (vml->vml_iarg) + printf("/%s", vml->vml_iname); + printf(": "); + if (vmi->vmi_idetach != NULL) + printf("idetach(); "); + if (vmi->vmi_struct_size) + printf("free()"); + printf("\n"); +#endif + + if (vmi->vmi_idetach) + vmi->vmi_idetach(vml->vml_iarg); + + if (vmi->vmi_struct_size) { + if (curvnet->mod_data[vmi->vmi_id] == NULL) + panic("vi_destroy: %s\n", vmi->vmi_name); + vi_free(curvnet->mod_data[vmi->vmi_id], M_VNET); + curvnet->mod_data[vmi->vmi_id] = NULL; + } + + return 0; +} + +static void +vi_init(void *unused) +{ +#ifdef 
VI_PREALLOC_SIZE + struct vi_mtrack *vmt; + + /* Initialize our private memory allocator */ + LIST_INIT(&vi_mem_free_head); + LIST_INIT(&vi_mem_alloc_head); + vi_mtrack_zone = uma_zcreate("vi_mtrack", sizeof(struct vi_mtrack), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + vmt = uma_zalloc(vi_mtrack_zone, M_NOWAIT); + vmt->vmt_addr = vi_mpool; + vmt->vmt_size = VI_PREALLOC_SIZE; + LIST_INSERT_HEAD(&vi_mem_free_head, vmt, vmt_le); +#endif /* VI_PREALLOC_SIZE */ + + /* vnet module list is both forward and reverse traversable */ + TAILQ_INIT(&vnet_modlink_head); + TAILQ_INIT(&vnet_modpending_head); + + LIST_INIT(&vimage_head); + LIST_INIT(&vnet_head); + LIST_INIT(&vprocg_head); + LIST_INIT(&vcpu_head); + + mtx_init(&vnet_list_refc_mtx, "vnet_list_refc_mtx", NULL, MTX_DEF); + cv_init(&vnet_list_condvar, "vnet_list_condvar"); + + mtx_init(&vcpu_list_mtx, "vcpu_list_mtx", NULL, MTX_SPIN); + + vi_alloc(NULL, ""); /* Default vimage has no name */ + + /* We MUST clear curvnet in vi_init_done before going SMP. 
*/ + curvnet = LIST_FIRST(&vnet_head); +} + +static void +vi_init_done(void *unused) +{ + struct vnet_modlink *vml_iter; + + curvnet = NULL; + + if (TAILQ_EMPTY(&vnet_modpending_head)) + return; + + printf("vnet modules with unresolved dependencies:\n"); + TAILQ_FOREACH(vml_iter, &vnet_modpending_head, vml_mod_le) + printf(" %s depending on %d:\n", + vml_iter->vml_modinfo->vmi_name, + vml_iter->vml_modinfo->vmi_dependson); + panic("going nowhere without my vnet modules!"); +} + +SYSINIT(vimage, SI_SUB_VIMAGE, SI_ORDER_FIRST, vi_init, NULL) +SYSINIT(vimage_done, SI_SUB_VIMAGE_DONE, SI_ORDER_FIRST, vi_init_done, NULL) + +#ifdef VI_PREALLOC_SIZE +void * +vi_malloc(unsigned long size, struct malloc_type *type, int flags) +{ + void *addr; + struct vi_mtrack *vmt = NULL; + struct vi_mtrack *vmt_iter; + + /* Attempt to find a free chunk in our private pool */ + LIST_FOREACH(vmt_iter, &vi_mem_free_head, vmt_le) + if (vmt_iter->vmt_size >= size && + (vmt == NULL || vmt_iter->vmt_size < vmt->vmt_size)) { + vmt = vmt_iter; + /* Exact fit is an optimal choice, we are done. 
*/ + if (vmt_iter->vmt_size == size) + break; + } + + /* Not (enough) free space in our pool, resort to malloc() */ + if (vmt == NULL) { + if (vi_mpool_fail_cnt == 0) + printf("vi_mpool exhausted, " + "consider increasing VI_PREALLOC_SIZE\n"); + vi_mpool_fail_cnt++; + addr = malloc(size, type, flags); + return addr; + } + + addr = vmt->vmt_addr; + if (vmt->vmt_size == size) { + /* Move the descriptor from free to allocated list */ + LIST_REMOVE(vmt, vmt_le); + LIST_INSERT_HEAD(&vi_mem_alloc_head, vmt, vmt_le); + } else { + /* Shrink the existing free space block */ + vmt->vmt_addr += size; + vmt->vmt_size -= size; + + /* Create a new descriptor and place it on allocated list. XXX uma_zalloc(M_NOWAIT) may return NULL; the result is not checked here. */ + vmt = uma_zalloc(vi_mtrack_zone, M_NOWAIT); + vmt->vmt_addr = addr; + vmt->vmt_size = size; + LIST_INSERT_HEAD(&vi_mem_alloc_head, vmt, vmt_le); + } + + bzero(addr, size); + return addr; +} + +void +vi_free(void *addr, struct malloc_type *type) +{ + struct vi_mtrack *vmt; + + /* Attempt to find the chunk in our allocated pool */ + LIST_FOREACH(vmt, &vi_mem_alloc_head, vmt_le) + if (vmt->vmt_addr == addr) + break; + + /* Not found in our private pool, resort to free() */ + if (vmt == NULL) { + free(addr, type); + return; + } + + /* Move the descriptor from allocated to free list */ + LIST_REMOVE(vmt, vmt_le); + LIST_INSERT_HEAD(&vi_mem_free_head, vmt, vmt_le); +} +#endif /* VI_PREALLOC_SIZE */ + +#ifdef DDB +static void +db_vnet_ptr(void *arg) +{ + if (arg) + db_printf(" %p", arg); + else + db_printf(" 0"); +} + +DB_SHOW_COMMAND(vnets, db_show_vnets) +{ + db_printf(" vnet ifs socks"); + db_printf(" net inet inet6 ipsec netgraph\n"); + VNET_ITERLOOP_BEGIN_QUIET(); + db_printf("%p %3d %5d", + vnet_iter, vnet_iter->ifccnt, vnet_iter->sockcnt); + db_vnet_ptr(vnet_iter->mod_data[VNET_MOD_NET]); + db_vnet_ptr(vnet_iter->mod_data[VNET_MOD_INET]); + db_vnet_ptr(vnet_iter->mod_data[VNET_MOD_INET6]); + db_vnet_ptr(vnet_iter->mod_data[VNET_MOD_IPSEC]); +
db_vnet_ptr(vnet_iter->mod_data[VNET_MOD_NETGRAPH]); + db_printf("\n"); + VNET_ITERLOOP_END(); +} +#endif --- /u/marko/p4/head/src/sys/kern/kern_xxx.c 2007-08-31 03:47:34.000000000 +0200 +++ src/sys/kern/kern_xxx.c 2007-10-22 18:06:31.000000000 +0200 @@ -33,6 +33,7 @@ __FBSDID("$FreeBSD: src/sys/kern/kern_xxx.c,v 1.49 2007/03/05 13:10:57 rwatson Exp $"); #include "opt_compat.h" +#include "opt_vimage.h" #include #include @@ -44,6 +45,7 @@ #include #include #include +#include #if defined(COMPAT_43) @@ -245,14 +247,15 @@ struct thread *td; struct getdomainname_args *uap; { + INIT_VPROCG(TD_TO_VPROCG(td)); int domainnamelen; int error; mtx_lock(&Giant); - domainnamelen = strlen(domainname) + 1; + domainnamelen = strlen(V_domainname) + 1; if ((u_int)uap->len > domainnamelen) uap->len = domainnamelen; - error = copyout(domainname, uap->domainname, uap->len); + error = copyout(V_domainname, uap->domainname, uap->len); mtx_unlock(&Giant); return (error); } @@ -269,19 +272,20 @@ struct thread *td; struct setdomainname_args *uap; { + INIT_VPROCG(TD_TO_VPROCG(td)); int error, domainnamelen; error = priv_check(td, PRIV_SETDOMAINNAME); if (error) return (error); mtx_lock(&Giant); - if ((u_int)uap->len > sizeof (domainname) - 1) { + if ((u_int)uap->len > sizeof (V_domainname) - 1) { error = EINVAL; goto done2; } domainnamelen = uap->len; - error = copyin(uap->domainname, domainname, uap->len); - domainname[domainnamelen] = 0; + error = copyin(uap->domainname, V_domainname, uap->len); + V_domainname[domainnamelen] = 0; done2: mtx_unlock(&Giant); return (error); --- /u/marko/p4/head/src/sys/kern/sched_4bsd.c 2007-12-27 19:31:57.000000000 +0100 +++ src/sys/kern/sched_4bsd.c 2008-01-14 19:23:38.000000000 +0100 @@ -36,6 +36,7 @@ __FBSDID("$FreeBSD: src/sys/kern/sched_4bsd.c,v 1.112 2007/12/15 23:13:31 jeff Exp $"); #include "opt_hwpmc_hooks.h" +#include "opt_vimage.h" #include #include @@ -52,6 +53,7 @@ #include #include #include +#include
#include #include @@ -102,9 +104,11 @@ ((ts)->ts_runq != 0 && (ts)->ts_runq != &runq) static struct td_sched td_sched0; -struct mtx sched_lock; +static struct mtx sched_lock; +#ifndef VIMAGE static int sched_tdcnt; /* Total runnable threads in the system. */ +#endif static int sched_quantum; /* Roundrobin scheduling quantum in ticks. */ #define SCHED_QUANTUM (hz / 10) /* Default sched quantum */ @@ -227,18 +231,34 @@ #endif static __inline void -sched_load_add(void) +sched_load_add(struct thread *td) { - sched_tdcnt++; - CTR1(KTR_SCHED, "global load: %d", sched_tdcnt); + INIT_VPROCG(TD_TO_VPROCG(td)); + + V_sched_tdcnt++; + CTR1(KTR_SCHED, "global load: %d", V_sched_tdcnt); } static __inline void -sched_load_rem(void) +sched_load_rem(struct thread *td) { - sched_tdcnt--; - CTR1(KTR_SCHED, "global load: %d", sched_tdcnt); + INIT_VPROCG(TD_TO_VPROCG(td)); + + V_sched_tdcnt--; + CTR1(KTR_SCHED, "global load: %d", V_sched_tdcnt); } + +#ifdef VIMAGE +void +sched_load_reassign(struct vprocg *old, struct vprocg *new) +{ + mtx_lock_spin(&sched_lock); + old->_sched_tdcnt--; + new->_sched_tdcnt++; + mtx_unlock_spin(&sched_lock); +} +#endif + /* * Arrange to reschedule if necessary, taking the priorities and * schedulers into account. @@ -346,16 +366,26 @@ static void schedcpu(void) { - register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]); + register fixpt_t loadfac; struct thread *td; struct proc *p; struct td_sched *ts; int awake, realstathz; +#ifndef VIMAGE + loadfac = loadfactor(averunnable.ldavg[0]); +#endif realstathz = stathz ? 
stathz : hz; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_SLOCK(p); +#ifdef VIMAGE + if (p->p_ucred != NULL) { + INIT_VPROCG(P_TO_VPROCG(p)); + loadfac = loadfactor(V_averunnable.ldavg[0]); + } else + loadfac = 0; +#endif FOREACH_THREAD_IN_PROC(p, td) { awake = 0; thread_lock(td); @@ -462,12 +492,13 @@ static void updatepri(struct thread *td) { + INIT_VPROCG(TD_TO_VPROCG(td)); struct td_sched *ts; fixpt_t loadfac; unsigned int newcpu; ts = td->td_sched; - loadfac = loadfactor(averunnable.ldavg[0]); + loadfac = loadfactor(V_averunnable.ldavg[0]); if (ts->ts_slptime > 5 * loadfac) td->td_estcpu = 0; else { @@ -528,7 +559,7 @@ hogticks = 2 * sched_quantum; /* Account for thread0. */ - sched_load_add(); + sched_load_add(&thread0); } /* External interfaces start here */ @@ -631,7 +662,7 @@ thread_unlock(td); mtx_lock_spin(&sched_lock); if ((child->td_proc->p_flag & P_NOLOAD) == 0) - sched_load_rem(); + sched_load_rem(td); mtx_unlock_spin(&sched_lock); } @@ -825,7 +856,7 @@ } if ((p->p_flag & P_NOLOAD) == 0) - sched_load_rem(); + sched_load_rem(td); if (newtd) newtd->td_flags |= (td->td_flags & TDF_NEEDRESCHED); @@ -867,7 +898,7 @@ newtd->td_sched->ts_flags |= TSF_DIDRUN; TD_SET_RUNNING(newtd); if ((newtd->td_proc->p_flag & P_NOLOAD) == 0) - sched_load_add(); + sched_load_add(newtd); } else { newtd = choosethread(); } @@ -1124,7 +1155,7 @@ } if ((td->td_proc->p_flag & P_NOLOAD) == 0) - sched_load_add(); + sched_load_add(td); runq_add(ts->ts_runq, ts, flags); } #else /* SMP */ @@ -1169,7 +1200,7 @@ return; } if ((td->td_proc->p_flag & P_NOLOAD) == 0) - sched_load_add(); + sched_load_add(td); runq_add(ts->ts_runq, ts, flags); maybe_resched(td); } @@ -1191,7 +1222,7 @@ curthread->td_name); if ((td->td_proc->p_flag & P_NOLOAD) == 0) - sched_load_rem(); + sched_load_rem(td); runq_remove(ts->ts_runq, ts); TD_SET_CAN_RUN(td); } @@ -1308,9 +1339,13 @@ } int +#ifdef VIMAGE +sched_load(struct vprocg *vprocg) +#else sched_load(void) +#endif { - return (sched_tdcnt); + 
return (V_sched_tdcnt); } int --- /u/marko/p4/head/src/sys/kern/sched_ule.c 2008-01-28 23:53:49.000000000 +0100 +++ src/sys/kern/sched_ule.c 2008-02-27 11:47:28.000000000 +0100 @@ -40,6 +40,7 @@ #include "opt_hwpmc_hooks.h" #include "opt_sched.h" +#include "opt_vimage.h" #include #include @@ -59,6 +60,7 @@ #include #include #include +#include #ifdef KTRACE #include #include @@ -286,8 +288,13 @@ /* Operations on per processor queues */ static struct td_sched * tdq_choose(struct tdq *); static void tdq_setup(struct tdq *); +#ifndef VIMAGE static void tdq_load_add(struct tdq *, struct td_sched *); static void tdq_load_rem(struct tdq *, struct td_sched *); +#else +static void tdq_load_add(struct tdq *, struct td_sched *, struct vprocg *); +static void tdq_load_rem(struct tdq *, struct td_sched *, struct vprocg *); +#endif static __inline void tdq_runq_add(struct tdq *, struct td_sched *, int); static __inline void tdq_runq_rem(struct tdq *, struct td_sched *); void tdq_print(int cpu); @@ -459,7 +466,11 @@ * for this thread to the referenced thread queue. */ static void +#ifndef VIMAGE tdq_load_add(struct tdq *tdq, struct td_sched *ts) +#else +tdq_load_add(struct tdq *tdq, struct td_sched *ts, struct vprocg *vprocg) +#endif { int class; @@ -469,12 +480,19 @@ tdq->tdq_load++; CTR2(KTR_SCHED, "cpu %d load: %d", TDQ_ID(tdq), tdq->tdq_load); if (class != PRI_ITHD && - (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0) + (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0) { #ifdef SMP tdq->tdq_group->tdg_load++; -#else +#ifdef VIMAGE + V_tdq_sysload[TDG_ID(tdq->tdq_group)]++; +#endif +#else /* !SMP */ tdq->tdq_sysload++; +#ifdef VIMAGE + V_tdq_sysload[0]++; #endif +#endif /* SMP */ + } } /* @@ -482,7 +500,11 @@ * exiting. 
*/ static void +#ifndef VIMAGE tdq_load_rem(struct tdq *tdq, struct td_sched *ts) +#else +tdq_load_rem(struct tdq *tdq, struct td_sched *ts, struct vprocg *vprocg) +#endif { int class; @@ -490,12 +512,19 @@ TDQ_LOCK_ASSERT(tdq, MA_OWNED); class = PRI_BASE(ts->ts_thread->td_pri_class); if (class != PRI_ITHD && - (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0) + (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0) { #ifdef SMP tdq->tdq_group->tdg_load--; -#else +#ifdef VIMAGE + V_tdq_sysload[TDG_ID(tdq->tdq_group)]--; +#endif +#else /* !SMP */ tdq->tdq_sysload--; +#ifdef VIMAGE + V_tdq_sysload[0]--; #endif +#endif /* SMP */ + } KASSERT(tdq->tdq_load != 0, ("tdq_load_rem: Removing with 0 load on queue %d", TDQ_ID(tdq))); tdq->tdq_load--; @@ -1330,7 +1359,11 @@ /* Add thread0's load since it's running. */ TDQ_LOCK(tdq); thread0.td_lock = TDQ_LOCKPTR(TDQ_SELF()); +#ifndef VIMAGE tdq_load_add(tdq, &td_sched0); +#else + tdq_load_add(tdq, &td_sched0, TD_TO_VPROCG(&thread0)); +#endif TDQ_UNLOCK(tdq); } @@ -1859,7 +1892,11 @@ TD_SET_CAN_RUN(td); } else if (TD_IS_RUNNING(td)) { MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); +#ifndef VIMAGE tdq_load_rem(tdq, ts); +#else + tdq_load_rem(tdq, ts, TD_TO_VPROCG(td)); +#endif srqflag = (flags & SW_PREEMPT) ? SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED : SRQ_OURSELF|SRQ_YIELDING; @@ -1871,7 +1908,11 @@ /* This thread must be going to sleep. 
*/ TDQ_LOCK(tdq); mtx = thread_block_switch(td); +#ifndef VIMAGE tdq_load_rem(tdq, ts); +#else + tdq_load_rem(tdq, ts, TD_TO_VPROCG(td)); +#endif } /* * We enter here with the thread blocked and assigned to the @@ -2379,7 +2420,11 @@ tdq->tdq_lowpri = td->td_priority; #endif tdq_runq_add(tdq, ts, flags); +#ifndef VIMAGE tdq_load_add(tdq, ts); +#else + tdq_load_add(tdq, ts, TD_TO_VPROCG(td)); +#endif } /* @@ -2460,7 +2505,11 @@ KASSERT(TD_ON_RUNQ(td), ("sched_rem: thread not on run queue")); tdq_runq_rem(tdq, ts); +#ifndef VIMAGE tdq_load_rem(tdq, ts); +#else + tdq_load_rem(tdq, ts, TD_TO_VPROCG(td)); +#endif TD_SET_CAN_RUN(td); } @@ -2556,7 +2605,11 @@ * Return the total system load. */ int +#ifdef VIMAGE +sched_load(struct vprocg *vprocg) +#else sched_load(void) +#endif { #ifdef SMP int total; @@ -2564,13 +2617,40 @@ total = 0; for (i = 0; i <= tdg_maxid; i++) +#ifndef VIMAGE total += TDQ_GROUP(i)->tdg_load; - return (total); #else + total += V_tdq_sysload[i]; +#endif + return (total); +#else /* !SMP */ +#ifndef VIMAGE return (TDQ_SELF()->tdq_sysload); +#else + return (V_tdq_sysload[0]); #endif +#endif /* SMP */ } +#ifdef VIMAGE +void +sched_load_reassign(struct vprocg *old, struct vprocg *new) +{ +#ifdef SMP + int tdg_id; + + critical_enter(); + tdg_id = TDG_ID(tdq_cpu[curcpu].tdq_group); + old->_tdq_sysload[tdg_id]--; + new->_tdq_sysload[tdg_id]++; + critical_exit(); +#else + old->_tdq_sysload[0]--; + new->_tdq_sysload[0]++; +#endif +} +#endif + int sched_sizeof_proc(void) { @@ -2622,7 +2702,11 @@ spinlock_exit(); } else { MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); +#ifndef VIMAGE tdq_load_rem(tdq, td->td_sched); +#else + tdq_load_rem(tdq, td->td_sched, TD_TO_VPROCG(td)); +#endif lock_profile_release_lock(&TDQ_LOCKPTR(tdq)->lock_object); } KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count")); --- /u/marko/p4/head/src/sys/kern/tty.c 2008-01-15 18:00:10.000000000 +0100 +++ src/sys/kern/tty.c 2008-02-27 11:47:51.000000000 +0100 @@ -75,6 +75,7 @@ 
#include "opt_compat.h" #include "opt_tty.h" +#include "opt_vimage.h" #include #include @@ -104,6 +105,7 @@ #include #include #include +#include #include @@ -1141,6 +1143,7 @@ if (t == tp->t_line) return (0); s = spltty(); + CURVNET_SET(TD_TO_VNET(curthread)); ttyld_close(tp, flag); tp->t_line = t; /* XXX: we should use the correct cdev here */ @@ -1156,6 +1159,7 @@ tp->t_line = TTYDISC; (void)ttyld_open(tp, tp->t_dev); } + CURVNET_RESTORE(); splx(s); return (error); break; @@ -2530,6 +2534,7 @@ void ttyinfo(struct tty *tp) { + INIT_VPROCG(TD_TO_VPROCG(curthread)); struct timeval utime, stime; struct proc *p, *pick; struct thread *td, *picktd; @@ -2544,7 +2549,7 @@ return; /* Print load average. */ - load = (averunnable.ldavg[0] * 100 + FSCALE / 2) >> FSHIFT; + load = (V_averunnable.ldavg[0] * 100 + FSCALE / 2) >> FSHIFT; ttyprintf(tp, "load: %d.%02d ", load / 100, load % 100); /* @@ -3215,7 +3220,9 @@ goto out; goto open_top; } + CURVNET_SET(TD_TO_VNET(curthread)); error = ttyld_open(tp, dev); + CURVNET_RESTORE(); ttyldoptim(tp); if (tp->t_state & TS_ISOPEN && ISCALLOUT(dev)) tp->t_actout = TRUE; @@ -3232,7 +3239,9 @@ struct tty *tp; tp = dev->si_tty; + CURVNET_SET(TD_TO_VNET(curthread)); ttyld_close(tp, flag); + CURVNET_RESTORE(); ttyldoptim(tp); tt_close(tp); tp->t_do_timestamp = 0; --- /u/marko/p4/head/src/sys/kern/subr_pcpu.c 2007-11-14 19:35:22.000000000 +0100 +++ src/sys/kern/subr_pcpu.c 2007-12-10 11:26:05.000000000 +0100 @@ -46,6 +46,7 @@ __FBSDID("$FreeBSD: src/sys/kern/subr_pcpu.c,v 1.10 2007/11/14 06:21:23 julian Exp $"); #include "opt_ddb.h" +#include "opt_vimage.h" #include #include @@ -132,6 +133,10 @@ db_printf("none\n"); db_show_mdpcpu(pc); +#ifdef VIMAGE + db_printf("curvnet = %p\n", pc->pc_curthread->td_vnet); +#endif + #ifdef WITNESS db_printf("spin locks held:\n"); witness_list_locks(&pc->pc_spinlocks); --- /u/marko/p4/head/src/sys/kern/sys_socket.c 2008-01-15 18:00:10.000000000 +0100 +++ src/sys/kern/sys_socket.c 2008-02-27 11:47:39.000000000 
+0100 @@ -33,6 +33,7 @@ __FBSDID("$FreeBSD: src/sys/kern/sys_socket.c,v 1.75 2008/01/07 20:05:18 jhb Exp $"); #include "opt_mac.h" +#include "opt_vimage.h" #include #include @@ -50,7 +51,9 @@ #include #include #include +#include +#include #include #include @@ -74,16 +77,19 @@ int flags, struct thread *td) { struct socket *so = fp->f_data; -#ifdef MAC int error; +#ifdef MAC SOCK_LOCK(so); error = mac_socket_check_receive(active_cred, so); SOCK_UNLOCK(so); if (error) return (error); #endif - return (soreceive(so, 0, uio, 0, 0, 0)); + CURVNET_SET(so->so_vnet); + error = soreceive(so, 0, uio, 0, 0, 0); + CURVNET_RESTORE(); + return (error); } /* ARGSUSED */ @@ -125,6 +131,7 @@ struct socket *so = fp->f_data; int error = 0; + CURVNET_SET(so->so_vnet); switch (cmd) { case FIONBIO: SOCK_LOCK(so); @@ -205,6 +212,7 @@ (so, cmd, data, 0, td)); break; } + CURVNET_RESTORE(); return (error); } @@ -279,7 +287,8 @@ fp->f_ops = &badfileops; fp->f_data = NULL; - if (so) + if (so) { error = soclose(so); + } return (error); } --- /u/marko/p4/head/src/sys/kern/uipc_domain.c 2007-08-31 03:47:38.000000000 +0200 +++ src/sys/kern/uipc_domain.c 2007-10-22 18:06:33.000000000 +0200 @@ -29,6 +29,8 @@ * @(#)uipc_domain.c 8.2 (Berkeley) 10/18/93 */ +#include "opt_vimage.h" + #include __FBSDID("$FreeBSD: src/sys/kern/uipc_domain.c,v 1.51 2007/08/06 14:26:00 rwatson Exp $"); @@ -43,6 +45,7 @@ #include #include #include +#include #include /* @@ -64,6 +67,11 @@ SYSINIT(domainfin, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST, domainfinalize, NULL) +static vnet_attach_fn net_init_domain; +#ifdef VIMAGE +static vnet_detach_fn net_detach_domain; +#endif + static struct callout pffast_callout; static struct callout pfslow_callout; @@ -100,6 +108,9 @@ .pru_sopoll = pru_sopoll_notsupp, }; +VNET_MOD_DECLARE_STATELESS(DOMAIN, domain, net_init_domain, net_detach_domain, + NET) + static void protosw_init(struct protosw *pr) { @@ -128,13 +139,12 @@ } /* - * Add a new protocol domain to the list of supported 
domains - * Note: you cant unload it again because a socket may be using it. - * XXX can't fail at this time. + * Initialize a domain instance. */ -static void -net_init_domain(struct domain *dp) +static int +net_init_domain(const void *arg) { + const struct domain *dp = arg; struct protosw *pr; if (dp->dom_init) @@ -148,8 +158,29 @@ max_datalen = MHLEN - max_hdr; if (max_datalen < 1) panic("%s: max_datalen < 1", __func__); + return 0; } +#ifdef VIMAGE +/* + * Detach / free a domain instance. + */ +static int +net_detach_domain(const void *arg) +{ + const struct domain *dp = arg; + struct protosw *pr; + + if (dp->dom_destroy) + (*dp->dom_destroy)(); + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) + if (pr->pr_destroy) + (*pr->pr_destroy)(); + + return 0; +} +#endif + /* * Add a new protocol domain to the list of supported domains * Note: you cant unload it again because a socket may be using it. @@ -183,7 +214,11 @@ "domainfinalize()\n", dp->dom_name); #endif mtx_unlock(&dom_mtx); +#ifdef VIMAGE + vnet_mod_register_multi(&vnet_domain_modinfo, dp, dp->dom_name); +#else net_init_domain(dp); +#endif } static void --- /u/marko/p4/head/src/sys/kern/uipc_socket.c 2008-02-27 18:28:53.000000000 +0100 +++ src/sys/kern/uipc_socket.c 2008-02-27 17:58:38.000000000 +0100 @@ -101,6 +101,7 @@ #include "opt_mac.h" #include "opt_zero.h" #include "opt_compat.h" +#include "opt_vimage.h" #include #include @@ -128,6 +129,9 @@ #include #include #include +#include + +#include #include @@ -259,7 +263,7 @@ * soalloc() returns a socket with a ref count of 0. */ static struct socket * -soalloc(void) +soalloc(struct vnet *vnet) { struct socket *so; @@ -280,6 +284,10 @@ mtx_lock(&so_global_mtx); so->so_gencnt = ++so_gencnt; ++numopensockets; +#ifdef VIMAGE + so->so_vnet = vnet; + vnet->sockcnt++; +#endif mtx_unlock(&so_global_mtx); return (so); } @@ -299,6 +307,9 @@ mtx_lock(&so_global_mtx); so->so_gencnt = ++so_gencnt; --numopensockets; /* Could be below, but faster here. 
*/ +#ifdef VIMAGE + so->so_vnet->sockcnt--; +#endif mtx_unlock(&so_global_mtx); if (so->so_rcv.sb_hiwat) (void)chgsbsize(so->so_cred->cr_uidinfo, @@ -352,7 +363,11 @@ if (prp->pr_type != type) return (EPROTOTYPE); - so = soalloc(); +#ifdef VIMAGE + so = soalloc(TD_TO_VNET(td)); +#else + so = soalloc(NULL); +#endif if (so == NULL) return (ENOBUFS); @@ -373,7 +388,9 @@ * Auto-sizing of socket buffers is managed by the protocols and * the appropriate flags must be set in the pru_attach function. */ + CURVNET_SET(so->so_vnet); error = (*prp->pr_usrreqs->pru_attach)(so, proto, td); + CURVNET_RESTORE(); if (error) { KASSERT(so->so_count == 1, ("socreate: so_count %d", so->so_count)); @@ -415,7 +432,12 @@ if (over) #endif return (NULL); - so = soalloc(); +#ifdef VIMAGE + VNET_ASSERT(head->so_vnet); + so = soalloc(head->so_vnet); +#else + so = soalloc(NULL); +#endif if (so == NULL) return (NULL); if ((head->so_options & SO_ACCEPTFILTER) != 0) @@ -487,8 +509,12 @@ int sobind(struct socket *so, struct sockaddr *nam, struct thread *td) { + int error; - return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td)); + CURVNET_SET(so->so_vnet); + error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td); + CURVNET_RESTORE(); + return error; } /* @@ -636,6 +662,7 @@ KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter")); + CURVNET_SET(so->so_vnet); funsetown(&so->so_sigio); if (so->so_state & SS_ISCONNECTED) { if ((so->so_state & SS_ISDISCONNECTING) == 0) { @@ -687,6 +714,7 @@ KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF")); so->so_state |= SS_NOFDREF; sorele(so); + CURVNET_RESTORE(); return (error); } @@ -762,7 +790,9 @@ * biting us. 
*/ so->so_error = 0; + CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td); + CURVNET_RESTORE(); } return (error); @@ -1278,13 +1308,17 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { + int error; /* XXXRW: Temporary debugging. */ KASSERT(so->so_proto->pr_usrreqs->pru_sosend != sosend, ("sosend: protocol calls sosend")); - return (so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top, - control, flags, td)); + CURVNET_SET(so->so_vnet); + error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top, + control, flags, td); + CURVNET_RESTORE(); + return (error); } /* @@ -1864,8 +1898,13 @@ if (how != SHUT_WR) sorflush(so); - if (how != SHUT_RD) - return ((*pr->pr_usrreqs->pru_shutdown)(so)); + if (how != SHUT_RD) { + int error; + CURVNET_SET(so->so_vnet); + error = (*pr->pr_usrreqs->pru_shutdown)(so); + CURVNET_RESTORE(); + return (error); + } return (0); } @@ -1889,6 +1928,7 @@ * socket buffer. Don't let our acquire be interrupted by a signal * despite any existing socket disposition on interruptable waiting. 
*/ + CURVNET_SET(so->so_vnet); socantrcvmore(so); (void) sblock(sb, SBL_WAIT | SBL_NOINTR); @@ -1912,6 +1952,7 @@ if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) (*pr->pr_domain->dom_dispose)(asb.sb_mb); sbrelease_internal(&asb, so); + CURVNET_RESTORE(); } /* @@ -1978,8 +2019,7 @@ error = 0; if (sopt->sopt_level != SOL_SOCKET) { if (so->so_proto && so->so_proto->pr_ctloutput) - return ((*so->so_proto->pr_ctloutput) - (so, sopt)); + return ((*so->so_proto->pr_ctloutput) (so, sopt)); error = ENOPROTOOPT; } else { switch (sopt->sopt_name) { --- /u/marko/p4/head/src/sys/kern/uipc_syscalls.c 2008-02-27 18:28:54.000000000 +0100 +++ src/sys/kern/uipc_syscalls.c 2008-02-27 11:48:02.000000000 +0100 @@ -39,6 +39,7 @@ #include "opt_compat.h" #include "opt_ktrace.h" #include "opt_mac.h" +#include "opt_vimage.h" #include #include @@ -64,6 +65,7 @@ #include #include #include +#include #ifdef KTRACE #include #endif @@ -264,7 +266,9 @@ if (error) goto done; #endif + CURVNET_SET(so->so_vnet); error = solisten(so, uap->backlog, td); + CURVNET_RESTORE(); #ifdef MAC done: #endif @@ -429,7 +433,9 @@ tmp = fflag & FASYNC; (void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td); sa = 0; + CURVNET_SET(so->so_vnet); error = soaccept(so, &sa); + CURVNET_RESTORE(); if (error) { /* * return a namelen of zero for older code which might @@ -977,9 +983,11 @@ ktruio = cloneuio(&auio); #endif len = auio.uio_resid; + CURVNET_SET(so->so_vnet); error = soreceive(so, &fromsa, &auio, (struct mbuf **)0, (mp->msg_control || controlp) ? 
&control : (struct mbuf **)0, &mp->msg_flags); + CURVNET_RESTORE(); if (error) { if (auio.uio_resid != (int)len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) @@ -1323,7 +1331,9 @@ error = getsock(td->td_proc->p_fd, s, &fp, NULL); if (error == 0) { so = fp->f_data; + CURVNET_SET(so->so_vnet); error = sosetopt(so, &sopt); + CURVNET_RESTORE(); fdrop(fp, td); } return(error); @@ -1401,7 +1411,9 @@ error = getsock(td->td_proc->p_fd, s, &fp, NULL); if (error == 0) { so = fp->f_data; + CURVNET_SET(so->so_vnet); error = sogetopt(so, &sopt); + CURVNET_RESTORE(); *valsize = sopt.sopt_valsize; fdrop(fp, td); } @@ -1464,7 +1476,9 @@ return (error); so = fp->f_data; *sa = NULL; + CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa); + CURVNET_RESTORE(); if (error) goto bad; if (*sa == NULL) @@ -1564,8 +1578,11 @@ error = ENOTCONN; goto done; } + *sa = NULL; + CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa); + CURVNET_RESTORE(); if (error) goto bad; if (*sa == NULL) @@ -2184,9 +2201,11 @@ goto done; } SOCKBUF_UNLOCK(&so->so_snd); + CURVNET_SET(so->so_vnet); /* Avoid error aliasing. 
*/ err = (*so->so_proto->pr_usrreqs->pru_send) (so, 0, m, NULL, NULL, td); + CURVNET_RESTORE(); if (err == 0) { /* * We need two counters to get the --- /u/marko/p4/head/src/sys/kern/uipc_usrreq.c 2008-01-28 23:53:50.000000000 +0100 +++ src/sys/kern/uipc_usrreq.c 2008-02-27 11:48:03.000000000 +0100 @@ -60,6 +60,7 @@ #include "opt_ddb.h" #include "opt_mac.h" +#include "opt_vimage.h" #include #include @@ -90,6 +91,7 @@ #include #include #include +#include #ifdef DDB #include @@ -1648,6 +1650,10 @@ unp_init(void) { +#ifdef VIMAGE + if (!IS_DEFAULT_VNET(curvnet)) + return; +#endif unp_zone = uma_zcreate("unpcb", sizeof(struct unpcb), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); if (unp_zone == NULL) --- /u/marko/p4/head/src/sys/kern/vfs_export.c 2007-08-31 03:47:39.000000000 +0200 +++ src/sys/kern/vfs_export.c 2007-10-22 18:06:34.000000000 +0200 @@ -37,6 +37,8 @@ #include __FBSDID("$FreeBSD: src/sys/kern/vfs_export.c,v 1.341 2007/02/15 22:08:35 pjd Exp $"); +#include "opt_vimage.h" + #include #include #include @@ -50,6 +52,7 @@ #include #include #include +#include #include @@ -135,6 +138,7 @@ } #endif + CURVNET_SET(TD_TO_VNET(curthread)); /* XXX MARKO */ i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen; np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK | M_ZERO); saddr = (struct sockaddr *) (np + 1); @@ -191,8 +195,10 @@ bcopy(argp->ex_anon.cr_groups, np->netc_anon.cr_groups, sizeof(np->netc_anon.cr_groups)); refcount_init(&np->netc_anon.cr_ref, 1); + CURVNET_RESTORE(); return (0); out: + CURVNET_RESTORE(); free(np, M_NETADDR); return (error); } --- /u/marko/p4/head/src/sys/kern/vfs_lookup.c 2008-02-27 18:28:54.000000000 +0100 +++ src/sys/kern/vfs_lookup.c 2008-02-27 11:48:13.000000000 +0100 @@ -40,6 +40,7 @@ #include "opt_ktrace.h" #include "opt_mac.h" #include "opt_vfs.h" +#include "opt_vimage.h" #include #include @@ -53,6 +54,7 @@ #include #include #include +#include #ifdef KTRACE #include #endif @@ -65,6 +67,15 @@ #define NAMEI_DIAGNOSTIC 1 #undef 
NAMEI_DIAGNOSTIC +#ifdef VIMAGE +#define IMUNES_SYMLINK_HACK +#endif + +#ifdef IMUNES_SYMLINK_HACK +SYSCTL_V_INT(V_PROCG, vprocg, _vfs, OID_AUTO, morphing_symlinks, CTLFLAG_RW, + morphing_symlinks, 0, "Resolve @ to vimage name in symlinks"); +#endif + /* * Allocation zone for namei */ @@ -129,6 +140,9 @@ struct thread *td = cnp->cn_thread; struct proc *p = td->td_proc; int vfslocked; +#ifdef IMUNES_SYMLINK_HACK + INIT_VPROCG(TD_TO_VPROCG(td)); +#endif KASSERT((cnp->cn_flags & MPSAFE) != 0 || mtx_owned(&Giant) != 0, ("NOT MPSAFE and Giant not held")); @@ -284,6 +298,25 @@ error = ENOENT; break; } +#ifdef IMUNES_SYMLINK_HACK + if (V_morphing_symlinks) { + char *sp = strchr(cp, '@'); + int vnamelen = strlen(TD_TO_VIMAGE(td)->vi_name); + + if (sp) { + if (vnamelen >= auio.uio_resid) { + if (ndp->ni_pathlen > 1) + uma_zfree(namei_zone, cp); + error = ENAMETOOLONG; + break; + } + bcopy(sp + 1, sp + vnamelen, + linklen - (sp - cp)); + bcopy(TD_TO_VIMAGE(td)->vi_name, sp, vnamelen); + linklen += (vnamelen - 1); + } + } +#endif if (linklen + ndp->ni_pathlen >= MAXPATHLEN) { if (ndp->ni_pathlen > 1) uma_zfree(namei_zone, cp); --- /u/marko/p4/head/src/sys/modules/Makefile 2008-02-27 18:28:58.000000000 +0100 +++ src/sys/modules/Makefile 2008-02-27 11:48:22.000000000 +0100 @@ -428,9 +428,6 @@ _tmpfs= tmpfs _wi= wi _xe= xe -.if ${MK_ZFS} != "no" || defined(ALL_MODULES) -_zfs= zfs -.endif .if ${MACHINE} == "i386" _aac= aac _acpi= acpi --- /u/marko/p4/head/src/sys/modules/netgraph/Makefile 2007-08-31 03:47:44.000000000 +0200 +++ src/sys/modules/netgraph/Makefile 2007-10-22 18:06:35.000000000 +0200 @@ -34,6 +34,7 @@ netflow \ netgraph \ one2many \ + pipe \ ppp \ pppoe \ pptpgre \ @@ -51,7 +52,8 @@ tty \ UI \ vjc \ - vlan + vlan \ + ${_wormhole} .if ${MACHINE_ARCH} == "i386" _sync_ar= sync_ar @@ -66,4 +68,9 @@ _mppc= mppc .endif +VIMAGE!= grep VIMAGE ${KERNBUILDDIR}/opt_vimage.h | cut -d" " -f3 || true +.if ${VIMAGE} == 1 +_wormhole= wormhole +.endif + .include --- /dev/null 
2008-02-27 21:11:00.000000000 +0100 +++ src/sys/modules/netgraph/pipe/Makefile 2007-10-05 12:26:44.000000000 +0200 @@ -0,0 +1,6 @@ +# $FreeBSD: $ + +KMOD= ng_pipe +SRCS= ng_pipe.c + +.include --- /dev/null 2008-02-27 21:11:00.000000000 +0100 +++ src/sys/modules/netgraph/wormhole/Makefile 2007-10-22 18:06:35.000000000 +0200 @@ -0,0 +1,6 @@ +# $FreeBSD: $ + +KMOD= ng_wormhole +SRCS= ng_wormhole.c opt_vimage.h + +.include --- /u/marko/p4/head/src/sys/net/bpf.c 2008-02-03 08:16:01.000000000 +0100 +++ src/sys/net/bpf.c 2008-02-27 11:48:34.000000000 +0100 @@ -40,6 +40,7 @@ #include "opt_bpf.h" #include "opt_mac.h" #include "opt_netgraph.h" +#include "opt_vimage.h" #include #include @@ -61,9 +62,11 @@ #include #include #include +#include #include +#include #include #include #ifdef BPF_JITTER @@ -444,8 +447,11 @@ BPFD_UNLOCK(d); funsetown(&d->bd_sigio); mtx_lock(&bpf_mtx); - if (d->bd_bif) + if (d->bd_bif) { + CURVNET_SET(d->bd_bif->bif_ifp->if_vnet); bpf_detachd(d); + CURVNET_RESTORE(); + } mtx_unlock(&bpf_mtx); selwakeuppri(&d->bd_sel, PRINET); #ifdef MAC @@ -666,7 +672,9 @@ BPFD_UNLOCK(d); #endif + CURVNET_SET(ifp->if_vnet); error = (*ifp->if_output)(ifp, m, &dst, NULL); + CURVNET_RESTORE(); if (mc != NULL) { if (error == 0) @@ -763,6 +771,7 @@ return (EPERM); } } + CURVNET_SET(TD_TO_VNET(td)); switch (cmd) { default: @@ -1056,6 +1065,7 @@ *(u_int *)addr = d->bd_sig; break; } + CURVNET_RESTORE(); return (error); } @@ -1150,9 +1160,33 @@ struct bpf_if *bp; struct ifnet *theywant; +#define IMUNES_BPF_HACK +#if defined(VIMAGE) && defined(IMUNES_BPF_HACK) + struct vnet *target_vnet = curvnet; + char *c; + + /* Hack to support tapping in foreign vnets */ + c = rindex(ifr->ifr_name, '@'); + if ( c != NULL ) { +printf("bpf_setif: %s\n", c); + struct vimage *target_vimage; + + *c++ = 0; + target_vimage = vimage_by_name(TD_TO_VIMAGE(curthread), c); + if (target_vimage == NULL) + return ENXIO; + target_vnet = target_vimage->v_net; + } + CURVNET_SET_QUIET(target_vnet); +#endif + 
theywant = ifunit(ifr->ifr_name); - if (theywant == NULL || theywant->if_bpf == NULL) + if (theywant == NULL || theywant->if_bpf == NULL) { +#if defined(VIMAGE) && defined(IMUNES_BPF_HACK) + CURVNET_RESTORE(); +#endif return (ENXIO); + } bp = theywant->if_bpf; /* @@ -1174,6 +1208,9 @@ BPFD_LOCK(d); reset_d(d); BPFD_UNLOCK(d); +#if defined(VIMAGE) && defined(IMUNES_BPF_HACK) + CURVNET_RESTORE(); +#endif return (0); } --- /u/marko/p4/head/src/sys/net/if.c 2007-10-29 17:17:42.000000000 +0100 +++ src/sys/net/if.c 2007-12-10 11:26:08.000000000 +0100 @@ -35,6 +35,7 @@ #include "opt_inet.h" #include "opt_mac.h" #include "opt_carp.h" +#include "opt_vimage.h" #include #include @@ -56,8 +57,11 @@ #include #include #include +#include + #include +#include #include #include #include @@ -110,7 +114,6 @@ static void if_purgemaddrs(struct ifnet *); static int ifconf(u_long, caddr_t); static void if_freemulti(struct ifmultiaddr *); -static void if_grow(void); static void if_init(void *); static void if_check(void *); static void if_qflush(struct ifaltq *); @@ -134,17 +137,24 @@ extern void nd6_setmtu(struct ifnet *); #endif -int if_index = 0; -struct ifindex_entry *ifindex_table = NULL; +static int vnet_net_iattach(const void *); +#ifdef VIMAGE +static int vnet_net_idetach(const void *); +#endif + int ifqmaxlen = IFQ_MAXLEN; -struct ifnethead ifnet; /* depend on static init XXX */ -struct ifgrouphead ifg_head; struct mtx ifnet_lock; static if_com_alloc_t *if_com_alloc[256]; static if_com_free_t *if_com_free[256]; +#ifndef VIMAGE +int if_index = 0; +struct ifindex_entry *ifindex_table = NULL; +struct ifnethead ifnet; /* depend on static init XXX */ +struct ifgrouphead ifg_head; -static int if_indexlim = 8; +static int if_indexlim; static struct knlist ifklist; +#endif /* !VIMAGE */ static void filt_netdetach(struct knote *kn); static int filt_netdev(struct knote *kn, long hint); @@ -152,6 +162,19 @@ static struct filterops netdev_filtops = { 1, NULL, filt_netdetach, filt_netdev }; 
+#ifdef VIMAGE +static struct vnet_symmap vnet_net_symmap[] = { + VNET_SYMMAP(net, ifnet), + VNET_SYMMAP(net, rt_tables), + VNET_SYMMAP(net, rtstat), + VNET_SYMMAP(net, rttrash), + VNET_SYMMAP_END +}; +#endif + +VNET_MOD_DECLARE(NET, net, vnet_net_iattach, vnet_net_idetach, + NONE, vnet_net_symmap) + /* * System initialization */ @@ -192,6 +215,7 @@ static int netioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, struct thread *td) { + INIT_VNET_NET(curvnet); struct ifnet *ifp; int error, idx; @@ -225,6 +249,7 @@ static int netkqfilter(struct cdev *dev, struct knote *kn) { + INIT_VNET_NET(curvnet); struct knlist *klist; struct ifnet *ifp; int idx; @@ -239,7 +264,7 @@ idx = minor(dev); if (idx == 0) { - klist = &ifklist; + klist = &V_ifklist; } else { ifp = ifnet_byindex(idx); if (ifp == NULL) @@ -294,43 +319,91 @@ static void if_init(void *dummy __unused) { - +#ifdef VIMAGE + vnet_mod_register(&vnet_net_modinfo); +#else + vnet_net_iattach(NULL); +#endif IFNET_LOCK_INIT(); - TAILQ_INIT(&ifnet); - TAILQ_INIT(&ifg_head); - knlist_init(&ifklist, NULL, NULL, NULL, NULL); - if_grow(); /* create initial table */ +#ifdef VIMAGE + if (IS_DEFAULT_VNET(curvnet)) { + INIT_VNET_NET(curvnet); +#endif ifdev_byindex(0) = make_dev(&net_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "network"); +#ifdef VIMAGE + } +#endif if_clone_init(); } -static void +static int +vnet_net_iattach(unused) + const void *unused; +{ + INIT_VNET_NET(curvnet); + + TAILQ_INIT(&V_ifnet); + TAILQ_INIT(&V_ifg_head); + knlist_init(&V_ifklist, NULL, NULL, NULL, NULL); + V_if_indexlim = 8; + if_grow(); /* create initial table */ + + return 0; +} + +#ifdef VIMAGE +static int +vnet_net_idetach(unused) + const void *unused; +{ + INIT_VNET_NET(curvnet); + + VNET_ASSERT(TAILQ_EMPTY(&V_ifnet)); +#ifdef NOTYET + VNET_ASSERT(TAILQ_EMPTY(&V_ifg_head)); +#endif + VNET_ASSERT(SLIST_EMPTY(&V_ifklist.kl_list)); + + free((caddr_t)V_ifindex_table, M_IFNET); + + return 0; +} +#endif + +void if_grow(void) { + 
INIT_VNET_NET(curvnet); u_int n; struct ifindex_entry *e; - if_indexlim <<= 1; - n = if_indexlim * sizeof(*e); + V_if_indexlim <<= 1; + n = V_if_indexlim * sizeof(*e); e = malloc(n, M_IFNET, M_WAITOK | M_ZERO); - if (ifindex_table != NULL) { - memcpy((caddr_t)e, (caddr_t)ifindex_table, n/2); - free((caddr_t)ifindex_table, M_IFNET); + if (V_ifindex_table != NULL) { + memcpy((caddr_t)e, (caddr_t)V_ifindex_table, n/2); + free((caddr_t)V_ifindex_table, M_IFNET); } - ifindex_table = e; + V_ifindex_table = e; } /* ARGSUSED*/ static void if_check(void *dummy __unused) { + INIT_VNET_NET(curvnet); struct ifnet *ifp; int s; +#ifdef VIMAGE + if (!IS_DEFAULT_VNET(curvnet)) + panic("if_check() called for a non-default vimage!?!"); +#endif + s = splimp(); IFNET_RLOCK(); /* could sleep on rare error; mostly okay XXX */ - TAILQ_FOREACH(ifp, &ifnet, if_link) { + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (ifp->if_snd.ifq_maxlen == 0) { if_printf(ifp, "XXX: driver didn't set ifq_maxlen\n"); ifp->if_snd.ifq_maxlen = ifqmaxlen; @@ -344,7 +417,8 @@ } IFNET_RUNLOCK(); splx(s); - if_slowtimo(0); + + timeout(if_slowtimo, (void *)0, hz / IFNET_SLOWHZ); } /* @@ -355,6 +429,7 @@ struct ifnet* if_alloc(u_char type) { + INIT_VNET_NET(curvnet); struct ifnet *ifp; ifp = malloc(sizeof(struct ifnet), M_IFNET, M_WAITOK|M_ZERO); @@ -365,7 +440,7 @@ * * XXX: should be locked! */ - for (ifp->if_index = 1; ifp->if_index <= if_index; ifp->if_index++) { + for (ifp->if_index = 1; ifp->if_index <= V_if_index; ifp->if_index++) { if (ifnet_byindex(ifp->if_index) == NULL) break; } @@ -374,9 +449,9 @@ free(ifp, M_IFNET); return (NULL); } - if (ifp->if_index > if_index) - if_index = ifp->if_index; - if (if_index >= if_indexlim) + if (ifp->if_index > V_if_index) + V_if_index = ifp->if_index; + if (V_if_index >= V_if_indexlim) if_grow(); ifnet_byindex(ifp->if_index) = ifp; @@ -415,6 +490,7 @@ void if_free_type(struct ifnet *ifp, u_char type) { + INIT_VNET_NET(curvnet); /* ifp->if_vnet can be NULL here ! 
*/ if (ifp != ifnet_byindex(ifp->if_index)) { if_printf(ifp, "%s: value was not if_alloced, skipping\n", @@ -427,8 +503,8 @@ ifnet_byindex(ifp->if_index) = NULL; /* XXX: should be locked with if_findindex() */ - while (if_index > 0 && ifnet_byindex(if_index) == NULL) - if_index--; + while (V_if_index > 0 && ifnet_byindex(V_if_index) == NULL) + V_if_index--; if (if_com_free[type] != NULL) if_com_free[type](ifp->if_l2com, type); @@ -451,6 +527,7 @@ void if_attach(struct ifnet *ifp) { + INIT_VNET_NET(curvnet); unsigned socksize, ifasize; int namelen, masklen; struct sockaddr_dl *sdl; @@ -460,6 +537,11 @@ panic ("%s: BUG: if_attach called without if_alloc'd input()\n", ifp->if_xname); +#ifdef VIMAGE + ifp->if_vnet = curvnet; + if (ifp->if_home_vnet == NULL) + ifp->if_home_vnet = curvnet; +#endif TASK_INIT(&ifp->if_starttask, 0, if_start_deferred, ifp); TASK_INIT(&ifp->if_linktask, 0, do_link_state_change, ifp); IF_AFDATA_LOCK_INIT(ifp); @@ -482,12 +564,18 @@ mac_ifnet_create(ifp); #endif +#ifdef VIMAGE + if (IS_DEFAULT_VNET(curvnet)) { +#endif ifdev_byindex(ifp->if_index) = make_dev(&net_cdevsw, unit2minor(ifp->if_index), UID_ROOT, GID_WHEEL, 0600, "%s/%s", net_cdevsw.d_name, ifp->if_xname); make_dev_alias(ifdev_byindex(ifp->if_index), "%s%d", net_cdevsw.d_name, ifp->if_index); +#ifdef VIMAGE + } +#endif mtx_init(&ifp->if_snd.ifq_mtx, ifp->if_xname, "if send queue", MTX_DEF); @@ -533,13 +621,19 @@ ifp->if_snd.altq_ifp = ifp; IFNET_WLOCK(); - TAILQ_INSERT_TAIL(&ifnet, ifp, if_link); + TAILQ_INSERT_TAIL(&V_ifnet, ifp, if_link); +#ifdef VIMAGE + curvnet->ifccnt++; +#endif IFNET_WUNLOCK(); if (domain_init_status >= 2) if_attachdomain1(ifp); EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp); +#ifdef VIMAGE + if (IS_DEFAULT_VNET(curvnet)) +#endif devctl_notify("IFNET", ifp->if_xname, "ATTACH", NULL); /* Announce the interface. 
*/ @@ -552,16 +646,17 @@ static void if_attachdomain(void *dummy) { + INIT_VNET_NET(curvnet); struct ifnet *ifp; int s; s = splnet(); - TAILQ_FOREACH(ifp, &ifnet, if_link) + TAILQ_FOREACH(ifp, &V_ifnet, if_link) if_attachdomain1(ifp); splx(s); } SYSINIT(domainifattach, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_SECOND, - if_attachdomain, NULL); + if_attachdomain, NULL); static void if_attachdomain1(struct ifnet *ifp) @@ -662,6 +757,7 @@ void if_detach(struct ifnet *ifp) { + INIT_VNET_NET(ifp->if_vnet); struct ifaddr *ifa; struct radix_node_head *rnh; int s; @@ -670,13 +766,25 @@ struct ifnet *iter; int found = 0; + /* + * Detach from any vlan, bridge or lagg ifnets linked to us. + * A small though unlikely window for a race from here to ifp + * unlinking from ifnet list is possible, hence we repeat the + * procedure once again further below. XXX. + */ + EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); + IFNET_WLOCK(); - TAILQ_FOREACH(iter, &ifnet, if_link) + TAILQ_FOREACH(iter, &V_ifnet, if_link) if (iter == ifp) { - TAILQ_REMOVE(&ifnet, ifp, if_link); + TAILQ_REMOVE(&V_ifnet, ifp, if_link); found = 1; break; } +#ifdef VIMAGE + if (found) + curvnet->ifccnt--; +#endif IFNET_WUNLOCK(); if (!found) return; @@ -720,7 +828,13 @@ * Clean up all addresses. */ ifp->if_addr = NULL; +#ifdef VIMAGE + if (IS_DEFAULT_VNET(curvnet)) { +#endif destroy_dev(ifdev_byindex(ifp->if_index)); +#ifdef VIMAGE + } +#endif ifdev_byindex(ifp->if_index) = NULL; /* We can now free link ifaddr. */ @@ -737,7 +851,7 @@ * to this interface...oh well... */ for (i = 1; i <= AF_MAX; i++) { - if ((rnh = rt_tables[i]) == NULL) + if ((rnh = V_rt_tables[i]) == NULL) continue; RADIX_NODE_HEAD_LOCK(rnh); (void) rnh->rnh_walktree(rnh, if_rtdel, ifp); @@ -747,6 +861,9 @@ /* Announce that the interface is gone.
*/ rt_ifannouncemsg(ifp, IFAN_DEPARTURE); EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); +#ifdef VIMAGE + if (IS_DEFAULT_VNET(curvnet)) +#endif devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL); IF_AFDATA_LOCK(ifp); @@ -765,6 +882,9 @@ knlist_destroy(&ifp->if_klist); mtx_destroy(&ifp->if_snd.ifq_mtx); IF_AFDATA_DESTROY(ifp); +#ifdef VIMAGE + ifp->if_vnet = NULL; +#endif splx(s); } @@ -774,6 +894,7 @@ int if_addgroup(struct ifnet *ifp, const char *groupname) { + INIT_VNET_NET(ifp->if_vnet); struct ifg_list *ifgl; struct ifg_group *ifg = NULL; struct ifg_member *ifgm; @@ -802,7 +923,7 @@ return (ENOMEM); } - TAILQ_FOREACH(ifg, &ifg_head, ifg_next) + TAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) if (!strcmp(ifg->ifg_group, groupname)) break; @@ -818,7 +939,7 @@ ifg->ifg_refcnt = 0; TAILQ_INIT(&ifg->ifg_members); EVENTHANDLER_INVOKE(group_attach_event, ifg); - TAILQ_INSERT_TAIL(&ifg_head, ifg, ifg_next); + TAILQ_INSERT_TAIL(&V_ifg_head, ifg, ifg_next); } ifg->ifg_refcnt++; @@ -843,6 +964,7 @@ int if_delgroup(struct ifnet *ifp, const char *groupname) { + INIT_VNET_NET(ifp->if_vnet); struct ifg_list *ifgl; struct ifg_member *ifgm; @@ -869,7 +991,7 @@ } if (--ifgl->ifgl_group->ifg_refcnt == 0) { - TAILQ_REMOVE(&ifg_head, ifgl->ifgl_group, ifg_next); + TAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_next); EVENTHANDLER_INVOKE(group_detach_event, ifgl->ifgl_group); free(ifgl->ifgl_group, M_TEMP); } @@ -932,6 +1054,7 @@ static int if_getgroupmembers(struct ifgroupreq *data) { + INIT_VNET_NET(curvnet); struct ifgroupreq *ifgr = data; struct ifg_group *ifg; struct ifg_member *ifgm; @@ -939,7 +1062,7 @@ int len, error; IFNET_RLOCK(); - TAILQ_FOREACH(ifg, &ifg_head, ifg_next) + TAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) if (!strcmp(ifg->ifg_group, ifgr->ifgr_name)) break; if (ifg == NULL) { @@ -1041,11 +1164,12 @@ struct ifaddr * ifa_ifwithaddr(struct sockaddr *addr) { + INIT_VNET_NET(curvnet); struct ifnet *ifp; struct ifaddr *ifa; IFNET_RLOCK(); - TAILQ_FOREACH(ifp, &ifnet, 
if_link) + TAILQ_FOREACH(ifp, &V_ifnet, if_link) TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; @@ -1071,11 +1195,12 @@ struct ifaddr * ifa_ifwithbroadaddr(struct sockaddr *addr) { + INIT_VNET_NET(curvnet); struct ifnet *ifp; struct ifaddr *ifa; IFNET_RLOCK(); - TAILQ_FOREACH(ifp, &ifnet, if_link) + TAILQ_FOREACH(ifp, &V_ifnet, if_link) TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; @@ -1098,11 +1223,12 @@ struct ifaddr * ifa_ifwithdstaddr(struct sockaddr *addr) { + INIT_VNET_NET(curvnet); struct ifnet *ifp; struct ifaddr *ifa; IFNET_RLOCK(); - TAILQ_FOREACH(ifp, &ifnet, if_link) { + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((ifp->if_flags & IFF_POINTOPOINT) == 0) continue; TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { @@ -1126,6 +1252,7 @@ struct ifaddr * ifa_ifwithnet(struct sockaddr *addr) { + INIT_VNET_NET(curvnet); struct ifnet *ifp; struct ifaddr *ifa; struct ifaddr *ifa_maybe = (struct ifaddr *) 0; @@ -1138,7 +1265,7 @@ */ if (af == AF_LINK) { struct sockaddr_dl *sdl = (struct sockaddr_dl *)addr; - if (sdl->sdl_index && sdl->sdl_index <= if_index) + if (sdl->sdl_index && sdl->sdl_index <= V_if_index) return (ifaddr_byindex(sdl->sdl_index)); } @@ -1147,7 +1274,7 @@ * addresses in this address family. */ IFNET_RLOCK(); - TAILQ_FOREACH(ifp, &ifnet, if_link) { + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { char *cp, *cp2, *cp3; @@ -1369,6 +1496,7 @@ struct ifnet *ifp = (struct ifnet *)arg; int link_state = ifp->if_link_state; int link; + CURVNET_SET(ifp->if_vnet); /* Notify that the link state has changed. */ rt_ifmsg(ifp); @@ -1398,6 +1526,9 @@ (*lagg_linkstate_p)(ifp, link_state); } +#ifdef VIMAGE + if (IS_DEFAULT_VNET(curvnet)) +#endif devctl_notify("IFNET", ifp->if_xname, (link_state == LINK_STATE_UP) ? 
"LINK_UP" : "LINK_DOWN", NULL); if (pending > 1) @@ -1405,6 +1536,7 @@ if (log_link_state_change) log(LOG_NOTICE, "%s: link state changed to %s\n", ifp->if_xname, (link_state == LINK_STATE_UP) ? "UP" : "DOWN" ); + CURVNET_RESTORE(); } /* @@ -1471,12 +1603,15 @@ int s = splimp(); IFNET_RLOCK(); - TAILQ_FOREACH(ifp, &ifnet, if_link) { + VNET_ITERLOOP_BEGIN(); + INIT_VNET_NET(curvnet); + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (ifp->if_timer == 0 || --ifp->if_timer) continue; if (ifp->if_watchdog) (*ifp->if_watchdog)(ifp); } + VNET_ITERLOOP_END(); IFNET_RUNLOCK(); splx(s); timeout(if_slowtimo, (void *)0, hz / IFNET_SLOWHZ); @@ -1489,10 +1624,11 @@ struct ifnet * ifunit(const char *name) { + INIT_VNET_NET(curvnet); struct ifnet *ifp; IFNET_RLOCK(); - TAILQ_FOREACH(ifp, &ifnet, if_link) { + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0) break; } @@ -1854,6 +1990,24 @@ ifr = (struct ifreq *)data; switch (cmd) { +#ifdef VIMAGE + case SIOCSIFVIMAGE: + error = suser(td); + if (error == 0) + error = vi_if_move((struct vi_req *) data, NULL, + TD_TO_VIMAGE(td)); + return (error); + + /* + * XXX Should be implemented as separate system calls. This is + * just a temporary hack! 
+ */ + case SIOCSPVIMAGE: + case SIOCGPVIMAGE: + error = vi_td_ioctl(cmd, (struct vi_req *) data, td); + return (error); +#endif + case SIOCIFCREATE: case SIOCIFCREATE2: error = priv_check(td, PRIV_NET_IFCREATE); @@ -2061,6 +2215,7 @@ static int ifconf(u_long cmd, caddr_t data) { + INIT_VNET_NET(curvnet); struct ifconf *ifc = (struct ifconf *)data; #ifdef __amd64__ struct ifconf32 *ifc32 = (struct ifconf32 *)data; @@ -2096,7 +2251,7 @@ valid_len = 0; IFNET_RLOCK(); /* could sleep XXX */ - TAILQ_FOREACH(ifp, &ifnet, if_link) { + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { int addrs; /* @@ -2420,9 +2575,10 @@ int lastref; #ifdef INVARIANTS struct ifnet *oifp; + INIT_VNET_NET(ifp->if_vnet); IFNET_RLOCK(); - TAILQ_FOREACH(oifp, &ifnet, if_link) + TAILQ_FOREACH(oifp, &V_ifnet, if_link) if (ifp == oifp) break; if (ifp != oifp) @@ -2744,7 +2900,6 @@ if_register_com_alloc(u_char type, if_com_alloc_t *a, if_com_free_t *f) { - KASSERT(if_com_alloc[type] == NULL, ("if_register_com_alloc: %d already registered", type)); KASSERT(if_com_free[type] == NULL, @@ -2757,7 +2912,6 @@ void if_deregister_com_alloc(u_char type) { - KASSERT(if_com_alloc[type] != NULL, ("if_deregister_com_alloc: %d not registered", type)); KASSERT(if_com_free[type] != NULL, --- /u/marko/p4/head/src/sys/net/if_clone.c 2007-08-31 03:47:47.000000000 +0200 +++ src/sys/net/if_clone.c 2007-10-22 18:06:36.000000000 +0200 @@ -30,6 +30,8 @@ * $FreeBSD: src/sys/net/if_clone.c,v 1.11 2006/07/09 06:04:00 sam Exp $ */ +#include "opt_vimage.h" + #include #include #include @@ -39,7 +41,9 @@ #include #include #include +#include +#include #include #include #if 0 @@ -204,15 +208,14 @@ { int err; - if (ifc->ifc_destroy == NULL) { - err = EOPNOTSUPP; - goto done; - } + if (ifc->ifc_destroy == NULL) + return(EOPNOTSUPP); IF_CLONE_LOCK(ifc); IFC_IFLIST_REMOVE(ifc, ifp); IF_CLONE_UNLOCK(ifc); + CURVNET_SET_QUIET(ifp->if_vnet); if_delgroup(ifp, ifc->ifc_name); err = (*ifc->ifc_destroy)(ifc, ifp); @@ -224,8 +227,7 @@ 
IFC_IFLIST_INSERT(ifc, ifp); IF_CLONE_UNLOCK(ifc); } - -done: + CURVNET_RESTORE(); return (err); } @@ -402,6 +404,24 @@ * Find a free unit if none was given. */ if (wildcard) { +#ifdef VIMAGE + INIT_VNET_NET(curvnet); + char name[IFNAMSIZ]; + struct ifnet *ifp; + int i = 0; + + IFNET_RLOCK(); +again: + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + sprintf(name, "%s%d", ifc->ifc_name, i); + if (strcmp(name, ifp->if_xname) == 0) { + i++; + goto again; + } + } + IFNET_RUNLOCK(); + *unit = i; +#else while ((bytoff < ifc->ifc_bmlen) && (ifc->ifc_units[bytoff] == 0xff)) bytoff++; @@ -412,6 +432,7 @@ while ((ifc->ifc_units[bytoff] & (1 << bitoff)) != 0) bitoff++; *unit = (bytoff << 3) + bitoff; +#endif } if (*unit > ifc->ifc_maxunit) { @@ -419,6 +440,7 @@ goto done; } +#ifndef VIMAGE if (!wildcard) { bytoff = *unit >> 3; bitoff = *unit - (bytoff << 3); @@ -434,6 +456,7 @@ KASSERT((ifc->ifc_units[bytoff] & (1 << bitoff)) == 0, ("%s: bit is already set", __func__)); ifc->ifc_units[bytoff] |= (1 << bitoff); +#endif IF_CLONE_ADDREF_LOCKED(ifc); done: @@ -444,9 +467,9 @@ void ifc_free_unit(struct if_clone *ifc, int unit) { +#ifndef VIMAGE int bytoff, bitoff; - /* * Compute offset in the bitmap and deallocate the unit. */ @@ -458,6 +481,7 @@ ("%s: bit is already cleared", __func__)); ifc->ifc_units[bytoff] &= ~(1 << bitoff); IF_CLONE_REMREF_LOCKED(ifc); /* releases lock */ +#endif } void --- /u/marko/p4/head/src/sys/net/if_ethersubr.c 2007-11-13 02:49:08.000000000 +0100 +++ src/sys/net/if_ethersubr.c 2007-12-10 11:26:09.000000000 +0100 @@ -37,6 +37,7 @@ #include "opt_mac.h" #include "opt_netgraph.h" #include "opt_carp.h" +#include "opt_vimage.h" #include #include @@ -48,7 +49,9 @@ #include #include #include +#include +#include #include #include #include @@ -135,8 +138,10 @@ int ether_ipfw_chk(struct mbuf **m0, struct ifnet *dst, struct ip_fw **rule, int shared); +#ifndef VIMAGE static int ether_ipfw; #endif +#endif /* * Ethernet output routine. 
@@ -385,9 +390,10 @@ { int error; #if defined(INET) || defined(INET6) + INIT_VNET_NET(ifp->if_vnet); struct ip_fw *rule = ip_dn_claim_rule(m); - if (IPFW_LOADED && ether_ipfw != 0) { + if (IPFW_LOADED && V_ether_ipfw != 0) { if (ether_ipfw_chk(&m, ifp, &rule, 0) == 0) { if (m) { m_freem(m); @@ -416,13 +422,14 @@ ether_ipfw_chk(struct mbuf **m0, struct ifnet *dst, struct ip_fw **rule, int shared) { + INIT_VNET_IPFW(dst->if_vnet); struct ether_header *eh; struct ether_header save_eh; struct mbuf *m; int i; struct ip_fw_args args; - if (*rule != NULL && fw_one_pass) + if (*rule != NULL && V_fw_one_pass) return 1; /* dummynet packet, already partially processed */ /* @@ -557,6 +564,8 @@ } #endif + CURVNET_SET_QUIET(ifp->if_vnet); + if (ETHER_IS_MULTICAST(eh->ether_dhost)) { if (ETHER_IS_BROADCAST(eh->ether_dhost)) m->m_flags |= M_BCAST; @@ -593,6 +602,7 @@ /* Allow monitor mode to claim this frame, after stats are updated. */ if (ifp->if_flags & IFF_MONITOR) { m_freem(m); + CURVNET_RESTORE(); return; } @@ -641,8 +651,10 @@ ("%s: ng_ether_input_p is NULL", __func__)); m->m_flags &= ~M_PROMISC; (*ng_ether_input_p)(ifp, &m); - if (m == NULL) + if (m == NULL) { + CURVNET_RESTORE(); return; + } } /* @@ -653,8 +665,10 @@ if (ifp->if_bridge != NULL) { m->m_flags &= ~M_PROMISC; BRIDGE_INPUT(ifp, m); - if (m == NULL) + if (m == NULL) { + CURVNET_RESTORE(); return; + } } #ifdef DEV_CARP @@ -690,6 +704,7 @@ random_harvest(m, 16, 3, 0, RANDOM_NET); ether_demux(ifp, m); + CURVNET_RESTORE(); } /* @@ -708,11 +723,12 @@ KASSERT(ifp != NULL, ("%s: NULL interface pointer", __func__)); #if defined(INET) || defined(INET6) + INIT_VNET_NET(ifp->if_vnet); /* * Allow dummynet and/or ipfw to claim the frame. * Do not do this for PROMISC frames in case we are re-entered. 
*/ - if (IPFW_LOADED && ether_ipfw != 0 && !(m->m_flags & M_PROMISC)) { + if (IPFW_LOADED && V_ether_ipfw != 0 && !(m->m_flags & M_PROMISC)) { struct ip_fw *rule = ip_dn_claim_rule(m); if (ether_ipfw_chk(&m, NULL, &rule, 0) == 0) { @@ -870,6 +886,25 @@ return (etherbuf); } +#ifdef VIMAGE +static void +ether_reassign(struct ifnet *ifp, struct vnet *vnet, char *dname) +{ + u_char eaddr[6]; + + bcopy(IF_LLADDR(ifp), eaddr, 6); + ether_ifdetach(ifp); + ifp->if_bpf = NULL; + if_reassign_common(ifp, vnet, "eth"); + if (dname) + snprintf(ifp->if_xname, IFNAMSIZ, "%s", dname); + + CURVNET_SET_QUIET(vnet); + ether_ifattach(ifp, eaddr); + CURVNET_RESTORE(); +} +#endif + /* * Perform common duties while attaching to interface list */ @@ -879,6 +914,9 @@ int i; struct ifaddr *ifa; struct sockaddr_dl *sdl; +#ifdef VIMAGE + struct vnet *home_vnet_0 = ifp->if_home_vnet; +#endif ifp->if_addrlen = ETHER_ADDR_LEN; ifp->if_hdrlen = ETHER_HDR_LEN; @@ -887,6 +925,9 @@ ifp->if_output = ether_output; ifp->if_input = ether_input; ifp->if_resolvemulti = ether_resolvemulti; +#ifdef VIMAGE + ifp->if_reassign = ether_reassign; +#endif if (ifp->if_baudrate == 0) ifp->if_baudrate = IF_Mbps(10); /* just a default */ ifp->if_broadcastaddr = etherbroadcastaddr; @@ -906,7 +947,11 @@ for (i = 0; i < ifp->if_addrlen; i++) if (lla[i] != 0) break; +#ifdef VIMAGE + if (i != ifp->if_addrlen && home_vnet_0 != ifp->if_home_vnet) +#else if (i != ifp->if_addrlen) +#endif if_printf(ifp, "Ethernet address: %6D\n", lla, ":"); if (ifp->if_flags & IFF_NEEDSGIANT) if_printf(ifp, "if_start running deferred for Giant\n"); @@ -931,8 +976,8 @@ SYSCTL_DECL(_net_link); SYSCTL_NODE(_net_link, IFT_ETHER, ether, CTLFLAG_RW, 0, "Ethernet"); #if defined(INET) || defined(INET6) -SYSCTL_INT(_net_link_ether, OID_AUTO, ipfw, CTLFLAG_RW, - &ether_ipfw,0,"Pass ether pkts through firewall"); +SYSCTL_V_INT(V_NET, vnet_net, _net_link_ether, OID_AUTO, ipfw, CTLFLAG_RW, + ether_ipfw, 0, "Pass ether pkts through firewall"); #endif #if 0
--- /u/marko/p4/head/src/sys/net/if_faith.c 2007-08-31 03:47:47.000000000 +0200 +++ src/sys/net/if_faith.c 2007-10-22 18:06:36.000000000 +0200 @@ -41,6 +41,7 @@ */ #include "opt_inet.h" #include "opt_inet6.h" +#include "opt_vimage.h" #include #include @@ -54,6 +55,7 @@ #include #include #include +#include #include #include @@ -76,6 +78,7 @@ #include #include #include +#include #endif #define FAITHNAME "faith" @@ -323,11 +326,12 @@ faithprefix(in6) struct in6_addr *in6; { + INIT_VNET_INET6(curvnet); struct rtentry *rt; struct sockaddr_in6 sin6; int ret; - if (ip6_keepfaith == 0) + if (V_ip6_keepfaith == 0) return 0; bzero(&sin6, sizeof(sin6)); --- /u/marko/p4/head/src/sys/net/if_gif.c 2007-10-29 17:17:42.000000000 +0100 +++ src/sys/net/if_gif.c 2007-12-10 11:26:09.000000000 +0100 @@ -33,6 +33,7 @@ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_mac.h" +#include "opt_vimage.h" #include #include @@ -48,6 +49,8 @@ #include #include #include +#include + #include #include @@ -92,7 +95,9 @@ */ static struct mtx gif_mtx; static MALLOC_DEFINE(M_GIF, "gif", "Generic Tunnel Interface"); +#ifndef VIMAGE static LIST_HEAD(, gif_softc) gif_softc_list; +#endif void (*ng_gif_input_p)(struct ifnet *ifp, struct mbuf **mp, int af); void (*ng_gif_input_orphan_p)(struct ifnet *ifp, struct mbuf *m, int af); @@ -102,6 +107,7 @@ static void gif_start(struct ifnet *); static int gif_clone_create(struct if_clone *, int, caddr_t); static void gif_clone_destroy(struct ifnet *); +static int vnet_gif_iattach(const void *); IFC_SIMPLE_DECLARE(gif, 0); @@ -121,22 +127,30 @@ */ #define MAX_GIF_NEST 1 #endif -static int max_gif_nesting = MAX_GIF_NEST; -SYSCTL_INT(_net_link_gif, OID_AUTO, max_nesting, CTLFLAG_RW, - &max_gif_nesting, 0, "Max nested tunnels"); +#ifndef VIMAGE +static int max_gif_nesting; +#endif +SYSCTL_V_INT(V_NET, vnet_gif, _net_link_gif, OID_AUTO, max_nesting, + CTLFLAG_RW, max_gif_nesting, 0, "Max nested tunnels"); + +#ifdef INET6 +SYSCTL_DECL(_net_inet6_ip6); 
+SYSCTL_V_INT(V_NET, vnet_gif, _net_inet6_ip6, IPV6CTL_GIF_HLIM, + gifhlim, CTLFLAG_RW, ip6_gif_hlim, 0, ""); +#endif /* * By default, we disallow creation of multiple tunnels between the same * pair of addresses. Some applications require this functionality so * we allow control over this check here. */ -#ifdef XBONEHACK -static int parallel_tunnels = 1; -#else -static int parallel_tunnels = 0; +#ifndef VIMAGE +static int parallel_tunnels; #endif -SYSCTL_INT(_net_link_gif, OID_AUTO, parallel_tunnels, CTLFLAG_RW, - &parallel_tunnels, 0, "Allow parallel tunnels?"); +SYSCTL_V_INT(V_NET, vnet_gif, _net_link_gif, OID_AUTO, parallel_tunnels, + CTLFLAG_RW, parallel_tunnels, 0, "Allow parallel tunnels?"); + +VNET_MOD_DECLARE(GIF, gif, NULL, vnet_gif_iattach, NET, NULL) static int gif_clone_create(ifc, unit, params) @@ -144,6 +158,7 @@ int unit; caddr_t params; { + INIT_VNET_GIF(curvnet); struct gif_softc *sc; sc = malloc(sizeof(struct gif_softc), M_GIF, M_WAITOK | M_ZERO); @@ -177,7 +192,7 @@ (*ng_gif_attach_p)(GIF2IFP(sc)); mtx_lock(&gif_mtx); - LIST_INSERT_HEAD(&gif_softc_list, sc, gif_list); + LIST_INSERT_HEAD(&V_gif_softc_list, sc, gif_list); mtx_unlock(&gif_mtx); return (0); @@ -220,29 +235,47 @@ } static int +vnet_gif_iattach(unused) + const void *unused; +{ + INIT_VNET_GIF(curvnet); + + LIST_INIT(&V_gif_softc_list); + V_max_gif_nesting = MAX_GIF_NEST; +#ifdef XBONEHACK + V_parallel_tunnels = 1; +#endif + V_ip_gif_ttl = GIF_TTL; +#ifdef INET6 + V_ip6_gif_hlim = GIF_HLIM; +#endif + + return 0; +} + +static int gifmodevent(mod, type, data) module_t mod; int type; void *data; { - switch (type) { case MOD_LOAD: mtx_init(&gif_mtx, "gif_mtx", NULL, MTX_DEF); - LIST_INIT(&gif_softc_list); - if_clone_attach(&gif_cloner); - -#ifdef INET6 - ip6_gif_hlim = GIF_HLIM; +#ifdef VIMAGE + vnet_mod_register(&vnet_gif_modinfo); +#else + vnet_gif_iattach(NULL); #endif - + if_clone_attach(&gif_cloner); break; case MOD_UNLOAD: if_clone_detach(&gif_cloner); - mtx_destroy(&gif_mtx); -#ifdef
INET6 - ip6_gif_hlim = 0; +#ifdef VIMAGE + vnet_mod_deregister(&vnet_gif_modinfo); #endif + mtx_destroy(&gif_mtx); + break; default: return EOPNOTSUPP; @@ -353,6 +386,7 @@ struct sockaddr *dst; struct rtentry *rt; /* added in net2 */ { + INIT_VNET_GIF(ifp->if_vnet); struct gif_softc *sc = ifp->if_softc; struct m_tag *mtag; int error = 0; @@ -388,7 +422,7 @@ mtag = m_tag_locate(m, MTAG_GIF, MTAG_GIF_CALLED, mtag); gif_called++; } - if (gif_called > max_gif_nesting) { + if (gif_called > V_max_gif_nesting) { log(LOG_NOTICE, "gif_output: recursively called too many times(%d)\n", gif_called); @@ -822,13 +856,14 @@ struct sockaddr *src; struct sockaddr *dst; { + INIT_VNET_GIF(ifp->if_vnet); struct gif_softc *sc = ifp->if_softc; struct gif_softc *sc2; struct sockaddr *osrc, *odst, *sa; int error = 0; mtx_lock(&gif_mtx); - LIST_FOREACH(sc2, &gif_softc_list, gif_list) { + LIST_FOREACH(sc2, &V_gif_softc_list, gif_list) { if (sc2 == sc) continue; if (!sc2->gif_pdst || !sc2->gif_psrc) @@ -843,7 +878,7 @@ * Disallow parallel tunnels unless instructed * otherwise. 
*/ - if (!parallel_tunnels && + if (!V_parallel_tunnels && bcmp(sc2->gif_pdst, dst, dst->sa_len) == 0 && bcmp(sc2->gif_psrc, src, src->sa_len) == 0) { error = EADDRNOTAVAIL; --- /u/marko/p4/head/src/sys/net/if_gif.h 2007-08-31 03:47:47.000000000 +0200 +++ src/sys/net/if_gif.h 2007-10-05 12:26:48.000000000 +0200 @@ -109,6 +109,29 @@ void gif_delete_tunnel(struct ifnet *); int gif_encapcheck(const struct mbuf *, int, int, void *); +/* + * Virtualization support + */ + +#define INIT_VNET_GIF(vnet) \ + INIT_FROM_VNET(vnet, VNET_MOD_GIF, struct vnet_gif, vnet_gif) + +#define VNET_GIF(sym) VSYM(vnet_gif, sym) + +struct vnet_gif { + LIST_HEAD(, gif_softc) _gif_softc_list; + int _max_gif_nesting; + int _parallel_tunnels; + int _ip_gif_ttl; + int _ip6_gif_hlim; +}; + +#define V_gif_softc_list VNET_GIF(gif_softc_list) +#define V_max_gif_nesting VNET_GIF(max_gif_nesting) +#define V_parallel_tunnels VNET_GIF(parallel_tunnels) +#define V_ip_gif_ttl VNET_GIF(ip_gif_ttl) +#define V_ip6_gif_hlim VNET_GIF(ip6_gif_hlim) + #endif /* _KERNEL */ #endif /* _NET_IF_GIF_H_ */ --- /u/marko/p4/head/src/sys/net/if_gre.c 2007-08-31 03:47:47.000000000 +0200 +++ src/sys/net/if_gre.c 2007-10-22 18:06:37.000000000 +0200 @@ -51,6 +51,7 @@ #include "opt_atalk.h" #include "opt_inet.h" #include "opt_inet6.h" +#include "opt_vimage.h" #include #include @@ -63,6 +64,7 @@ #include #include #include +#include #include #include @@ -71,6 +73,7 @@ #include #ifdef INET +#include #include #include #include @@ -238,12 +241,15 @@ gre_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, struct rtentry *rt) { +#ifdef INET6 + INIT_VNET_INET(ifp->if_vnet); +#endif int error = 0; struct gre_softc *sc = ifp->if_softc; struct greip *gh; struct ip *ip; - u_short ip_id = 0; - uint8_t ip_tos = 0; + u_short gre_ip_id = 0; + uint8_t gre_ip_tos = 0; u_int16_t etype = 0; struct mobile_h mob_h; u_int32_t af; @@ -360,13 +366,13 @@ switch (dst->sa_family) { case AF_INET: ip = mtod(m, struct ip *); - ip_tos = 
ip->ip_tos; - ip_id = ip->ip_id; + gre_ip_tos = ip->ip_tos; + gre_ip_id = ip->ip_id; etype = ETHERTYPE_IP; break; #ifdef INET6 case AF_INET6: - ip_id = ip_newid(); + gre_ip_id = ip_newid(); etype = ETHERTYPE_IPV6; break; #endif @@ -409,8 +415,8 @@ ((struct ip*)gh)->ip_v = IPPROTO_IPV4; ((struct ip*)gh)->ip_hl = (sizeof(struct ip)) >> 2; ((struct ip*)gh)->ip_ttl = GRE_TTL; - ((struct ip*)gh)->ip_tos = ip_tos; - ((struct ip*)gh)->ip_id = ip_id; + ((struct ip*)gh)->ip_tos = gre_ip_tos; + ((struct ip*)gh)->ip_id = gre_ip_id; gh->gi_len = m->m_pkthdr.len; } --- /u/marko/p4/head/src/sys/net/if_loop.c 2007-10-29 17:17:42.000000000 +0100 +++ src/sys/net/if_loop.c 2007-12-10 11:26:09.000000000 +0100 @@ -27,7 +27,7 @@ * SUCH DAMAGE. * * @(#)if_loop.c 8.2 (Berkeley) 1/9/95 - * $FreeBSD: src/sys/net/if_loop.c,v 1.113 2007/10/27 18:25:53 yar Exp $ + * $FreeBSD: src/sys/net/if_loop.c,v 1.112 2007/02/09 00:09:35 cognet Exp $ */ /* @@ -38,6 +38,7 @@ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipx.h" +#include "opt_vimage.h" #include #include @@ -50,7 +51,9 @@ #include #include #include +#include +#include #include #include #include @@ -94,6 +97,7 @@ struct lo_softc { struct ifnet *sc_ifp; + LIST_ENTRY(lo_softc) sc_next; }; int loioctl(struct ifnet *, u_long, caddr_t); @@ -102,11 +106,20 @@ struct sockaddr *dst, struct rtentry *rt); static int lo_clone_create(struct if_clone *, int, caddr_t); static void lo_clone_destroy(struct ifnet *); +static int vnet_loif_iattach(const void *); +#ifdef VIMAGE +static int vnet_loif_idetach(const void *); +#endif +#ifndef VIMAGE struct ifnet *loif = NULL; /* Used externally */ +static LIST_HEAD(lo_list, lo_softc) lo_list; +#endif /* !VIMAGE */ static MALLOC_DEFINE(M_LO, LONAME, "Loopback Interface"); +static struct mtx lo_mtx; + IFC_SIMPLE_DECLARE(lo, 1); static void @@ -114,12 +127,18 @@ struct ifnet *ifp; { struct lo_softc *sc; +#ifdef INVARIANTS + INIT_VNET_NET(ifp->if_vnet); +#endif sc = ifp->if_softc; /* XXX: destroying lo0 
will lead to panics. */ - KASSERT(loif != ifp, ("%s: destroying lo0", __func__)); + KASSERT(V_loif != ifp, ("%s: destroying lo0", __func__)); + mtx_lock(&lo_mtx); + LIST_REMOVE(sc, sc_next); + mtx_unlock(&lo_mtx); bpfdetach(ifp); if_detach(ifp); if_free(ifp); @@ -132,6 +151,7 @@ int unit; caddr_t params; { + INIT_VNET_NET(curvnet); struct ifnet *ifp; struct lo_softc *sc; @@ -141,6 +161,8 @@ free(sc, M_LO); return (ENOSPC); } + if (V_loif == NULL) + V_loif = ifp; if_initname(ifp, ifc->ifc_name, unit); ifp->if_mtu = LOMTU; @@ -151,18 +173,72 @@ ifp->if_softc = sc; if_attach(ifp); bpfattach(ifp, DLT_NULL, sizeof(u_int32_t)); - if (loif == NULL) - loif = ifp; + mtx_lock(&lo_mtx); + LIST_INSERT_HEAD(&V_lo_list, sc, sc_next); + mtx_unlock(&lo_mtx); return (0); } +VNET_MOD_DECLARE_STATELESS(LOIF, loif, vnet_loif_iattach, vnet_loif_idetach, + NET) + +static int vnet_loif_iattach(unused) + const void *unused; +{ + INIT_VNET_NET(curvnet); + + LIST_INIT(&V_lo_list); +#ifdef VIMAGE + if (IS_DEFAULT_VNET(curvnet)) + if_clone_attach(&lo_cloner); + else + lo_cloner.ifc_attach(&lo_cloner); +#else + if_clone_attach(&lo_cloner); +#endif + return 0; +} + +#ifdef VIMAGE +static int vnet_loif_idetach(unused) + const void *unused; +{ + INIT_VNET_NET(curvnet); + struct lo_softc *sc, *nsc; + + LIST_FOREACH_SAFE(sc, &V_lo_list, sc_next, nsc) { + struct ifnet *ifp = sc->sc_ifp; + + if (ifp == V_loif) { + /* + * A hack to allow lo0 to be detached: + * bump if_unit number from 0 to 1. By + * setting V_loif to NULL we prevent queuing + * of routing messages that would have + * m_pkthdr.rcvif pointing to a nonexisting + * ifnet, i.e. the lo0 we just destroyed. 
+ */ + ifp->if_dunit = 1; + V_loif = NULL; + } + if_clone_destroy(ifp->if_xname); + } + return 0; +} +#endif + static int loop_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: - if_clone_attach(&lo_cloner); + mtx_init(&lo_mtx, "lo_mtx", NULL, MTX_DEF); +#ifdef VIMAGE + vnet_mod_register(&vnet_loif_modinfo); +#else + vnet_loif_iattach(NULL); +#endif break; case MOD_UNLOAD: printf("loop module unload - not possible for this module type\n"); @@ -195,7 +271,7 @@ if (rt && rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { m_freem(m); return (rt->rt_flags & RTF_BLACKHOLE ? 0 : - rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH); + rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH); } ifp->if_opackets++; @@ -241,6 +317,7 @@ int af; int hlen; { + INIT_VNET_NET(ifp->if_vnet); int isr; M_ASSERTPKTHDR(m); @@ -262,15 +339,15 @@ bpf_mtap(ifp->if_bpf, m); } } else { - if (bpf_peers_present(loif->if_bpf)) { - if ((m->m_flags & M_MCAST) == 0 || loif == ifp) { + if (bpf_peers_present(V_loif->if_bpf)) { + if ((m->m_flags & M_MCAST) == 0 || V_loif == ifp) { /* XXX beware sizeof(af) != 4 */ u_int32_t af1 = af; /* * We need to prepend the address family. 
*/ - bpf_mtap2(loif->if_bpf, &af1, sizeof(af1), m); + bpf_mtap2(V_loif->if_bpf, &af1, sizeof(af1), m); } } } --- /u/marko/p4/head/src/sys/net/if_mib.c 2007-08-31 03:47:48.000000000 +0200 +++ src/sys/net/if_mib.c 2007-10-22 18:06:37.000000000 +0200 @@ -29,12 +29,16 @@ * $FreeBSD: src/sys/net/if_mib.c,v 1.18 2006/01/04 12:57:09 harti Exp $ */ +#include "opt_vimage.h" + #include #include #include #include #include +#include +#include #include #include @@ -64,12 +68,15 @@ SYSCTL_DECL(_net_link_generic); SYSCTL_NODE(_net_link_generic, IFMIB_SYSTEM, system, CTLFLAG_RW, 0, "Variables global to all interfaces"); -SYSCTL_INT(_net_link_generic_system, IFMIB_IFCOUNT, ifcount, CTLFLAG_RD, - &if_index, 0, "Number of configured interfaces"); + +SYSCTL_V_INT(V_NET, vnet_net, _net_link_generic_system, IFMIB_IFCOUNT, + ifcount, CTLFLAG_RD, if_index, 0, + "Number of configured interfaces"); static int sysctl_ifdata(SYSCTL_HANDLER_ARGS) /* XXX bad syntax! */ { + INIT_VNET_NET(curvnet); int *name = (int *)arg1; int error; u_int namelen = arg2; @@ -81,7 +88,7 @@ if (namelen != 2) return EINVAL; - if (name[0] <= 0 || name[0] > if_index || + if (name[0] <= 0 || name[0] > V_if_index || ifnet_byindex(name[0]) == NULL) return ENOENT; --- /u/marko/p4/head/src/sys/net/if_ppp.c 2007-10-29 17:17:42.000000000 +0100 +++ src/sys/net/if_ppp.c 2007-12-10 11:26:09.000000000 +0100 @@ -80,6 +80,7 @@ #include "opt_ipx.h" #include "opt_mac.h" #include "opt_ppp.h" +#include "opt_vimage.h" #ifdef INET #define VJC @@ -98,6 +99,7 @@ #include #include #include +#include #include #include @@ -1396,6 +1398,7 @@ struct mbuf *mp, *dmp = NULL; u_char *iphdr; u_int hlen; + CURVNET_SET(ifp->if_vnet); sc->sc_stats.ppp_ipackets++; @@ -1430,7 +1433,7 @@ m_freem(m); if (dmp == NULL) { /* no error, but no decompressed packet produced */ - return; + goto done; } m = dmp; cp = mtod(m, u_char *); @@ -1587,7 +1590,7 @@ ilen, 0) == 0) { /* drop this packet */ m_freem(m); - return; + goto done; } if 
(sc->sc_active_filt.bf_insns == 0 || bpf_filter(sc->sc_active_filt.bf_insns, (u_char *) m, ilen, 0)) @@ -1616,13 +1619,13 @@ || sc->sc_npmode[NP_IP] != NPMODE_PASS) { /* interface is down - drop the packet. */ m_freem(m); - return; + goto done; } m->m_pkthdr.len -= PPP_HDRLEN; m->m_data += PPP_HDRLEN; m->m_len -= PPP_HDRLEN; if ((m = ip_fastforward(m)) == NULL) - return; + goto done; isr = NETISR_IP; break; #endif @@ -1635,7 +1638,7 @@ || sc->sc_npmode[NP_IPV6] != NPMODE_PASS) { /* interface is down - drop the packet. */ m_freem(m); - return; + goto done; } m->m_pkthdr.len -= PPP_HDRLEN; m->m_data += PPP_HDRLEN; @@ -1652,7 +1655,7 @@ /* XXX: || sc->sc_npmode[NP_IPX] != NPMODE_PASS*/) { /* interface is down - drop the packet. */ m_freem(m); - return; + goto done; } m->m_pkthdr.len -= PPP_HDRLEN; m->m_data += PPP_HDRLEN; @@ -1687,6 +1690,8 @@ if (isr == -1) (*sc->sc_ctlp)(sc); + done: + CURVNET_RESTORE(); return; bad: @@ -1694,6 +1699,7 @@ m_freem(m); PPP2IFP(sc)->if_ierrors++; sc->sc_stats.ppp_ierrors++; + CURVNET_RESTORE(); } #define MAX_DUMP_BYTES 128 --- /u/marko/p4/head/src/sys/net/if_spppsubr.c 2007-08-31 03:47:48.000000000 +0200 +++ src/sys/net/if_spppsubr.c 2007-10-22 18:06:37.000000000 +0200 @@ -27,6 +27,7 @@ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipx.h" +#include "opt_vimage.h" #endif #ifdef NetBSD1_3 @@ -48,6 +49,7 @@ #endif #include #include +#include #if defined (__OpenBSD__) #include @@ -55,10 +57,13 @@ #include #endif +#include #include #include #include #include + +#include #include #include #include @@ -4938,6 +4943,7 @@ static void sppp_set_ip_addr(struct sppp *sp, u_long src) { + INIT_VNET_INET(curvnet); STDDCL; struct ifaddr *ifa; struct sockaddr_in *si; --- /u/marko/p4/head/src/sys/net/if_stf.c 2007-10-29 17:17:42.000000000 +0100 +++ src/sys/net/if_stf.c 2007-12-10 11:26:09.000000000 +0100 @@ -77,6 +77,7 @@ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_mac.h" +#include "opt_vimage.h" #include #include @@ -91,7 
+92,9 @@ #include #include +#include +#include #include #include #include @@ -99,6 +102,7 @@ #include #include +#include #include #include #include @@ -361,6 +365,7 @@ stf_getsrcifa6(ifp) struct ifnet *ifp; { + INIT_VNET_INET(ifp->if_vnet); struct ifaddr *ia; struct in_ifaddr *ia4; struct sockaddr_in6 *sin6; @@ -555,6 +560,7 @@ struct in_addr *in; struct ifnet *inifp; /* incoming interface */ { + INIT_VNET_INET(curvnet); struct in_ifaddr *ia4; /* @@ -578,7 +584,7 @@ /* * reject packets with broadcast */ - for (ia4 = TAILQ_FIRST(&in_ifaddrhead); + for (ia4 = TAILQ_FIRST(&V_in_ifaddrhead); ia4; ia4 = TAILQ_NEXT(ia4, ia_link)) { --- /u/marko/p4/head/src/sys/net/if_tap.c 2007-08-31 03:47:48.000000000 +0200 +++ src/sys/net/if_tap.c 2007-10-22 18:06:38.000000000 +0200 @@ -37,6 +37,7 @@ #include "opt_compat.h" #include "opt_inet.h" +#include "opt_vimage.h" #include #include @@ -58,7 +59,9 @@ #include #include #include +#include +#include #include #include #include --- /u/marko/p4/head/src/sys/net/if_tun.c 2007-10-29 17:17:42.000000000 +0100 +++ src/sys/net/if_tun.c 2007-12-10 11:26:09.000000000 +0100 @@ -21,6 +21,7 @@ #include "opt_inet6.h" #include "opt_ipx.h" #include "opt_mac.h" +#include "opt_vimage.h" #include #include @@ -43,6 +44,7 @@ #include #include #include +#include #include #include @@ -224,6 +226,7 @@ else append_unit = 0; + CURVNET_SET(TD_TO_VNET(curthread)); /* find any existing device, or allocate new unit number */ i = clone_create(&tunclones, &tun_cdevsw, &u, dev, 0); if (i) { @@ -242,6 +245,7 @@ } if_clone_create(name, namelen, NULL); + CURVNET_RESTORE(); } static void @@ -253,6 +257,7 @@ KASSERT((tp->tun_flags & TUN_OPEN) == 0, ("tununits is out of sync - unit %d", TUN2IFP(tp)->if_dunit)); + CURVNET_SET(TUN2IFP(tp)->if_vnet); dev = tp->tun_dev; bpfdetach(TUN2IFP(tp)); if_detach(TUN2IFP(tp)); @@ -261,6 +266,7 @@ knlist_destroy(&tp->tun_rsel.si_note); mtx_destroy(&tp->tun_mtx); free(tp, M_TUN); + CURVNET_RESTORE(); } static void @@ -447,6 +453,7 @@ /* * 
junk all pending output */ + CURVNET_SET(ifp->if_vnet); s = splimp(); IFQ_PURGE(&ifp->if_snd); splx(s); @@ -476,6 +483,7 @@ ifp->if_drv_flags &= ~IFF_DRV_RUNNING; splx(s); } + CURVNET_RESTORE(); funsetown(&tp->tun_sigio); selwakeuppri(&tp->tun_rsel, PZERO + 1); @@ -924,7 +932,9 @@ random_harvest(m, 16, 3, 0, RANDOM_NET); ifp->if_ibytes += m->m_pkthdr.len; ifp->if_ipackets++; + CURVNET_SET(ifp->if_vnet); netisr_dispatch(isr, m); + CURVNET_RESTORE(); return (0); } --- /u/marko/p4/head/src/sys/net/if_var.h 2007-12-27 19:32:15.000000000 +0100 +++ src/sys/net/if_var.h 2008-01-14 19:23:47.000000000 +0100 @@ -70,6 +70,7 @@ struct ether_header; struct carp_if; struct ifvlantrunk; +struct vnet; #endif #include /* get TAILQ macros */ @@ -160,6 +161,10 @@ (void *); int (*if_resolvemulti) /* validate/resolve multicast */ (struct ifnet *, struct sockaddr **, struct sockaddr *); + void (*if_reassign) /* reassign to vnet routine */ + (struct ifnet *, struct vnet *, char *); + struct vnet *if_vnet; /* network stack instance */ + struct vnet *if_home_vnet; /* where this ifnet originates from */ struct ifaddr *if_addr; /* pointer to link-level address */ void *if_llsoftc; /* link layer softc */ int if_drv_flags; /* driver-managed status flags */ @@ -644,20 +649,22 @@ struct cdev *ife_dev; }; -#define ifnet_byindex(idx) ifindex_table[(idx)].ife_ifnet +#define ifnet_byindex(idx) V_ifindex_table[(idx)].ife_ifnet /* * Given the index, ifaddr_byindex() returns the one and only * link-level ifaddr for the interface. You are not supposed to use * it to traverse the list of addresses associated to the interface. 
*/ #define ifaddr_byindex(idx) ifnet_byindex(idx)->if_addr -#define ifdev_byindex(idx) ifindex_table[(idx)].ife_dev +#define ifdev_byindex(idx) V_ifindex_table[(idx)].ife_dev +extern int ifqmaxlen; +#ifndef VIMAGE extern struct ifnethead ifnet; extern struct ifindex_entry *ifindex_table; -extern int ifqmaxlen; -extern struct ifnet *loif; /* first loopback interface */ extern int if_index; +extern struct ifnet *loif; /* first loopback interface */ +#endif /* !VIMAGE */ int if_addgroup(struct ifnet *, const char *); int if_delgroup(struct ifnet *, const char *); @@ -665,6 +672,7 @@ int if_allmulti(struct ifnet *, int); struct ifnet* if_alloc(u_char); void if_attach(struct ifnet *); +void if_grow(void); int if_delmulti(struct ifnet *, struct sockaddr *); void if_delmulti_ifma(struct ifmultiaddr *); void if_detach(struct ifnet *); --- /u/marko/p4/head/src/sys/net/if_vlan.c 2007-10-20 18:52:07.000000000 +0200 +++ src/sys/net/if_vlan.c 2007-10-22 18:06:38.000000000 +0200 @@ -42,6 +42,7 @@ */ #include "opt_vlan.h" +#include "opt_vimage.h" #include #include @@ -55,7 +56,9 @@ #include #include #include +#include +#include #include #include #include @@ -421,6 +424,8 @@ sc = ifp->if_softc; ifp_p = PARENT(sc); + CURVNET_SET_QUIET(ifp_p->if_vnet); + bzero((char *)&sdl, sizeof(sdl)); sdl.sdl_len = sizeof(sdl); sdl.sdl_family = AF_LINK; @@ -455,6 +460,7 @@ return (error); } + CURVNET_RESTORE(); return (0); } @@ -572,13 +578,14 @@ static struct ifnet * vlan_clone_match_ethertag(struct if_clone *ifc, const char *name, int *tag) { + INIT_VNET_NET(curvnet); const char *cp; struct ifnet *ifp; int t = 0; /* Check for . style interface names. 
*/ IFNET_RLOCK(); - TAILQ_FOREACH(ifp, &ifnet, if_link) { + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (ifp->if_type != IFT_ETHER) continue; if (strncmp(ifp->if_xname, name, strlen(ifp->if_xname)) != 0) @@ -1345,6 +1352,12 @@ error = copyin(ifr->ifr_data, &vlr, sizeof(vlr)); if (error) break; +#ifdef VIMAGE + if (ifp->if_home_vnet != ifp->if_vnet) { + error = EPERM; + break; + } +#endif if (vlr.vlr_parent[0] == '\0') { vlan_unconfig(ifp); break; @@ -1372,6 +1385,12 @@ case SIOCGETVLAN: bzero(&vlr, sizeof(vlr)); +#ifdef VIMAGE + if (ifp->if_home_vnet != ifp->if_vnet) { + error = EPERM; + break; + } +#endif VLAN_LOCK(); if (TRUNK(ifv) != NULL) { strlcpy(vlr.vlr_parent, PARENT(ifv)->if_xname, --- /u/marko/p4/head/src/sys/net/netisr.c 2007-08-31 03:47:49.000000000 +0200 +++ src/sys/net/netisr.c 2007-10-22 18:06:38.000000000 +0200 @@ -28,6 +28,7 @@ */ #include "opt_device_polling.h" +#include "opt_vimage.h" #include #include @@ -49,7 +50,9 @@ #include #include +#include +#include #include #include #include @@ -140,7 +143,10 @@ IF_DEQUEUE(ni->ni_queue, m); if (m == NULL) break; + VNET_ASSERT(m->m_pkthdr.rcvif != NULL); + CURVNET_SET(m->m_pkthdr.rcvif->if_vnet); ni->ni_handler(m); + CURVNET_RESTORE(); } } @@ -161,6 +167,7 @@ m_freem(m); return; } + VNET_ASSERT(m->m_pkthdr.rcvif != NULL) /* * Do direct dispatch only for MPSAFE netisrs (and * only when enabled). Note that when a netisr is @@ -172,8 +179,19 @@ * from an interface but does not guarantee ordering * between multiple places in the system (e.g. IP * dispatched from interfaces vs. IP queued from IPSec). + * + * If the kernel was compiled with options VIMAGE, also defer + * dispatch of netisr handlers for mbufs that have crossed a + * boundary between two vnets. Direct dispatching in such + * cases could lead to various LORs, or in most extreme + * circumstances cause the kernel stack to overflow. 
*/ +#ifndef VIMAGE if (netisr_direct && (ni->ni_flags & NETISR_MPSAFE)) { +#else + if (netisr_direct && (ni->ni_flags & NETISR_MPSAFE) && + !(m->m_flags & M_REMOTE_VNET)) { +#endif isrstat.isrs_directed++; /* * NB: We used to drain the queue before handling @@ -184,6 +202,15 @@ */ ni->ni_handler(m); } else { +#ifdef VIMAGE + /* + * Once direct netisr dispatching is avoided using the + * M_REMOTE_VNET flag, it should not be observed any + * more, so clear it here in order to avoid further + * deferring of direct netisr dispatching. + */ + m->m_flags &= ~M_REMOTE_VNET; +#endif isrstat.isrs_deferred++; if (IF_HANDOFF(ni->ni_queue, m, NULL)) schednetisr(num); @@ -210,6 +237,10 @@ m_freem(m); return (ENXIO); } + VNET_ASSERT(m->m_pkthdr.rcvif != NULL) +#ifdef VIMAGE + m->m_flags &= ~M_REMOTE_VNET; +#endif isrstat.isrs_queued++; if (!IF_HANDOFF(ni->ni_queue, m, NULL)) return (ENOBUFS); /* IF_HANDOFF has free'd the mbuf */ --- /u/marko/p4/head/src/sys/net/raw_cb.c 2007-08-31 03:47:49.000000000 +0200 +++ src/sys/net/raw_cb.c 2007-10-22 18:06:38.000000000 +0200 @@ -30,6 +30,8 @@ * $FreeBSD: src/sys/net/raw_cb.c,v 1.34 2006/06/02 08:27:15 rwatson Exp $ */ +#include "opt_vimage.h" + #include #include #include @@ -39,7 +41,9 @@ #include #include #include +#include +#include #include /* @@ -52,7 +56,9 @@ */ struct mtx rawcb_mtx; +#ifndef VIMAGE struct rawcb_list_head rawcb_list; +#endif const static u_long raw_sendspace = RAWSNDQ; const static u_long raw_recvspace = RAWRCVQ; @@ -66,6 +72,7 @@ register struct socket *so; int proto; { + INIT_VNET_NET(so->so_vnet); register struct rawcb *rp = sotorawcb(so); int error; @@ -83,7 +90,7 @@ rp->rcb_proto.sp_family = so->so_proto->pr_domain->dom_family; rp->rcb_proto.sp_protocol = proto; mtx_lock(&rawcb_mtx); - LIST_INSERT_HEAD(&rawcb_list, rp, list); + LIST_INSERT_HEAD(&V_rawcb_list, rp, list); mtx_unlock(&rawcb_mtx); return (0); } --- /u/marko/p4/head/src/sys/net/raw_cb.h 2007-08-31 03:47:49.000000000 +0200 +++ src/sys/net/raw_cb.h
2007-10-05 12:26:49.000000000 +0200 @@ -56,7 +56,11 @@ #define RAWRCVQ 8192 #ifdef _KERNEL + +#ifndef VIMAGE extern LIST_HEAD(rawcb_list_head, rawcb) rawcb_list; +#endif + extern struct mtx rawcb_mtx; /* protosw entries */ --- /u/marko/p4/head/src/sys/net/raw_usrreq.c 2007-08-31 03:47:49.000000000 +0200 +++ src/sys/net/raw_usrreq.c 2007-10-22 18:06:38.000000000 +0200 @@ -30,6 +30,8 @@ * $FreeBSD: src/sys/net/raw_usrreq.c,v 1.44 2006/11/06 13:42:02 rwatson Exp $ */ +#include "opt_vimage.h" + #include #include #include @@ -43,7 +45,9 @@ #include #include #include +#include +#include #include MTX_SYSINIT(rawcb_mtx, &rawcb_mtx, "rawcb", MTX_DEF); @@ -54,8 +58,11 @@ void raw_init() { +#ifndef VIMAGE + INIT_VNET_NET(curvnet); - LIST_INIT(&rawcb_list); + LIST_INIT(&V_rawcb_list); +#endif } @@ -73,13 +80,14 @@ register struct sockproto *proto; struct sockaddr *src, *dst; { + INIT_VNET_NET(curvnet); register struct rawcb *rp; register struct mbuf *m = m0; struct socket *last; last = 0; mtx_lock(&rawcb_mtx); - LIST_FOREACH(rp, &rawcb_list, list) { + LIST_FOREACH(rp, &V_rawcb_list, list) { if (rp->rcb_proto.sp_family != proto->sp_family) continue; if (rp->rcb_proto.sp_protocol && --- /u/marko/p4/head/src/sys/net/route.c 2008-02-27 18:29:00.000000000 +0100 +++ src/sys/net/route.c 2008-02-27 11:48:38.000000000 +0100 @@ -32,6 +32,7 @@ #include "opt_inet.h" #include "opt_mrouting.h" +#include "opt_vimage.h" #include #include @@ -40,7 +41,9 @@ #include #include #include +#include +#include #include #include @@ -49,14 +52,18 @@ #include +#ifndef VIMAGE static struct rtstat rtstat; struct radix_node_head *rt_tables[AF_MAX+1]; - static int rttrash; /* routes not in table but not freed */ +#endif /* !VIMAGE */ static void rt_maskedcopy(struct sockaddr *, struct sockaddr *, struct sockaddr *); -static void rtable_init(void **); +static int rtable_init(const void *); +#ifdef VIMAGE +static int rtable_idetach(const void *); +#endif /* compare two sockaddr structures */ #define 
sa_equal(a1, a2) (bcmp((a1), (a2), (a1)->sa_len) == 0) @@ -73,15 +80,38 @@ */ #define RNTORT(p) ((struct rtentry *)(p)) -static void -rtable_init(void **table) +VNET_MOD_DECLARE_STATELESS(RTABLE, rtable, rtable_init, rtable_idetach, + NET) + +static int +rtable_init(unused) + const void *unused; { + INIT_VNET_NET(curvnet); + struct domain *dom; for (dom = domains; dom; dom = dom->dom_next) if (dom->dom_rtattach) - dom->dom_rtattach(&table[dom->dom_family], + dom->dom_rtattach((void *)&V_rt_tables[dom->dom_family], + dom->dom_rtoffset); + return 0; +} + +#ifdef VIMAGE +static int +rtable_idetach(unused) + const void *unused; +{ + INIT_VNET_NET(curvnet); + + struct domain *dom; + for (dom = domains; dom; dom = dom->dom_next) + if (dom->dom_rtdetach) + dom->dom_rtdetach((void *)&V_rt_tables[dom->dom_family], dom->dom_rtoffset); + return 0; } +#endif static uma_zone_t rtzone; /* Routing table UMA zone. */ @@ -91,7 +121,11 @@ rtzone = uma_zcreate("rtentry", sizeof(struct rtentry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); rn_init(); /* initialize all zeroes, all ones, mask table */ - rtable_init((void **)rt_tables); +#ifdef VIMAGE + vnet_mod_register(&vnet_rtable_modinfo); +#else + rtable_init(NULL); +#endif } /* @@ -128,7 +162,8 @@ struct rtentry * rtalloc1(struct sockaddr *dst, int report, u_long ignflags) { - struct radix_node_head *rnh = rt_tables[dst->sa_family]; + INIT_VNET_NET(curvnet); + struct radix_node_head *rnh = V_rt_tables[dst->sa_family]; struct rtentry *rt; struct radix_node *rn; struct rtentry *newrt; @@ -141,7 +176,7 @@ * Look up the address in the table for that Address Family */ if (rnh == NULL) { - rtstat.rts_unreach++; + V_rtstat.rts_unreach++; goto miss2; } RADIX_NODE_HEAD_LOCK(rnh); @@ -203,7 +238,7 @@ * Which basically means * "caint get there frm here" */ - rtstat.rts_unreach++; + V_rtstat.rts_unreach++; miss: RADIX_NODE_HEAD_UNLOCK(rnh); miss2: if (report) { @@ -229,10 +264,11 @@ void rtfree(struct rtentry *rt) { + INIT_VNET_NET(curvnet); 
struct radix_node_head *rnh; KASSERT(rt != NULL,("%s: NULL rt", __func__)); - rnh = rt_tables[rt_key(rt)->sa_family]; + rnh = V_rt_tables[rt_key(rt)->sa_family]; KASSERT(rnh != NULL,("%s: NULL rnh", __func__)); RT_LOCK_ASSERT(rt); @@ -271,7 +307,7 @@ * the rtentry must have been removed from the routing table * so it is represented in rttrash.. remove that now. */ - rttrash--; + V_rttrash--; #ifdef DIAGNOSTIC if (rt->rt_refcnt < 0) { printf("rtfree: %p not freed (neg refs)\n", rt); @@ -318,6 +354,7 @@ int flags, struct sockaddr *src) { + INIT_VNET_NET(curvnet); struct rtentry *rt, *rt0 = NULL; int error = 0; short *stat = NULL; @@ -381,7 +418,7 @@ if (rt0) RTFREE_LOCKED(rt0); - stat = &rtstat.rts_dynamic; + stat = &V_rtstat.rts_dynamic; } else { struct rtentry *gwrt; @@ -391,7 +428,7 @@ */ rt->rt_flags |= RTF_MODIFIED; flags |= RTF_MODIFIED; - stat = &rtstat.rts_newgateway; + stat = &V_rtstat.rts_newgateway; /* * add the key and gateway (in one malloc'd chunk). */ @@ -407,7 +444,7 @@ RTFREE_LOCKED(rt); out: if (error) - rtstat.rts_badredirect++; + V_rtstat.rts_badredirect++; else if (stat != NULL) (*stat)++; bzero((caddr_t)&info, sizeof(info)); @@ -591,6 +628,7 @@ int rtexpunge(struct rtentry *rt) { + INIT_VNET_NET(curvnet); struct radix_node *rn; struct radix_node_head *rnh; struct ifaddr *ifa; @@ -608,7 +646,7 @@ /* * Find the correct routing tree to use for this Address Family */ - rnh = rt_tables[rt_key(rt)->sa_family]; + rnh = V_rt_tables[rt_key(rt)->sa_family]; if (rnh == NULL) return (EAFNOSUPPORT); @@ -666,7 +704,7 @@ * one more rtentry floating around that is not * linked to the routing table. 
*/ - rttrash++; + V_rttrash++; bad: RADIX_NODE_HEAD_UNLOCK(rnh); return (error); @@ -675,6 +713,7 @@ int rtrequest1(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt) { + INIT_VNET_NET(curvnet); int error = 0; register struct rtentry *rt; register struct radix_node *rn; @@ -686,7 +725,7 @@ /* * Find the correct routing tree to use for this Address Family */ - rnh = rt_tables[dst->sa_family]; + rnh = V_rt_tables[dst->sa_family]; if (rnh == NULL) return (EAFNOSUPPORT); RADIX_NODE_HEAD_LOCK(rnh); @@ -745,7 +784,7 @@ * linked to the routing table. rttrash will be decremented * when RTFREE(rt) is eventually called. */ - rttrash++; + V_rttrash++; /* * If the caller wants it, then it can have it, @@ -1021,8 +1060,9 @@ int rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate) { + INIT_VNET_NET(curvnet); /* XXX dst may be overwritten, can we move this to below */ - struct radix_node_head *rnh = rt_tables[dst->sa_family]; + struct radix_node_head *rnh = V_rt_tables[dst->sa_family]; int dlen = SA_SIZE(dst), glen = SA_SIZE(gate); again: @@ -1161,6 +1201,7 @@ int rtinit(struct ifaddr *ifa, int cmd, int flags) { + INIT_VNET_NET(curvnet); struct sockaddr *dst; struct sockaddr *netmask; struct mbuf *m = NULL; @@ -1205,7 +1246,7 @@ * Look up an rtentry that is in the routing tree and * contains the correct info. 
*/ - if ((rnh = rt_tables[dst->sa_family]) == NULL) + if ((rnh = V_rt_tables[dst->sa_family]) == NULL) goto bad; RADIX_NODE_HEAD_LOCK(rnh); error = ((rn = rnh->rnh_lookup(dst, netmask, rnh)) == NULL || --- /u/marko/p4/head/src/sys/net/rtsock.c 2007-09-08 22:08:44.000000000 +0200 +++ src/sys/net/rtsock.c 2007-10-22 18:06:38.000000000 +0200 @@ -29,7 +29,10 @@ * @(#)rtsock.c 8.7 (Berkeley) 10/12/95 * $FreeBSD: src/sys/net/rtsock.c,v 1.143 2007/09/08 19:28:45 cognet Exp $ */ + #include "opt_sctp.h" +#include "opt_vimage.h" + #include #include #include @@ -44,7 +47,9 @@ #include #include #include +#include +#include #include #include #include @@ -312,6 +317,7 @@ route_output(struct mbuf *m, struct socket *so) { #define sa_equal(a1, a2) (bcmp((a1), (a2), (a1)->sa_len) == 0) + INIT_VNET_NET(so->so_vnet); struct rt_msghdr *rtm = NULL; struct rtentry *rt = NULL; struct radix_node_head *rnh; @@ -410,7 +416,7 @@ case RTM_GET: case RTM_CHANGE: case RTM_LOCK: - rnh = rt_tables[info.rti_info[RTAX_DST]->sa_family]; + rnh = V_rt_tables[info.rti_info[RTAX_DST]->sa_family]; if (rnh == NULL) senderr(EAFNOSUPPORT); RADIX_NODE_HEAD_LOCK(rnh); @@ -1052,6 +1058,7 @@ static void rt_dispatch(struct mbuf *m, const struct sockaddr *sa) { + INIT_VNET_NET(curvnet); struct m_tag *tag; /* @@ -1069,6 +1076,14 @@ *(unsigned short *)(tag + 1) = sa->sa_family; m_tag_prepend(m, tag); } +#ifdef VIMAGE + if (V_loif) + m->m_pkthdr.rcvif = V_loif; + else { + m_freem(m); + return; + } +#endif netisr_queue(NETISR_ROUTE, m); /* mbuf is free'd on failure. 
*/ } @@ -1115,6 +1130,7 @@ static int sysctl_iflist(int af, struct walkarg *w) { + INIT_VNET_NET(curvnet); struct ifnet *ifp; struct ifaddr *ifa; struct rt_addrinfo info; @@ -1122,7 +1138,7 @@ bzero((caddr_t)&info, sizeof(info)); IFNET_RLOCK(); - TAILQ_FOREACH(ifp, &ifnet, if_link) { + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (w->w_arg && w->w_arg != ifp->if_index) continue; ifa = ifp->if_addr; @@ -1175,6 +1191,7 @@ int sysctl_ifmalist(int af, struct walkarg *w) { + INIT_VNET_NET(curvnet); struct ifnet *ifp; struct ifmultiaddr *ifma; struct rt_addrinfo info; @@ -1183,7 +1200,7 @@ bzero((caddr_t)&info, sizeof(info)); IFNET_RLOCK(); - TAILQ_FOREACH(ifp, &ifnet, if_link) { + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (w->w_arg && w->w_arg != ifp->if_index) continue; ifa = ifp->if_addr; @@ -1224,6 +1241,7 @@ static int sysctl_rtsock(SYSCTL_HANDLER_ARGS) { + INIT_VNET_NET(curvnet); int *name = (int *)arg1; u_int namelen = arg2; struct radix_node_head *rnh; @@ -1258,7 +1276,7 @@ } else /* dump only one table */ i = lim = af; for (error = 0; error == 0 && i <= lim; i++) - if ((rnh = rt_tables[i]) != NULL) { + if ((rnh = V_rt_tables[i]) != NULL) { RADIX_NODE_HEAD_LOCK(rnh); error = rnh->rnh_walktree(rnh, sysctl_dumpentry, &w); --- /dev/null 2008-02-27 21:11:00.000000000 +0100 +++ src/sys/net/vnet.h 2007-10-05 12:26:49.000000000 +0200 @@ -0,0 +1,92 @@ +/*- + * Copyright (c) 2006 University of Zagreb + * Copyright (c) 2006 FreeBSD Foundation + * + * This software was developed by the University of Zagreb and the + * FreeBSD Foundation under sponsorship by the Stichting NLnet and the + * FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * XXX RCS tag goes here + */ + +#ifndef _NET_VNET_H_ +#define _NET_VNET_H_ + + +#ifdef VIMAGE +#include +#include +#include + +#include +#include +#include +#include + +struct vnet_net { + int _if_index; + struct ifindex_entry *_ifindex_table; + struct ifnethead _ifnet; + struct ifgrouphead _ifg_head; + + int _if_indexlim; + struct knlist _ifklist; + + struct rtstat _rtstat; + struct radix_node_head *_rt_tables[AF_MAX+1]; + int _rttrash; + + struct ifnet *_loif; + LIST_HEAD(, lo_softc) _lo_list; + + LIST_HEAD(, rawcb) _rawcb_list; + + int _ether_ipfw; +}; + +#endif + +/* + * Symbol translation macros + */ +#define INIT_VNET_NET(vnet) \ + INIT_FROM_VNET(vnet, VNET_MOD_NET, struct vnet_net, vnet_net) + +#define VNET_NET(sym) VSYM(vnet_net, sym) + +#define V_if_index VNET_NET(if_index) +#define V_ifindex_table VNET_NET(ifindex_table) +#define V_ifnet VNET_NET(ifnet) +#define V_ifg_head VNET_NET(ifg_head) +#define V_if_indexlim VNET_NET(if_indexlim) +#define V_ifklist VNET_NET(ifklist) 
+#define V_rtstat VNET_NET(rtstat) +#define V_rt_tables VNET_NET(rt_tables) +#define V_rttrash VNET_NET(rttrash) +#define V_loif VNET_NET(loif) +#define V_lo_list VNET_NET(lo_list) +#define V_rawcb_list VNET_NET(rawcb_list) +#define V_ether_ipfw VNET_NET(ether_ipfw) + +#endif /* !_NET_VNET_H_ */ --- /u/marko/p4/head/src/sys/net80211/ieee80211.c 2007-12-27 19:32:16.000000000 +0100 +++ src/sys/net80211/ieee80211.c 2008-01-14 19:23:50.000000000 +0100 @@ -31,14 +31,19 @@ * IEEE 802.11 generic handler */ +#include "opt_vimage.h" + #include #include #include #include +#include #include +#include #include +#include #include #include @@ -77,6 +82,7 @@ static int media_status(enum ieee80211_opmode , const struct ieee80211_channel *); +static struct ieee80211com * ieee80211_find_instance(struct ifnet *ifp); /* list of all instances */ SLIST_HEAD(ieee80211_list, ieee80211com); @@ -213,6 +219,9 @@ ether_ifattach(ifp, ic->ic_myaddr); ifp->if_output = ieee80211_output; +#ifdef VIMAGE + ifp->if_reassign = NULL; /* Override ether_reassign() */ +#endif bpfattach2(ifp, DLT_IEEE802_11, sizeof(struct ieee80211_frame_addr4), &ic->ic_rawbpf); @@ -303,6 +312,30 @@ ether_ifdetach(ifp); } +#ifdef VIMAGE +void +ieee80211_reassign(struct ieee80211com *ic, struct vnet *vnet, char *dname) +{ + u_char eaddr[6]; + struct ifnet *ifp = ic->ic_ifp; + + bcopy(IF_LLADDR(ifp), eaddr, 6); + bpfdetach(ifp); + ether_ifdetach(ifp); + ifp->if_bpf = NULL; + ic->ic_rawbpf = NULL; + if_reassign_common(ifp, vnet, ifp->if_dname); + if (dname) + snprintf(ifp->if_xname, IFNAMSIZ, "%s", dname); + + CURVNET_SET_QUIET(vnet); + ether_ifattach(ifp, eaddr); + bpfattach2(ifp, DLT_IEEE802_11, + sizeof(struct ieee80211_frame_addr4), &ic->ic_rawbpf); + CURVNET_RESTORE(); +} +#endif + static __inline int mapgsm(u_int freq, u_int flags) { --- /u/marko/p4/head/src/sys/net80211/ieee80211_freebsd.c 2007-11-13 02:49:09.000000000 +0100 +++ src/sys/net80211/ieee80211_freebsd.c 2007-12-10 11:26:09.000000000 +0100 @@ -26,6 +26,8 @@ 
#include __FBSDID("$FreeBSD: src/sys/net80211/ieee80211_freebsd.c,v 1.17 2007/11/02 05:22:24 sam Exp $"); +#include "opt_vimage.h" + /* * IEEE 802.11 support (FreeBSD-specific code) */ @@ -37,8 +39,8 @@ #include #include #include - #include +#include #include #include @@ -296,6 +298,7 @@ struct ifnet *ifp = ic->ic_ifp; struct ieee80211_join_event iev; + CURVNET_SET(ifp->if_vnet); memset(&iev, 0, sizeof(iev)); if (ni == ic->ic_bss) { IEEE80211_ADDR_COPY(iev.iev_addr, ni->ni_bssid); @@ -309,6 +312,7 @@ RTM_IEEE80211_JOIN : RTM_IEEE80211_REJOIN, &iev, sizeof(iev)); } + CURVNET_RESTORE(); } void @@ -317,6 +321,7 @@ struct ifnet *ifp = ic->ic_ifp; struct ieee80211_leave_event iev; + CURVNET_SET_QUIET(ifp->if_vnet); if (ni == ic->ic_bss) { rt_ieee80211msg(ifp, RTM_IEEE80211_DISASSOC, NULL, 0); if_link_state_change(ifp, LINK_STATE_DOWN); @@ -326,6 +331,7 @@ IEEE80211_ADDR_COPY(iev.iev_addr, ni->ni_macaddr); rt_ieee80211msg(ifp, RTM_IEEE80211_LEAVE, &iev, sizeof(iev)); } + CURVNET_RESTORE(); } void @@ -336,7 +342,9 @@ IEEE80211_DPRINTF(ic, IEEE80211_MSG_SCAN, "%s\n", "notify scan done"); /* dispatch wireless event indicating scan completed */ + CURVNET_SET(ifp->if_vnet); rt_ieee80211msg(ifp, RTM_IEEE80211_SCAN, NULL, 0); + CURVNET_RESTORE(); } void @@ -364,7 +372,9 @@ iev.iev_keyix = k->wk_keyix; iev.iev_keyrsc = k->wk_keyrsc; iev.iev_rsc = rsc; + CURVNET_SET(ifp->if_vnet); rt_ieee80211msg(ifp, RTM_IEEE80211_REPLAY, &iev, sizeof(iev)); + CURVNET_RESTORE(); } } @@ -386,7 +396,9 @@ IEEE80211_ADDR_COPY(iev.iev_src, wh->i_addr2); iev.iev_cipher = IEEE80211_CIPHER_TKIP; iev.iev_keyix = keyix; + CURVNET_SET(ifp->if_vnet); rt_ieee80211msg(ifp, RTM_IEEE80211_MICHAEL, &iev, sizeof(iev)); + CURVNET_RESTORE(); } } --- /u/marko/p4/head/src/sys/net80211/ieee80211_ioctl.c 2007-11-13 02:49:09.000000000 +0100 +++ src/sys/net80211/ieee80211_ioctl.c 2007-12-10 11:26:09.000000000 +0100 @@ -35,6 +35,7 @@ #include "opt_inet.h" #include "opt_ipx.h" +#include "opt_vimage.h" #include #include @@ 
-43,6 +44,7 @@ #include #include #include +#include #include #include --- /u/marko/p4/head/src/sys/net80211/ieee80211_var.h 2007-11-27 15:48:32.000000000 +0100 +++ src/sys/net80211/ieee80211_var.h 2007-12-10 11:26:10.000000000 +0100 @@ -401,6 +401,8 @@ void ieee80211_ifattach(struct ieee80211com *); void ieee80211_ifdetach(struct ieee80211com *); +void ieee80211_reassign(struct ieee80211com *, struct vnet *, char *); + const struct ieee80211_rateset *ieee80211_get_suprates(struct ieee80211com *ic, const struct ieee80211_channel *); void ieee80211_announce(struct ieee80211com *); --- /u/marko/p4/head/src/sys/netgraph/netgraph.h 2008-01-31 10:37:19.000000000 +0100 +++ src/sys/netgraph/netgraph.h 2008-02-27 11:48:48.000000000 +0100 @@ -351,6 +351,7 @@ LIST_ENTRY(ng_node) nd_idnodes; /* ID hash collision list */ TAILQ_ENTRY(ng_node) nd_work; /* nodes with work to do */ struct ng_queue nd_input_queue; /* input queue for locking */ + struct vnet *nd_vnet; /* network stack instance */ #ifdef NETGRAPH_DEBUG /*----------------------------------------------*/ #define ND_MAGIC 0x59264837 int nd_magic; @@ -1123,6 +1124,7 @@ struct ng_type *ng_findtype(const char *type); int ng_make_node_common(struct ng_type *typep, node_p *nodep); int ng_name_node(node_p node, const char *name); +node_p ng_name2noderef(node_p node, const char *name); int ng_newtype(struct ng_type *tp); ng_ID_t ng_node2ID(node_p node); item_p ng_package_data(struct mbuf *m, int flags); --- /u/marko/p4/head/src/sys/netgraph/ng_base.c 2008-02-27 18:29:04.000000000 +0100 +++ src/sys/netgraph/ng_base.c 2008-02-27 17:58:56.000000000 +0100 @@ -46,6 +46,8 @@ * This file implements the base netgraph code. 
*/ +#include "opt_vimage.h" + #include #include #include @@ -61,9 +63,11 @@ #include #include #include +#include #include +#include #include #include #include @@ -71,7 +75,9 @@ MODULE_VERSION(netgraph, NG_ABI_VERSION); /* List of all active nodes */ +#ifndef VIMAGE static LIST_HEAD(, ng_node) ng_nodelist; +#endif static struct mtx ng_nodelist_mtx; /* Mutex to protect topology events. */ @@ -88,8 +94,8 @@ static void ng_dumpitems(void); static void ng_dumpnodes(void); static void ng_dumphooks(void); - #endif /* NETGRAPH_DEBUG */ + /* * DEAD versions of the structures. * In order to avoid races, it is sometimes neccesary to point @@ -169,15 +175,16 @@ /* Hash related definitions */ /* XXX Don't need to initialise them because it's a LIST */ -#define NG_ID_HASH_SIZE 32 /* most systems wont need even this many */ +#ifndef VIMAGE static LIST_HEAD(, ng_node) ng_ID_hash[NG_ID_HASH_SIZE]; +#endif static struct mtx ng_idhash_mtx; /* Method to find a node.. used twice so do it here */ #define NG_IDHASH_FN(ID) ((ID) % (NG_ID_HASH_SIZE)) #define NG_IDHASH_FIND(ID, node) \ do { \ mtx_assert(&ng_idhash_mtx, MA_OWNED); \ - LIST_FOREACH(node, &ng_ID_hash[NG_IDHASH_FN(ID)], \ + LIST_FOREACH(node, &V_ng_ID_hash[NG_IDHASH_FN(ID)], \ nd_idnodes) { \ if (NG_NODE_IS_VALID(node) \ && (NG_NODE_ID(node) == ID)) { \ @@ -207,7 +214,6 @@ /* Imported, these used to be externally visible, some may go back. 
*/ void ng_destroy_hook(hook_p hook); -node_p ng_name2noderef(node_p node, const char *name); int ng_path2noderef(node_p here, const char *path, node_p *dest, hook_p *lasthook); int ng_make_node(const char *type, node_p *nodepp); @@ -243,6 +249,14 @@ #define NG_WORKLIST_UNLOCK() \ mtx_unlock(&ng_worklist_mtx) +static vnet_attach_fn vnet_netgraph_iattach; +#ifdef VIMAGE +static vnet_detach_fn vnet_netgraph_idetach; +#endif /* VIMAGE */ + +VNET_MOD_DECLARE(NETGRAPH, netgraph, vnet_netgraph_iattach, + vnet_netgraph_idetach, LOIF, NULL) + #ifdef NETGRAPH_DEBUG /*----------------------------------------------*/ /* * In debug mode: @@ -341,7 +355,9 @@ #define TRAP_ERROR() #endif -static ng_ID_t nextID = 1; +#ifndef VIMAGE +static ng_ID_t nextID; +#endif #ifdef INVARIANTS #define CHECK_DATA_MBUF(m) do { \ @@ -565,7 +581,8 @@ return (EINVAL); } - /* Locate the node type. If we fail we return. Do not try to load + /* + * Locate the node type. If we fail we return. Do not try to load * module. */ if ((type = ng_findtype(typename)) == NULL) @@ -603,6 +620,7 @@ int ng_make_node_common(struct ng_type *type, node_p *nodepp) { + INIT_VNET_NETGRAPH(curvnet); node_p node; /* Require the node type to have been already installed */ @@ -618,6 +636,9 @@ return (ENOMEM); } node->nd_type = type; +#ifdef VIMAGE + node->nd_vnet = curvnet; +#endif NG_NODE_REF(node); /* note reference */ type->refs++; @@ -632,7 +653,7 @@ /* Link us into the node linked list */ mtx_lock(&ng_nodelist_mtx); - LIST_INSERT_HEAD(&ng_nodelist, node, nd_nodes); + LIST_INSERT_HEAD(&V_ng_nodelist, node, nd_nodes); mtx_unlock(&ng_nodelist_mtx); @@ -640,7 +661,7 @@ mtx_lock(&ng_idhash_mtx); for (;;) { /* wrap protection, even if silly */ node_p node2 = NULL; - node->nd_ID = nextID++; /* 137/second for 1 year before wrap */ + node->nd_ID = V_nextID++; /* 137/sec for 1 year before wrap */ /* Is there a problem with the new number? */ NG_IDHASH_FIND(node->nd_ID, node2); /* already taken? 
*/ @@ -648,7 +669,7 @@ break; } } - LIST_INSERT_HEAD(&ng_ID_hash[NG_IDHASH_FN(node->nd_ID)], + LIST_INSERT_HEAD(&V_ng_ID_hash[NG_IDHASH_FN(node->nd_ID)], node, nd_idnodes); mtx_unlock(&ng_idhash_mtx); @@ -789,6 +810,7 @@ static node_p ng_ID2noderef(ng_ID_t ID) { + INIT_VNET_NETGRAPH(curvnet); node_p node; mtx_lock(&ng_idhash_mtx); NG_IDHASH_FIND(ID, node); @@ -857,6 +879,7 @@ node_p ng_name2noderef(node_p here, const char *name) { + INIT_VNET_NETGRAPH(curvnet); node_p node; ng_ID_t temp; @@ -873,7 +896,7 @@ /* Find node by name */ mtx_lock(&ng_nodelist_mtx); - LIST_FOREACH(node, &ng_nodelist, nd_nodes) { + LIST_FOREACH(node, &V_ng_nodelist, nd_nodes) { if (NG_NODE_IS_VALID(node) && NG_NODE_HAS_NAME(node) && (strcmp(NG_NODE_NAME(node), name) == 0)) { @@ -2548,6 +2571,7 @@ static int ng_generic_msg(node_p here, item_p item, hook_p lasthook) { + INIT_VNET_NETGRAPH(curvnet); int error = 0; struct ng_mesg *msg; struct ng_mesg *resp = NULL; @@ -2706,7 +2730,7 @@ mtx_lock(&ng_nodelist_mtx); /* Count number of nodes */ - LIST_FOREACH(node, &ng_nodelist, nd_nodes) { + LIST_FOREACH(node, &V_ng_nodelist, nd_nodes) { if (NG_NODE_IS_VALID(node) && (unnamed || NG_NODE_HAS_NAME(node))) { num++; @@ -2726,7 +2750,7 @@ /* Cycle through the linked list of nodes */ nl->numnames = 0; mtx_lock(&ng_nodelist_mtx); - LIST_FOREACH(node, &ng_nodelist, nd_nodes) { + LIST_FOREACH(node, &V_ng_nodelist, nd_nodes) { struct nodeinfo *const np = &nl->nodeinfo[nl->numnames]; if (NG_NODE_NOT_VALID(node)) @@ -3153,6 +3177,11 @@ uma_zone_set_max(ng_qzone, maxalloc); netisr_register(NETISR_NETGRAPH, (netisr_t *)ngintr, NULL, NETISR_MPSAFE); +#ifdef VIMAGE + vnet_mod_register(&vnet_netgraph_modinfo); +#else + vnet_netgraph_iattach(NULL); +#endif /* !VIMAGE */ break; case MOD_UNLOAD: /* You can't unload it because an interface may be using it. 
*/ @@ -3165,6 +3194,42 @@ return (error); } +static int vnet_netgraph_iattach(const void *unused) +{ + INIT_VNET_NETGRAPH(curvnet); + + LIST_INIT(&V_ng_nodelist); + V_nextID = 1; + + return 0; +} + +#ifdef VIMAGE +static int vnet_netgraph_idetach(const void *unused) +{ + INIT_VNET_NETGRAPH(curvnet); + node_p node, last_killed = NULL; + + while ((node = LIST_FIRST(&V_ng_nodelist)) != NULL) { + if (node == last_killed) { + /* This should never happen */ + node->nd_flags |= NGF_REALLY_DIE; + printf("netgraph node %s needs NGF_REALLY_DIE\n", + node->nd_name); + ng_rmnode(node, NULL, NULL, 0); + /* This must never happen */ + if (node == LIST_FIRST(&V_ng_nodelist)) + panic("netgraph node %s won't die", + node->nd_name); + } + ng_rmnode(node, NULL, NULL, 0); + last_killed = node; + } + + return 0; +} +#endif /* VIMAGE */ + static moduledata_t netgraph_mod = { "netgraph", ngb_mod_event, @@ -3326,6 +3391,7 @@ NG_WORKLIST_UNLOCK(); break; } + CURVNET_SET(node->nd_vnet); node->nd_flags &= ~NGF_WORKQ; TAILQ_REMOVE(&ng_worklist, node, nd_work); NG_WORKLIST_UNLOCK(); @@ -3360,6 +3426,7 @@ } } NG_NODE_UNREF(node); + CURVNET_RESTORE(); } } @@ -3711,7 +3778,9 @@ { item_p item = arg; + CURVNET_SET(NGI_NODE(item)->nd_vnet); ng_snd_item(item, 0); + CURVNET_RESTORE(); } --- /u/marko/p4/head/src/sys/netgraph/ng_bridge.c 2007-08-31 03:47:58.000000000 +0200 +++ src/sys/netgraph/ng_bridge.c 2007-10-05 12:26:59.000000000 +0200 @@ -95,13 +95,14 @@ /* Per-node private data */ struct ng_bridge_private { struct ng_bridge_bucket *tab; /* hash table bucket array */ - struct ng_bridge_link *links[NG_BRIDGE_MAX_LINKS]; + struct ng_bridge_link *links[NG_BRIDGE_MAX_LINKS + 1]; struct ng_bridge_config conf; /* node configuration */ node_p node; /* netgraph node */ u_int numHosts; /* num entries in table */ u_int numBuckets; /* num buckets in table */ u_int hashMask; /* numBuckets - 1 */ int numLinks; /* num connected links */ + int persistent; /* can exist w/o any hooks */ struct callout timer; /* 
one second periodic timer */ }; typedef struct ng_bridge_private *priv_p; @@ -342,13 +343,13 @@ ng_bridge_newhook(node_p node, hook_p hook, const char *name) { const priv_p priv = NG_NODE_PRIVATE(node); + int linkNum = -1; /* Check for a link hook */ if (strncmp(name, NG_BRIDGE_HOOK_LINK_PREFIX, strlen(NG_BRIDGE_HOOK_LINK_PREFIX)) == 0) { const char *cp; char *eptr; - u_long linkNum; cp = name + strlen(NG_BRIDGE_HOOK_LINK_PREFIX); if (!isdigit(*cp) || (cp[0] == '0' && cp[1] != '\0')) @@ -356,6 +357,14 @@ linkNum = strtoul(cp, &eptr, 10); if (*eptr != '\0' || linkNum >= NG_BRIDGE_MAX_LINKS) return (EINVAL); + } else if (strcmp(name, "anchor") == 0) { + linkNum = NG_BRIDGE_MAX_LINKS; + if (priv->persistent) + return (EISCONN); + priv->persistent = 1; + } + + if (linkNum >= 0 ) { if (priv->links[linkNum] != NULL) return (EISCONN); MALLOC(priv->links[linkNum], struct ng_bridge_link *, @@ -366,7 +375,7 @@ NG_HOOK_SET_PRIVATE(hook, (void *)linkNum); priv->numLinks++; return (0); - } + } /* Unknown hook name */ return (EINVAL); @@ -782,7 +791,7 @@ /* Get link number */ linkNum = (intptr_t)NG_HOOK_PRIVATE(hook); - KASSERT(linkNum >= 0 && linkNum < NG_BRIDGE_MAX_LINKS, + KASSERT(linkNum >= 0 && linkNum <= NG_BRIDGE_MAX_LINKS, ("%s: linkNum=%u", __func__, linkNum)); /* Remove all hosts associated with this link */ @@ -796,7 +805,8 @@ /* If no more hooks, go away */ if ((NG_NODE_NUMHOOKS(NG_HOOK_NODE(hook)) == 0) - && (NG_NODE_IS_VALID(NG_HOOK_NODE(hook)))) { + && (NG_NODE_IS_VALID(NG_HOOK_NODE(hook))) + && !priv->persistent) { ng_rmnode_self(NG_HOOK_NODE(hook)); } return (0); --- /u/marko/p4/head/src/sys/netgraph/ng_eiface.c 2007-08-31 03:47:58.000000000 +0200 +++ src/sys/netgraph/ng_eiface.c 2007-12-01 01:36:47.000000000 +0100 @@ -28,6 +28,8 @@ * $FreeBSD: src/sys/netgraph/ng_eiface.c,v 1.39 2007/07/26 10:54:33 glebius Exp $ */ +#include "opt_vimage.h" + #include #include #include @@ -38,11 +40,14 @@ #include #include #include +#include #include #include #include +#include 
+#include #include #include #include @@ -111,7 +116,15 @@ }; NETGRAPH_INIT(eiface, &typestruct); +static vnet_attach_fn ng_eiface_iattach; +static vnet_detach_fn ng_eiface_idetach; + +#ifndef VIMAGE static struct unrhdr *ng_eiface_unit; +#endif + +VNET_MOD_DECLARE_STATELESS(NG_EIFACE, ng_eiface, ng_eiface_iattach, + ng_eiface_idetach, NETGRAPH) /************************************************************************ INTERFACE STUFF @@ -244,6 +257,14 @@ * Send packet; if hook is not connected, mbuf will get * freed. */ +#ifdef VIMAGE + if (ifp->if_vnet != node->nd_vnet) { + m->m_flags |= M_REMOTE_VNET; + CURVNET_SET_QUIET(node->nd_vnet); + NG_SEND_DATA_ONLY(error, priv->ether, m); + CURVNET_RESTORE(); + } else +#endif NG_SEND_DATA_ONLY(error, priv->ether, m); /* Update stats */ @@ -332,6 +353,7 @@ static int ng_eiface_constructor(node_p node) { + INIT_VNET_NETGRAPH(curvnet); struct ifnet *ifp; priv_p priv; u_char eaddr[6] = {0,0,0,0,0,0}; @@ -351,7 +373,7 @@ ifp->if_softc = priv; /* Get an interface unit number */ - priv->unit = alloc_unr(ng_eiface_unit); + priv->unit = alloc_unr(V_ng_eiface_unit); /* Link together node and private info */ NG_NODE_SET_PRIVATE(node, priv); @@ -367,12 +389,10 @@ ifp->if_snd.ifq_maxlen = IFQ_MAXLEN; ifp->if_flags = (IFF_SIMPLEX | IFF_BROADCAST | IFF_MULTICAST); -#if 0 - /* Give this node name */ - bzero(ifname, sizeof(ifname)); - sprintf(ifname, "if%s", ifp->if_xname); - (void)ng_name_node(node, ifname); -#endif + /* Give this node the same name as the interface (if possible) */ + if (ng_name_node(node, ifp->if_xname) != 0) + log(LOG_WARNING, "%s: can't acquire netgraph name\n", + ifp->if_xname); /* Attach the interface */ ether_ifattach(ifp, eaddr); @@ -445,8 +465,6 @@ caddr_t ptr; int buflen; -#define SA_SIZE(s) ((s)->sa_lensa_len) - /* Determine size of response and allocate it */ buflen = 0; TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) @@ -532,6 +550,12 @@ /* Update interface stats */ ifp->if_ipackets++; +#ifdef VIMAGE + /* 
Mark up the mbuf if crossing vnet boundary */ + if (ifp->if_vnet != NG_HOOK_NODE(hook)->nd_vnet) + m->m_flags |= M_REMOTE_VNET; +#endif + (*ifp->if_input)(ifp, m); /* Done */ @@ -544,12 +568,15 @@ static int ng_eiface_rmnode(node_p node) { + INIT_VNET_NETGRAPH(curvnet); const priv_p priv = NG_NODE_PRIVATE(node); struct ifnet *const ifp = priv->ifp; + CURVNET_SET_QUIET(ifp->if_vnet); ether_ifdetach(ifp); if_free(ifp); - free_unr(ng_eiface_unit, priv->unit); + CURVNET_RESTORE(); + free_unr(V_ng_eiface_unit, priv->unit); FREE(priv, M_NETGRAPH); NG_NODE_SET_PRIVATE(node, NULL); NG_NODE_UNREF(node); @@ -578,10 +605,18 @@ switch (event) { case MOD_LOAD: - ng_eiface_unit = new_unrhdr(0, 0xffff, NULL); +#ifdef VIMAGE + vnet_mod_register(&vnet_ng_eiface_modinfo); +#else + ng_eiface_iattach(NULL); +#endif break; case MOD_UNLOAD: - delete_unrhdr(ng_eiface_unit); +#ifdef VIMAGE + vnet_mod_deregister(&vnet_ng_eiface_modinfo); +#else + ng_eiface_idetach(NULL); +#endif break; default: error = EOPNOTSUPP; @@ -589,3 +624,32 @@ } return (error); } + +static int ng_eiface_iattach(const void *unused) +{ + INIT_VNET_NETGRAPH(curvnet); + + V_ng_eiface_unit = new_unrhdr(0, 0xffff, NULL); + + return 0; +} + +static int ng_eiface_idetach(const void *unused) +{ + INIT_VNET_NETGRAPH(curvnet); +#ifdef VIMAGE + node_p node; + + do { + LIST_FOREACH(node, &V_ng_nodelist, nd_nodes) + if (node->nd_type == &typestruct) + break; + if (node != NULL) + ng_rmnode_self(node); + } while (node != NULL); +#endif + + delete_unrhdr(V_ng_eiface_unit); + + return 0; +} --- /u/marko/p4/head/src/sys/netgraph/ng_ether.c 2007-08-31 03:47:58.000000000 +0200 +++ src/sys/netgraph/ng_ether.c 2007-10-22 18:06:39.000000000 +0200 @@ -46,6 +46,8 @@ * ng_ether(4) netgraph node type */ +#include "opt_vimage.h" + #include #include #include @@ -54,7 +56,9 @@ #include #include #include +#include +#include #include #include #include @@ -70,6 +74,12 @@ #define IFP2NG(ifp) (IFP2AC((ifp))->ac_netgraph) +static vnet_attach_fn 
ng_ether_iattach; +static vnet_detach_fn ng_ether_idetach; + +VNET_MOD_DECLARE_STATELESS(NG_ETHER, ng_ether, ng_ether_iattach, + ng_ether_idetach, NETGRAPH) + /* Per-node private data */ struct private { struct ifnet *ifp; /* associated interface */ @@ -282,6 +292,17 @@ priv_p priv; node_p node; + /* + * Do not create / attach an ether node to this ifnet if + * a netgraph node with the same name already exists. + * This should prevent ether nodes to be attached to + * eiface nodes in the same vnet, which is pointless. + */ + if ((node = ng_name2noderef(NULL, ifp->if_xname)) != NULL) { + NG_NODE_UNREF(node); + return; + } + /* Create node */ KASSERT(!IFP2NG(ifp), ("%s: node already exists?", __func__)); if (ng_make_node_common(&ng_ether_typestruct, &node) != 0) { @@ -730,53 +751,25 @@ static int ng_ether_mod_event(module_t mod, int event, void *data) { - struct ifnet *ifp; int error = 0; int s; s = splnet(); switch (event) { case MOD_LOAD: - - /* Register function hooks */ - if (ng_ether_attach_p != NULL) { - error = EEXIST; - break; - } - ng_ether_attach_p = ng_ether_attach; - ng_ether_detach_p = ng_ether_detach; - ng_ether_output_p = ng_ether_output; - ng_ether_input_p = ng_ether_input; - ng_ether_input_orphan_p = ng_ether_input_orphan; - ng_ether_link_state_p = ng_ether_link_state; - - /* Create nodes for any already-existing Ethernet interfaces */ - IFNET_RLOCK(); - TAILQ_FOREACH(ifp, &ifnet, if_link) { - if (ifp->if_type == IFT_ETHER - || ifp->if_type == IFT_L2VLAN) - ng_ether_attach(ifp); - } - IFNET_RUNLOCK(); +#ifdef VIMAGE + vnet_mod_register(&vnet_ng_ether_modinfo); +#else + error = ng_ether_iattach(NULL); +#endif break; case MOD_UNLOAD: - - /* - * Note that the base code won't try to unload us until - * all nodes have been removed, and that can't happen - * until all Ethernet interfaces are removed. In any - * case, we know there are no nodes left if the action - * is MOD_UNLOAD, so there's no need to detach any nodes. 
- */ - - /* Unregister function hooks */ - ng_ether_attach_p = NULL; - ng_ether_detach_p = NULL; - ng_ether_output_p = NULL; - ng_ether_input_p = NULL; - ng_ether_input_orphan_p = NULL; - ng_ether_link_state_p = NULL; +#ifdef VIMAGE + vnet_mod_deregister(&vnet_ng_ether_modinfo); +#else + ng_ether_idetach(NULL); +#endif break; default: @@ -787,3 +780,62 @@ return (error); } +static int ng_ether_iattach(const void *unused) +{ + INIT_VNET_NET(curvnet); + struct ifnet *ifp; + +#ifdef VIMAGE + if (IS_DEFAULT_VNET(curvnet)){ +#endif + /* Register function hooks */ + if (ng_ether_attach_p != NULL) + return(EEXIST); + ng_ether_attach_p = ng_ether_attach; + ng_ether_detach_p = ng_ether_detach; + ng_ether_output_p = ng_ether_output; + ng_ether_input_p = ng_ether_input; + ng_ether_input_orphan_p = ng_ether_input_orphan; + ng_ether_link_state_p = ng_ether_link_state; +#ifdef VIMAGE + } +#endif + + /* Create nodes for any already-existing Ethernet interfaces */ + IFNET_RLOCK(); + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + if (ifp->if_type == IFT_ETHER + || ifp->if_type == IFT_L2VLAN) + ng_ether_attach(ifp); + } + IFNET_RUNLOCK(); + + return 0; +} + +static int ng_ether_idetach(const void *unused) +{ + /* + * Note that the base code won't try to unload us until + * all nodes have been removed, and that can't happen + * until all Ethernet interfaces are removed. In any + * case, we know there are no nodes left if the action + * is MOD_UNLOAD, so there's no need to detach any nodes. 
+ */ + +#ifdef VIMAGE + if (!IS_DEFAULT_VNET(curvnet)) + return(0); +#endif + + /* Unregister function hooks */ + ng_ether_attach_p = NULL; + ng_ether_detach_p = NULL; + ng_ether_output_p = NULL; + ng_ether_input_p = NULL; + ng_ether_input_orphan_p = NULL; + ng_ether_link_state_p = NULL; + + return 0; +} + --- /u/marko/p4/head/src/sys/netgraph/ng_gif.c 2007-08-31 03:47:58.000000000 +0200 +++ src/sys/netgraph/ng_gif.c 2007-12-10 11:26:11.000000000 +0100 @@ -69,6 +69,8 @@ * ng_gif(4) netgraph node type */ +#include "opt_vimage.h" + #include #include #include @@ -77,7 +79,9 @@ #include #include #include +#include +#include #include #include #include @@ -560,10 +564,13 @@ /* Create nodes for any already-existing gif interfaces */ IFNET_RLOCK(); - TAILQ_FOREACH(ifp, &ifnet, if_link) { + VNET_ITERLOOP_BEGIN_QUIET(); + INIT_VNET_NET(curvnet); + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (ifp->if_type == IFT_GIF) ng_gif_attach(ifp); } + VNET_ITERLOOP_END(); IFNET_RUNLOCK(); break; --- /u/marko/p4/head/src/sys/netgraph/ng_hub.c 2007-08-31 03:47:58.000000000 +0200 +++ src/sys/netgraph/ng_hub.c 2007-10-05 12:27:00.000000000 +0200 @@ -37,6 +37,7 @@ #include static ng_constructor_t ng_hub_constructor; +static ng_newhook_t ng_hub_newhook; static ng_rcvdata_t ng_hub_rcvdata; static ng_disconnect_t ng_hub_disconnect; @@ -44,6 +45,7 @@ .version = NG_ABI_VERSION, .name = NG_HUB_NODE_TYPE, .constructor = ng_hub_constructor, + .newhook = ng_hub_newhook, .rcvdata = ng_hub_rcvdata, .disconnect = ng_hub_disconnect, }; @@ -57,6 +59,14 @@ return (0); } +static int +ng_hub_newhook(node_p node, hook_p hook, const char *name) +{ + if (strcmp(name, "anchor") == 0) + node->nd_private = (void *) 1; + return 0; +} + static int ng_hub_rcvdata(hook_p hook, item_p item) { @@ -94,7 +104,7 @@ { if (NG_NODE_NUMHOOKS(NG_HOOK_NODE(hook)) == 0 && - NG_NODE_IS_VALID(NG_HOOK_NODE(hook))) + NG_NODE_IS_VALID(NG_HOOK_NODE(hook)) && !hook->hk_node->nd_private) ng_rmnode_self(NG_HOOK_NODE(hook)); return (0); } 
--- /u/marko/p4/head/src/sys/netgraph/ng_iface.c 2008-01-31 10:37:20.000000000 +0100 +++ src/sys/netgraph/ng_iface.c 2008-02-27 11:48:53.000000000 +0100 @@ -56,6 +56,7 @@ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipx.h" +#include "opt_vimage.h" #include #include @@ -69,6 +70,7 @@ #include #include #include +#include #include #include @@ -77,6 +79,7 @@ #include +#include #include #include #include @@ -121,6 +124,10 @@ static int ng_iface_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data); static int ng_iface_output(struct ifnet *ifp, struct mbuf *m0, struct sockaddr *dst, struct rtentry *rt0); +#ifdef VIMAGE +static void ng_iface_reassign(struct ifnet *ifp, struct vnet *vnet, + char *dname); +#endif static void ng_iface_bpftap(struct ifnet *ifp, struct mbuf *m, sa_family_t family); static int ng_iface_send(struct ifnet *ifp, struct mbuf *m, @@ -207,7 +214,15 @@ }; NETGRAPH_INIT(iface, &typestruct); +static vnet_attach_fn ng_iface_iattach; +static vnet_detach_fn ng_iface_idetach; + +#ifndef VIMAGE static struct unrhdr *ng_iface_unit; +#endif + +VNET_MOD_DECLARE_STATELESS(NG_IFACE, ng_iface, ng_iface_iattach, + ng_iface_idetach, NETGRAPH) /************************************************************************ HELPER STUFF @@ -449,6 +464,14 @@ /* Send packet. If hook is not connected, mbuf will get freed. */ +#ifdef VIMAGE + if (ifp->if_vnet != priv->node->nd_vnet) { + m->m_flags |= M_REMOTE_VNET; + CURVNET_SET_QUIET(priv->node->nd_vnet); + NG_SEND_DATA_ONLY(error, *get_hook_from_iffam(priv, iffam), m); + CURVNET_RESTORE(); + } else +#endif NG_SEND_DATA_ONLY(error, *get_hook_from_iffam(priv, iffam), m); /* Update stats. 
*/ @@ -505,6 +528,7 @@ static int ng_iface_constructor(node_p node) { + INIT_VNET_NETGRAPH(curvnet); struct ifnet *ifp; priv_p priv; @@ -523,7 +547,7 @@ priv->ifp = ifp; /* Get an interface unit number */ - priv->unit = alloc_unr(ng_iface_unit); + priv->unit = alloc_unr(V_ng_iface_unit); /* Link together node and private info */ NG_NODE_SET_PRIVATE(node, priv); @@ -534,6 +558,9 @@ ifp->if_output = ng_iface_output; ifp->if_start = ng_iface_start; ifp->if_ioctl = ng_iface_ioctl; +#ifdef VIMAGE + ifp->if_reassign = ng_iface_reassign; +#endif ifp->if_watchdog = NULL; ifp->if_mtu = NG_IFACE_MTU_DEFAULT; ifp->if_flags = (IFF_SIMPLEX|IFF_POINTOPOINT|IFF_NOARP|IFF_MULTICAST); @@ -558,6 +585,24 @@ return (0); } +#ifdef VIMAGE +static void +ng_iface_reassign(struct ifnet *ifp, struct vnet *vnet, char *dname) +{ + bpfdetach(ifp); + if_detach(ifp); + ifp->if_bpf = NULL; + if_reassign_common(ifp, vnet, "ser"); + if (dname) + snprintf(ifp->if_xname, IFNAMSIZ, "%s", dname); + + CURVNET_SET_QUIET(vnet); + if_attach(ifp); + bpfattach(ifp, DLT_NULL, sizeof(u_int32_t)); + CURVNET_RESTORE(); +} +#endif + /* * Give our ok for a hook to be added */ @@ -720,6 +765,12 @@ ifp->if_ipackets++; ifp->if_ibytes += m->m_pkthdr.len; +#ifdef VIMAGE + /* Mark up the mbuf if crossing vnet boundary */ + if (ifp->if_vnet != NG_HOOK_NODE(hook)->nd_vnet) + m->m_flags |= M_REMOTE_VNET; +#endif + /* Note receiving interface */ m->m_pkthdr.rcvif = ifp; @@ -765,13 +816,16 @@ static int ng_iface_shutdown(node_p node) { + INIT_VNET_NETGRAPH(curvnet); const priv_p priv = NG_NODE_PRIVATE(node); + CURVNET_SET_QUIET(priv->ifp->if_vnet); bpfdetach(priv->ifp); if_detach(priv->ifp); if_free(priv->ifp); + CURVNET_RESTORE(); priv->ifp = NULL; - free_unr(ng_iface_unit, priv->unit); + free_unr(V_ng_iface_unit, priv->unit); FREE(priv, M_NETGRAPH_IFACE); NG_NODE_SET_PRIVATE(node, NULL); NG_NODE_UNREF(node); @@ -804,10 +858,18 @@ switch (event) { case MOD_LOAD: - ng_iface_unit = new_unrhdr(0, 0xffff, NULL); +#ifdef VIMAGE 
+ vnet_mod_register(&vnet_ng_iface_modinfo); +#else + ng_iface_iattach(NULL); +#endif break; case MOD_UNLOAD: - delete_unrhdr(ng_iface_unit); +#ifdef VIMAGE + vnet_mod_deregister(&vnet_ng_iface_modinfo); +#else + ng_iface_idetach(NULL); +#endif break; default: error = EOPNOTSUPP; @@ -815,3 +877,32 @@ } return (error); } + +static int ng_iface_iattach(const void *unused) +{ + INIT_VNET_NETGRAPH(curvnet); + + V_ng_iface_unit = new_unrhdr(0, 0xffff, NULL); + + return 0; +} + +static int ng_iface_idetach(const void *unused) +{ + INIT_VNET_NETGRAPH(curvnet); +#ifdef VIMAGE + node_p node; + + do { + LIST_FOREACH(node, &V_ng_nodelist, nd_nodes) + if (node->nd_type == &typestruct) + break; + if (node != NULL) + ng_rmnode_self(node); + } while (node != NULL); +#endif + + delete_unrhdr(V_ng_iface_unit); + + return 0; +} --- /dev/null 2008-02-27 21:11:00.000000000 +0100 +++ src/sys/netgraph/ng_pipe.c 2007-10-30 22:13:05.000000000 +0100 @@ -0,0 +1,1051 @@ +/* + * Copyright (c) 2004, 2005, 2007 University of Zagreb + * Copyright (c) 2007 FreeBSD Foundation + * + * This software was developed by the University of Zagreb and the + * FreeBSD Foundation under sponsorship by the Stichting NLnet and the + * FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * This node permits simple traffic shaping by emulating bandwidth + * and delay, as well as random packet losses. + * The node has two hooks, upper and lower. Traffic flowing from upper to + * lower hook is referenced as downstream, and vice versa. Parameters for + * both directions can be set separately, except for delay. + */ + +#include "opt_vimage.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include +#include +#include + +static MALLOC_DEFINE(M_NG_PIPE, "ng_pipe", "ng_pipe"); + +struct mtx ng_pipe_giant; + +/* Packet header struct */ +struct ngp_hdr { + TAILQ_ENTRY(ngp_hdr) ngp_link; /* next pkt in queue */ + struct timeval when; /* this packet's due time */ + struct mbuf *m; /* ptr to the packet data */ +}; +TAILQ_HEAD(p_head, ngp_hdr); + +/* FIFO queue struct */ +struct ngp_fifo { + TAILQ_ENTRY(ngp_fifo) fifo_le; /* list of active queues only */ + struct p_head packet_head; /* FIFO queue head */ + u_int32_t hash; /* flow signature */ + struct timeval vtime; /* virtual time, for WFQ */ + u_int32_t rr_deficit; /* for DRR */ + u_int32_t packets; /* # of packets in this queue */ +}; + +/* Per hook info */ +struct hookinfo { + hook_p hook; + int noqueue; /* bypass any processing */ + TAILQ_HEAD(, ngp_fifo) fifo_head; /* FIFO queues */ + TAILQ_HEAD(, ngp_hdr) qout_head; /* delay queue head */ + LIST_ENTRY(hookinfo) active_le; /* 
active hooks */ + struct timeval qin_utime; + struct ng_pipe_hookcfg cfg; + struct ng_pipe_hookrun run; + struct ng_pipe_hookstat stats; + uint64_t *ber_p; /* loss_p(BER,psize) map */ +}; + +/* Per node info */ +struct node_priv { + u_int64_t delay; + u_int32_t overhead; + u_int32_t header_offset; + struct hookinfo lower; + struct hookinfo upper; +}; +typedef struct node_priv *priv_p; + +/* Macro for calculating the virtual time for packet dequeueing in WFQ */ +#define FIFO_VTIME_SORT(plen) \ + if (hinfo->cfg.wfq && hinfo->cfg.bandwidth) { \ + ngp_f->vtime.tv_usec = now->tv_usec + ((uint64_t) (plen) \ + + priv->overhead ) * hinfo->run.fifo_queues * \ + 8000000 / hinfo->cfg.bandwidth; \ + ngp_f->vtime.tv_sec = now->tv_sec + \ + ngp_f->vtime.tv_usec / 1000000; \ + ngp_f->vtime.tv_usec = ngp_f->vtime.tv_usec % 1000000; \ + TAILQ_FOREACH(ngp_f1, &hinfo->fifo_head, fifo_le) \ + if (ngp_f1->vtime.tv_sec > ngp_f->vtime.tv_sec || \ + (ngp_f1->vtime.tv_sec == ngp_f->vtime.tv_sec && \ + ngp_f1->vtime.tv_usec > ngp_f->vtime.tv_usec)) \ + break; \ + if (ngp_f1 == NULL) \ + TAILQ_INSERT_TAIL(&hinfo->fifo_head, ngp_f, fifo_le); \ + else \ + TAILQ_INSERT_BEFORE(ngp_f1, ngp_f, fifo_le); \ + } else \ + TAILQ_INSERT_TAIL(&hinfo->fifo_head, ngp_f, fifo_le); \ + + +static void parse_cfg(struct ng_pipe_hookcfg *, struct ng_pipe_hookcfg *, + struct hookinfo *, priv_p); +static void pipe_dequeue(struct hookinfo *, struct timeval *); +static void pipe_scheduler(void *); +static void pipe_poll(void); +static int ngp_modevent(module_t, int, void *); + +/* linked list of active "pipe" hooks */ +static LIST_HEAD(, hookinfo) active_head; +static int active_gen_id = 0; + +/* timeout handle for pipe_scheduler */ +static struct callout polling_timer; + +/* zone for storing ngp_hdr-s */ +static uma_zone_t ngp_zone; + +/* Netgraph methods */ +static ng_constructor_t ngp_constructor; +static ng_rcvmsg_t ngp_rcvmsg; +static ng_shutdown_t ngp_shutdown; +static ng_newhook_t ngp_newhook; +static 
ng_rcvdata_t ngp_rcvdata; +static ng_disconnect_t ngp_disconnect; + +/* Parse type for struct ng_pipe_hookstat */ +static const struct ng_parse_struct_field + ng_pipe_hookstat_type_fields[] = NG_PIPE_HOOKSTAT_INFO; +static const struct ng_parse_type ng_pipe_hookstat_type = { + &ng_parse_struct_type, + &ng_pipe_hookstat_type_fields +}; + +/* Parse type for struct ng_pipe_stats */ +static const struct ng_parse_struct_field ng_pipe_stats_type_fields[] = + NG_PIPE_STATS_INFO(&ng_pipe_hookstat_type); +static const struct ng_parse_type ng_pipe_stats_type = { + &ng_parse_struct_type, + &ng_pipe_stats_type_fields +}; + +/* Parse type for struct ng_pipe_hookrun */ +static const struct ng_parse_struct_field + ng_pipe_hookrun_type_fields[] = NG_PIPE_HOOKRUN_INFO; +static const struct ng_parse_type ng_pipe_hookrun_type = { + &ng_parse_struct_type, + &ng_pipe_hookrun_type_fields +}; + +/* Parse type for struct ng_pipe_run */ +static const struct ng_parse_struct_field + ng_pipe_run_type_fields[] = NG_PIPE_RUN_INFO(&ng_pipe_hookrun_type); +static const struct ng_parse_type ng_pipe_run_type = { + &ng_parse_struct_type, + &ng_pipe_run_type_fields +}; + +/* Parse type for struct ng_pipe_hookcfg */ +static const struct ng_parse_struct_field + ng_pipe_hookcfg_type_fields[] = NG_PIPE_HOOKCFG_INFO; +static const struct ng_parse_type ng_pipe_hookcfg_type = { + &ng_parse_struct_type, + &ng_pipe_hookcfg_type_fields +}; + +/* Parse type for struct ng_pipe_cfg */ +static const struct ng_parse_struct_field + ng_pipe_cfg_type_fields[] = NG_PIPE_CFG_INFO(&ng_pipe_hookcfg_type); +static const struct ng_parse_type ng_pipe_cfg_type = { + &ng_parse_struct_type, + &ng_pipe_cfg_type_fields +}; + +/* List of commands and how to convert arguments to/from ASCII */ +static const struct ng_cmdlist ngp_cmds[] = { + { + .cookie = NGM_PIPE_COOKIE, + .cmd = NGM_PIPE_GET_STATS, + .name = "getstats", + .respType = &ng_pipe_stats_type + }, + { + .cookie = NGM_PIPE_COOKIE, + .cmd = NGM_PIPE_CLR_STATS, + .name = 
"clrstats" + }, + { + .cookie = NGM_PIPE_COOKIE, + .cmd = NGM_PIPE_GETCLR_STATS, + .name = "getclrstats", + .respType = &ng_pipe_stats_type + }, + { + .cookie = NGM_PIPE_COOKIE, + .cmd = NGM_PIPE_GET_RUN, + .name = "getrun", + .respType = &ng_pipe_run_type + }, + { + .cookie = NGM_PIPE_COOKIE, + .cmd = NGM_PIPE_GET_CFG, + .name = "getcfg", + .respType = &ng_pipe_cfg_type + }, + { + .cookie = NGM_PIPE_COOKIE, + .cmd = NGM_PIPE_SET_CFG, + .name = "setcfg", + .mesgType = &ng_pipe_cfg_type, + }, + { 0 } +}; + +/* Netgraph type descriptor */ +static struct ng_type ng_pipe_typestruct = { + .version = NG_ABI_VERSION, + .name = NG_PIPE_NODE_TYPE, + .mod_event = ngp_modevent, + .constructor = ngp_constructor, + .shutdown = ngp_shutdown, + .rcvmsg = ngp_rcvmsg, + .newhook = ngp_newhook, + .rcvdata = ngp_rcvdata, + .disconnect = ngp_disconnect, + .cmdlist = ngp_cmds +}; +NETGRAPH_INIT(pipe, &ng_pipe_typestruct); + +/* Node constructor */ +static int +ngp_constructor(node_p node) +{ + priv_p priv; + + MALLOC(priv, priv_p, sizeof(*priv), M_NG_PIPE, M_ZERO | M_NOWAIT); + if (priv == NULL) + return (ENOMEM); + NG_NODE_SET_PRIVATE(node, priv); + + return (0); +} + +/* Add a hook */ +static int +ngp_newhook(node_p node, hook_p hook, const char *name) +{ + const priv_p priv = NG_NODE_PRIVATE(node); + struct hookinfo *hinfo; + + if (strcmp(name, NG_PIPE_HOOK_UPPER) == 0) { + bzero(&priv->upper, sizeof(priv->upper)); + priv->upper.hook = hook; + NG_HOOK_SET_PRIVATE(hook, &priv->upper); + } else if (strcmp(name, NG_PIPE_HOOK_LOWER) == 0) { + bzero(&priv->lower, sizeof(priv->lower)); + priv->lower.hook = hook; + NG_HOOK_SET_PRIVATE(hook, &priv->lower); + } else + return (EINVAL); + + /* Load non-zero initial cfg values */ + hinfo = NG_HOOK_PRIVATE(hook); + hinfo->cfg.qin_size_limit = 50; + hinfo->cfg.fifo = 1; + hinfo->cfg.droptail = 1; + TAILQ_INIT(&hinfo->fifo_head); + TAILQ_INIT(&hinfo->qout_head); + return (0); +} + +/* Receive a control message */ +static int +ngp_rcvmsg(node_p 
node, item_p item, hook_p lasthook) +{ + const priv_p priv = NG_NODE_PRIVATE(node); + struct ng_mesg *resp = NULL; + struct ng_mesg *msg; + struct ng_pipe_stats *stats; + struct ng_pipe_run *run; + struct ng_pipe_cfg *cfg; + int error = 0; + + mtx_lock(&ng_pipe_giant); + + NGI_GET_MSG(item, msg); + switch (msg->header.typecookie) { + case NGM_PIPE_COOKIE: + switch (msg->header.cmd) { + case NGM_PIPE_GET_STATS: + case NGM_PIPE_CLR_STATS: + case NGM_PIPE_GETCLR_STATS: + if (msg->header.cmd != NGM_PIPE_CLR_STATS) { + NG_MKRESPONSE(resp, msg, + sizeof(*stats), M_NOWAIT); + if (resp == NULL) { + error = ENOMEM; + break; + } + stats = (struct ng_pipe_stats *)resp->data; + bcopy(&priv->upper.stats, &stats->downstream, + sizeof(stats->downstream)); + bcopy(&priv->lower.stats, &stats->upstream, + sizeof(stats->upstream)); + } + if (msg->header.cmd != NGM_PIPE_GET_STATS) { + bzero(&priv->upper.stats, + sizeof(priv->upper.stats)); + bzero(&priv->lower.stats, + sizeof(priv->lower.stats)); + } + break; + case NGM_PIPE_GET_RUN: + NG_MKRESPONSE(resp, msg, sizeof(*run), M_NOWAIT); + if (resp == NULL) { + error = ENOMEM; + break; + } + run = (struct ng_pipe_run *)resp->data; + bcopy(&priv->upper.run, &run->downstream, + sizeof(run->downstream)); + bcopy(&priv->lower.run, &run->upstream, + sizeof(run->upstream)); + break; + case NGM_PIPE_GET_CFG: + NG_MKRESPONSE(resp, msg, sizeof(*cfg), M_NOWAIT); + if (resp == NULL) { + error = ENOMEM; + break; + } + cfg = (struct ng_pipe_cfg *)resp->data; + bcopy(&priv->upper.cfg, &cfg->downstream, + sizeof(cfg->downstream)); + bcopy(&priv->lower.cfg, &cfg->upstream, + sizeof(cfg->upstream)); + cfg->delay = priv->delay; + cfg->overhead = priv->overhead; + cfg->header_offset = priv->header_offset; + if (cfg->upstream.bandwidth == + cfg->downstream.bandwidth) { + cfg->bandwidth = cfg->upstream.bandwidth; + cfg->upstream.bandwidth = 0; + cfg->downstream.bandwidth = 0; + } else + cfg->bandwidth = 0; + break; + case NGM_PIPE_SET_CFG: + cfg = (struct 
ng_pipe_cfg *)msg->data;
+			if (msg->header.arglen != sizeof(*cfg)) {
+				error = EINVAL;
+				break;
+			}
+
+			if (cfg->delay == -1)
+				priv->delay = 0;
+			else if (cfg->delay > 0 && cfg->delay < 10000000)
+				priv->delay = cfg->delay;
+
+			if (cfg->bandwidth == -1) {
+				priv->upper.cfg.bandwidth = 0;
+				priv->lower.cfg.bandwidth = 0;
+				priv->overhead = 0;
+			} else if (cfg->bandwidth >= 100 &&
+			    cfg->bandwidth <= 1000000000) {
+				priv->upper.cfg.bandwidth = cfg->bandwidth;
+				priv->lower.cfg.bandwidth = cfg->bandwidth;
+				if (cfg->bandwidth >= 10000000)
+					priv->overhead = 8+4+12; /* Ethernet */
+				else
+					priv->overhead = 10; /* HDLC */
+			}
+
+			if (cfg->overhead == -1)
+				priv->overhead = 0;
+			else if (cfg->overhead > 0 && cfg->overhead < 256)
+				priv->overhead = cfg->overhead;
+
+			if (cfg->header_offset == -1)
+				priv->header_offset = 0;
+			else if (cfg->header_offset > 0 &&
+			    cfg->header_offset < 64)
+				priv->header_offset = cfg->header_offset;
+
+			parse_cfg(&priv->upper.cfg, &cfg->downstream,
+			    &priv->upper, priv);
+			parse_cfg(&priv->lower.cfg, &cfg->upstream,
+			    &priv->lower, priv);
+			break;
+		default:
+			error = EINVAL;
+			break;
+		}
+		break;
+	default:
+		error = EINVAL;
+		break;
+	}
+	NG_RESPOND_MSG(error, node, item, resp);
+	NG_FREE_MSG(msg);
+
+	mtx_unlock(&ng_pipe_giant);
+
+	return (error);
+}
+
+/*
+ * Merge a requested per-hook configuration into the active one.
+ *
+ *   current - the hook's active configuration, updated in place
+ *   new     - the requested values taken from the control message
+ *   hinfo   - per-hook state; owns the BER lookup table (ber_p) and the
+ *             noqueue shortcut flag
+ *   priv    - per-node state; only priv->delay is read here
+ *
+ * For the numeric fields a value of -1 resets the setting to 0, a value
+ * inside the accepted range replaces it, and anything else (including 0)
+ * leaves the current setting untouched.  The fifo / wfq / drr and the
+ * droptail / drophead selectors are mutually exclusive: a non-zero
+ * request enables that mode and clears its alternatives.  Resetting the
+ * bandwidth also forces plain FIFO queueing.
+ *
+ * On return hinfo->noqueue is set when neither bandwidth, delay,
+ * duplication nor BER is configured.
+ */
+static void
+parse_cfg(struct ng_pipe_hookcfg *current, struct ng_pipe_hookcfg *new,
+	struct hookinfo *hinfo, priv_p priv)
+{
+
+	if (new->ber == -1) {
+		current->ber = 0;
+		if (hinfo->ber_p) {
+			FREE(hinfo->ber_p, M_NG_PIPE);
+			hinfo->ber_p = NULL;
+		}
+	} else if (new->ber >= 1 && new->ber <= 1000000000000) {
+		static const uint64_t one = 0x1000000000000; /* = 2^48 */
+		uint64_t p0, p;
+		uint32_t fsize, i;
+
+		if (hinfo->ber_p == NULL)
+			MALLOC(hinfo->ber_p, uint64_t *,
+			    (MAX_FSIZE + MAX_OHSIZE)*sizeof(uint64_t),
+			    M_NG_PIPE, M_NOWAIT);
+
+		if (hinfo->ber_p == NULL) {
+			/*
+			 * The M_NOWAIT allocation failed.  BER emulation
+			 * cannot work without the table, so keep it switched
+			 * off instead of writing through a NULL pointer.
+			 */
+			current->ber = 0;
+		} else {
+			current->ber = new->ber;
+
+			/*
+			 * For given BER and each frame size N (in bytes)
+			 * calculate the probability P_OK that the frame is
+			 * clean:
+			 *
+			 * P_OK(BER,N) = (1 - 1/BER)^(N*8)
+			 *
+			 * We use a 64-bit fixed-point format with decimal
+			 * point positioned between bits 47 and 48.
+			 */
+			p0 = one - one / new->ber;
+			p = one;
+			for (fsize = 0; fsize < MAX_FSIZE + MAX_OHSIZE;
+			    fsize++) {
+				hinfo->ber_p[fsize] = p;
+				/*
+				 * One multiplication by p0 per bit; p0 is
+				 * split into 16-bit limbs so that the
+				 * partial products fit into 64 bits.
+				 */
+				for (i=0; i<8; i++)
+					p = (p*(p0&0xffff)>>48) +
+					    (p*((p0>>16)&0xffff)>>32) +
+					    (p*(p0>>32)>>16);
+			}
+		}
+	}
+
+	if (new->qin_size_limit == -1)
+		current->qin_size_limit = 0;
+	else if (new->qin_size_limit >= 5)
+		current->qin_size_limit = new->qin_size_limit;
+
+	if (new->qout_size_limit == -1)
+		current->qout_size_limit = 0;
+	else if (new->qout_size_limit >= 5)
+		current->qout_size_limit = new->qout_size_limit;
+
+	if (new->duplicate == -1)
+		current->duplicate = 0;
+	else if (new->duplicate > 0 && new->duplicate <= 50)
+		current->duplicate = new->duplicate;
+
+	if (new->fifo) {
+		current->fifo = 1;
+		current->wfq = 0;
+		current->drr = 0;
+	}
+
+	if (new->wfq) {
+		current->fifo = 0;
+		current->wfq = 1;
+		current->drr = 0;
+	}
+
+	if (new->drr) {
+		current->fifo = 0;
+		current->wfq = 0;
+		/* DRR quantum */
+		if (new->drr >= 32)
+			current->drr = new->drr;
+		else
+			current->drr = 2048;	/* default quantum */
+	}
+
+	if (new->droptail) {
+		current->droptail = 1;
+		current->drophead = 0;
+	}
+
+	if (new->drophead) {
+		current->droptail = 0;
+		current->drophead = 1;
+	}
+
+	if (new->bandwidth == -1) {
+		current->bandwidth = 0;
+		current->fifo = 1;
+		current->wfq = 0;
+		current->drr = 0;
+	} else if (new->bandwidth >= 100 && new->bandwidth <= 1000000000)
+		current->bandwidth = new->bandwidth;
+
+	/* With nothing to emulate, frames can bypass the queues entirely */
+	if (current->bandwidth | priv->delay |
+	    current->duplicate | current->ber)
+		hinfo->noqueue = 0;
+	else
+		hinfo->noqueue = 1;
+}
+
+/*
+ * Compute a hash signature for a packet. This function suffers from the
+ * NIH syndrome, so probably it would be wise to look around what other
+ * folks have found out to be a good and efficient IP hash function...
+ */
+static int ip_hash(struct mbuf *m, int offset)
+{
+	u_int64_t i;
+	struct ip *ip = (struct ip *)(mtod(m, u_char *) + offset);
+
+	/* Only option-less IPv4 headers held in the first mbuf are hashed */
+	if (m->m_len < sizeof(struct ip) + offset ||
+	    ip->ip_v != 4 || ip->ip_hl << 2 != sizeof(struct ip))
+		return 0;
+
+	i = ((u_int64_t) ip->ip_src.s_addr ^
+	    ((u_int64_t) ip->ip_src.s_addr << 13) ^
+	    ((u_int64_t) ip->ip_dst.s_addr << 7) ^
+	    ((u_int64_t) ip->ip_dst.s_addr << 19));
+	return (i ^ (i >> 32));
+}
+
+/*
+ * Receive data on a hook - both in upstream and downstream direction.
+ * We put the frame on the inbound queue, and try to initiate dequeuing
+ * sequence immediately. If inbound queue is full, discard one frame
+ * depending on dropping policy (from the head or from the tail of the
+ * queue).
+ *
+ * When the hook is in noqueue mode the item is forwarded straight to the
+ * opposite hook and the result of NG_FWD_ITEM_HOOK() is returned.
+ * Otherwise the function takes ng_pipe_giant, queues the frame, hands the
+ * lock over to pipe_dequeue() (which releases it) and returns 0.
+ *
+ * NOTE(review): allocation failures are only caught by KASSERT(); confirm
+ * whether that is acceptable for kernels built without assertions.
+ */
+static int
+ngp_rcvdata(hook_p hook, item_p item)
+{
+	struct hookinfo *const hinfo = NG_HOOK_PRIVATE(hook);
+	const priv_p priv = NG_NODE_PRIVATE(NG_HOOK_NODE(hook));
+	struct timeval uuptime;
+	struct timeval *now = &uuptime;
+	struct ngp_fifo *ngp_f = NULL, *ngp_f1;
+	struct ngp_hdr *ngp_h = NULL;
+	struct mbuf *m;
+	int hash;
+	int error = 0;
+
+	if (hinfo->noqueue) {
+		struct hookinfo *dest;
+		if (hinfo == &priv->lower)
+			dest = &priv->upper;
+		else
+			dest = &priv->lower;
+		NG_FWD_ITEM_HOOK(error, item, dest->hook);
+		return error;
+	}
+
+	mtx_lock(&ng_pipe_giant);
+	microuptime(now);
+
+	/*
+	 * Attach us to the list of active ng_pipes if this was an empty
+	 * one before, and also update the queue service deadline time.
+	 */
+	if (hinfo->run.qin_frames == 0) {
+		struct timeval *when = &hinfo->qin_utime;
+		if (when->tv_sec < now->tv_sec || (when->tv_sec == now->tv_sec
+		    && when->tv_usec < now->tv_usec)) {
+			when->tv_sec = now->tv_sec;
+			when->tv_usec = now->tv_usec;
+		}
+		if (hinfo->run.qout_frames == 0)
+			LIST_INSERT_HEAD(&active_head, hinfo, active_le);
+	}
+
+	/* Populate the packet header */
+	ngp_h = uma_zalloc(ngp_zone, M_NOWAIT);
+	KASSERT((ngp_h != NULL), ("ngp_h zalloc failed (1)"));
+	NGI_GET_M(item, m);
+	KASSERT(m != NULL, ("NGI_GET_M failed"));
+	ngp_h->m = m;
+	NG_FREE_ITEM(item);
+
+	if (hinfo->cfg.fifo)
+		hash = 0;	/* all packets go into a single FIFO queue */
+	else
+		hash = ip_hash(m, priv->header_offset);
+
+	/* Find the appropriate FIFO queue for the packet and enqueue it */
+	TAILQ_FOREACH(ngp_f, &hinfo->fifo_head, fifo_le)
+		if (hash == ngp_f->hash)
+			break;
+	if (ngp_f == NULL) {
+		/* No queue for this flow yet: create one and link it in */
+		ngp_f = uma_zalloc(ngp_zone, M_NOWAIT);
+		KASSERT(ngp_f != NULL, ("ngp_f zalloc failed (2)"));
+		TAILQ_INIT(&ngp_f->packet_head);
+		ngp_f->hash = hash;
+		ngp_f->packets = 1;
+		ngp_f->rr_deficit = hinfo->cfg.drr;	/* DRR quantum */
+		hinfo->run.fifo_queues++;
+		TAILQ_INSERT_TAIL(&ngp_f->packet_head, ngp_h, ngp_link);
+		FIFO_VTIME_SORT(m->m_pkthdr.len);
+	} else {
+		TAILQ_INSERT_TAIL(&ngp_f->packet_head, ngp_h, ngp_link);
+		ngp_f->packets++;
+	}
+	hinfo->run.qin_frames++;
+	hinfo->run.qin_octets += m->m_pkthdr.len;
+
+	/* Discard a frame if inbound queue limit has been reached */
+	if (hinfo->run.qin_frames > hinfo->cfg.qin_size_limit) {
+		struct mbuf *m1;
+		int longest = 0;
+
+		/* Find the longest queue */
+		TAILQ_FOREACH(ngp_f1, &hinfo->fifo_head, fifo_le)
+			if (ngp_f1->packets > longest) {
+				longest = ngp_f1->packets;
+				ngp_f = ngp_f1;
+			}
+
+		/* Drop a frame from the queue head/tail, depending on cfg */
+		if (hinfo->cfg.drophead)
+			ngp_h = TAILQ_FIRST(&ngp_f->packet_head);
+		else
+			ngp_h = TAILQ_LAST(&ngp_f->packet_head, p_head);
+		TAILQ_REMOVE(&ngp_f->packet_head, ngp_h, ngp_link);
+		m1 = ngp_h->m;
+		uma_zfree(ngp_zone, ngp_h);
+		hinfo->run.qin_octets -= m1->m_pkthdr.len;
+		hinfo->stats.in_disc_octets += m1->m_pkthdr.len;
+		m_freem(m1);
+		/* Retire the per-flow queue once its last frame is gone */
+		if (--(ngp_f->packets) == 0) {
+			TAILQ_REMOVE(&hinfo->fifo_head, ngp_f, fifo_le);
+			uma_zfree(ngp_zone, ngp_f);
+			hinfo->run.fifo_queues--;
+		}
+		hinfo->run.qin_frames--;
+		hinfo->stats.in_disc_frames++;
+	}
+
+	/*
+	 * Try to start the dequeuing process immediately. We must
+	 * hold the ng_pipe_giant lock here and pipe_dequeue() will
+	 * release it
+	 */
+	pipe_dequeue(hinfo, now);
+
+	return (0);
+}
+
+
+/*
+ * Dequeueing sequence - we basically do the following:
+ *  1) Try to extract the frame from the inbound (bandwidth) queue;
+ *  2) In accordance to BER specified, discard the frame randomly;
+ *  3) If the frame survives BER, prepend it with delay info and move it
+ *     to outbound (delay) queue;
+ *  4) Loop to 2) until bandwidth quota for this timeslice is reached, or
+ *     inbound queue is flushed completely;
+ *  5) Extract the first frame from the outbound queue, if its time has
+ *     come.
Queue the frame for transmission on the outbound hook;
+ *  6) Loop to 5) until outbound queue is flushed completely, or the next
+ *     frame in the queue is not scheduled to be dequeued yet;
+ *  7) Transmit all frames queued in 5)
+ *
+ * Note: the caller must hold the ng_pipe_giant lock; this function
+ * returns with the lock released.
+ */
+static void
+pipe_dequeue(struct hookinfo *hinfo, struct timeval *now)
+{
+	static uint64_t rand, oldrand;
+	const priv_p priv = NG_NODE_PRIVATE(NG_HOOK_NODE(hinfo->hook));
+	struct hookinfo *dest;
+	struct ngp_fifo *ngp_f, *ngp_f1;
+	struct ngp_hdr *ngp_h;
+	struct timeval *when;
+	struct mbuf *q_head = NULL;
+	struct mbuf *q_tail = NULL;
+	struct mbuf *m;
+	int error = 0;
+
+	/* Which one is the destination hook? */
+	if (hinfo == &priv->lower)
+		dest = &priv->upper;
+	else
+		dest = &priv->lower;
+
+	/* Bandwidth queue processing */
+	while ((ngp_f = TAILQ_FIRST(&hinfo->fifo_head))) {
+		when = &hinfo->qin_utime;
+		if (when->tv_sec > now->tv_sec || (when->tv_sec == now->tv_sec
+		    && when->tv_usec > now->tv_usec))
+			break;
+
+		ngp_h = TAILQ_FIRST(&ngp_f->packet_head);
+		m = ngp_h->m;
+
+		/* Deficit Round Robin (DRR) processing */
+		if (hinfo->cfg.drr) {
+			if (ngp_f->rr_deficit >= m->m_pkthdr.len) {
+				ngp_f->rr_deficit -= m->m_pkthdr.len;
+			} else {
+				ngp_f->rr_deficit += hinfo->cfg.drr;
+				TAILQ_REMOVE(&hinfo->fifo_head, ngp_f, fifo_le);
+				TAILQ_INSERT_TAIL(&hinfo->fifo_head,
+				    ngp_f, fifo_le);
+				continue;
+			}
+		}
+
+		/*
+		 * Either create a duplicate and pass it on, or dequeue
+		 * the original packet...
+		 *
+		 * When duplicating, "m" is switched over to the copy: the
+		 * original stays on the inbound queue, so every later
+		 * m_freem(m) in this iteration must release the copy and
+		 * never the still-queued original.
+		 */
+		if (hinfo->cfg.duplicate &&
+		    random() % 100 <= hinfo->cfg.duplicate) {
+			ngp_h = uma_zalloc(ngp_zone, M_NOWAIT);
+			KASSERT(ngp_h != NULL, ("ngp_h zalloc failed (3)"));
+			m = m_dup(m, M_NOWAIT);
+			KASSERT(m != NULL, ("m_dup failed"));
+			ngp_h->m = m;
+		} else {
+			TAILQ_REMOVE(&ngp_f->packet_head, ngp_h, ngp_link);
+			hinfo->run.qin_frames--;
+			hinfo->run.qin_octets -= m->m_pkthdr.len;
+			ngp_f->packets--;
+		}
+
+		/* Calculate the serialization delay */
+		if (hinfo->cfg.bandwidth) {
+			hinfo->qin_utime.tv_usec +=
+			    ((uint64_t) m->m_pkthdr.len + priv->overhead) *
+			    8000000 / hinfo->cfg.bandwidth;
+			hinfo->qin_utime.tv_sec +=
+			    hinfo->qin_utime.tv_usec / 1000000;
+			hinfo->qin_utime.tv_usec =
+			    hinfo->qin_utime.tv_usec % 1000000;
+		}
+		when = &ngp_h->when;
+		when->tv_sec = hinfo->qin_utime.tv_sec;
+		when->tv_usec = hinfo->qin_utime.tv_usec;
+
+		/* Sort / rearrange inbound queues */
+		if (ngp_f->packets) {
+			if (hinfo->cfg.wfq) {
+				TAILQ_REMOVE(&hinfo->fifo_head, ngp_f, fifo_le);
+				FIFO_VTIME_SORT(TAILQ_FIRST(
+				    &ngp_f->packet_head)->m->m_pkthdr.len)
+			}
+		} else {
+			TAILQ_REMOVE(&hinfo->fifo_head, ngp_f, fifo_le);
+			uma_zfree(ngp_zone, ngp_f);
+			hinfo->run.fifo_queues--;
+		}
+
+		/* Randomly discard the frame, according to BER setting */
+		if (hinfo->cfg.ber && hinfo->ber_p != NULL) {
+			uint32_t ber_idx = priv->overhead + m->m_pkthdr.len;
+
+			/*
+			 * The table built by parse_cfg() holds
+			 * MAX_FSIZE + MAX_OHSIZE entries; use the last one
+			 * for anything larger instead of reading past it.
+			 */
+			if (ber_idx >= MAX_FSIZE + MAX_OHSIZE)
+				ber_idx = MAX_FSIZE + MAX_OHSIZE - 1;
+
+			/*
+			 * Keep the read of the previous sample and the
+			 * draw of the new one in separate statements so
+			 * that "rand" is not accessed unsequenced.
+			 */
+			oldrand = rand;
+			rand = random();
+			if ((oldrand ^ rand << 17) >= hinfo->ber_p[ber_idx]) {
+				hinfo->stats.out_disc_frames++;
+				hinfo->stats.out_disc_octets += m->m_pkthdr.len;
+				uma_zfree(ngp_zone, ngp_h);
+				m_freem(m);
+				continue;
+			}
+		}
+
+		/* Discard frame if outbound queue size limit exceeded */
+		if (hinfo->cfg.qout_size_limit &&
+		    hinfo->run.qout_frames>=hinfo->cfg.qout_size_limit) {
+			hinfo->stats.out_disc_frames++;
+			hinfo->stats.out_disc_octets += m->m_pkthdr.len;
+			uma_zfree(ngp_zone, ngp_h);
+			m_freem(m);
+			continue;
+		}
+
+		/* Calculate the propagation delay */
+		when->tv_usec += priv->delay;
+		when->tv_sec += when->tv_usec / 1000000;
+		when->tv_usec = when->tv_usec % 1000000;
+
+		/* Put the frame into the delay queue */
+		TAILQ_INSERT_TAIL(&hinfo->qout_head, ngp_h, ngp_link);
+		hinfo->run.qout_frames++;
+		hinfo->run.qout_octets += m->m_pkthdr.len;
+	}
+
+	/* Delay queue processing */
+	while ((ngp_h = TAILQ_FIRST(&hinfo->qout_head))) {
+		m = ngp_h->m;
+
+		when = &ngp_h->when;
+		if (when->tv_sec > now->tv_sec ||
+		    (when->tv_sec == now->tv_sec &&
+		    when->tv_usec > now->tv_usec))
+			break;
+
+		/* Update outbound queue stats */
+		hinfo->stats.fwd_frames++;
+		hinfo->stats.fwd_octets += m->m_pkthdr.len;
+		hinfo->run.qout_frames--;
+		hinfo->run.qout_octets -= m->m_pkthdr.len;
+
+		/* Dequeue the packet from qout */
+		TAILQ_REMOVE(&hinfo->qout_head, ngp_h, ngp_link);
+		uma_zfree(ngp_zone, ngp_h);
+
+		/* Enqueue locally for sending downstream */
+		if (q_head == NULL)
+			q_head = m;
+		if (q_tail)
+			q_tail->m_nextpkt = m;
+		q_tail = m;
+		m->m_nextpkt = NULL;
+	}
+
+	/* If both queues are empty detach us from the list of active queues */
+	if (hinfo->run.qin_frames + hinfo->run.qout_frames == 0) {
+		LIST_REMOVE(hinfo, active_le);
+		active_gen_id++;
+	}
+
+	mtx_unlock(&ng_pipe_giant);
+
+	/* Hand the collected frames to the peer only after dropping the lock */
+	while ((m = q_head) != NULL) {
+		q_head = m->m_nextpkt;
+		m->m_nextpkt = NULL;
+		NG_SEND_DATA(error, dest->hook, m, meta);
+	}
+}
+
+
+/*
+ * This routine is called on every clock tick. We poll all nodes/hooks
+ * for queued frames by calling pipe_dequeue().
+ */
+static void
+pipe_scheduler(void *arg)
+{
+	pipe_poll();
+
+	/* Reschedule */
+	callout_reset(&polling_timer, 1, &pipe_scheduler, NULL);
+}
+
+
+/*
+ * Traverse the list of all active hooks and attempt to dequeue
+ * some packets. Hooks with empty queues are not traversed since
+ * they are not linked into this list.
+ */
+static void
+pipe_poll(void)
+{
+	struct hookinfo *hinfo;
+	struct timeval now;
+	int old_gen_id;
+
+	mtx_lock(&ng_pipe_giant);
+	microuptime(&now);
+
+	/*
+	 * pipe_dequeue() returns with ng_pipe_giant released, so the list
+	 * may change underneath us.  active_gen_id is bumped on every
+	 * removal; it is sampled while the lock is held and, if it moved,
+	 * the walk restarts from the (new) head of the list.  The loop is
+	 * written out by hand because a "continue" inside LIST_FOREACH()
+	 * would step past the element we just restarted on.
+	 */
+	hinfo = LIST_FIRST(&active_head);
+	while (hinfo != NULL) {
+		CURVNET_SET(NG_HOOK_NODE(hinfo->hook)->nd_vnet);
+		old_gen_id = active_gen_id;
+		pipe_dequeue(hinfo, &now);
+		CURVNET_RESTORE();
+		mtx_lock(&ng_pipe_giant);
+		if (old_gen_id != active_gen_id) {
+			/* the list was updated; restart traversing */
+			hinfo = LIST_FIRST(&active_head);
+		} else
+			hinfo = LIST_NEXT(hinfo, active_le);
+	}
+	mtx_unlock(&ng_pipe_giant);
+}
+
+
+/*
+ * Shutdown processing
+ *
+ * This is tricky. If we have both a lower and upper hook, then we
+ * probably want to extricate ourselves and leave the two peers
+ * still linked to each other. Otherwise we should just shut down as
+ * a normal node would.
+ */
+static int
+ngp_shutdown(node_p node)
+{
+	const priv_p priv = NG_NODE_PRIVATE(node);
+
+	if (priv->lower.hook && priv->upper.hook)
+		ng_bypass(priv->lower.hook, priv->upper.hook);
+	else {
+		if (priv->upper.hook != NULL)
+			ng_rmhook_self(priv->upper.hook);
+		if (priv->lower.hook != NULL)
+			ng_rmhook_self(priv->lower.hook);
+	}
+	NG_NODE_UNREF(node);
+	FREE(priv, M_NG_PIPE);
+	return (0);
+}
+
+
+/*
+ * Hook disconnection
+ *
+ * Drops every frame still held for the hook (inbound fifo queues and the
+ * delay queue), unlinks the hook from the list of active hooks if it had
+ * anything queued, and releases its BER table.  Always returns 0.
+ */
+static int
+ngp_disconnect(hook_p hook)
+{
+	struct hookinfo *const hinfo = NG_HOOK_PRIVATE(hook);
+	struct ngp_fifo *ngp_f;
+	struct ngp_hdr *ngp_h;
+	int removed = 0;
+
+	mtx_lock(&ng_pipe_giant);
+
+	KASSERT(hinfo != NULL, ("%s: null info", __FUNCTION__));
+	hinfo->hook = NULL;
+
+	/* Flush all fifo queues associated with the hook */
+	while ((ngp_f = TAILQ_FIRST(&hinfo->fifo_head))) {
+		while ((ngp_h = TAILQ_FIRST(&ngp_f->packet_head))) {
+			TAILQ_REMOVE(&ngp_f->packet_head, ngp_h, ngp_link);
+			m_freem(ngp_h->m);
+			uma_zfree(ngp_zone, ngp_h);
+			removed++;
+		}
+		TAILQ_REMOVE(&hinfo->fifo_head, ngp_f, fifo_le);
+		uma_zfree(ngp_zone, ngp_f);
+	}
+
+	/* Flush the delay queue */
+	while ((ngp_h = TAILQ_FIRST(&hinfo->qout_head))) {
+		TAILQ_REMOVE(&hinfo->qout_head, ngp_h, ngp_link);
+		m_freem(ngp_h->m);
+		uma_zfree(ngp_zone, ngp_h);
+		removed++;
+	}
+
+	/*
+	 * Both queues should be empty by now, so detach us from
+	 * the list of active queues
+	 */
+	if (removed) {
+		LIST_REMOVE(hinfo, active_le);
+		active_gen_id++;
+	}
+	/* The counters are unsigned, hence %u for the queued total */
+	if (hinfo->run.qin_frames + hinfo->run.qout_frames != removed)
+		printf("Mismatch: queued=%u but removed=%d !?!\n",
+		    hinfo->run.qin_frames + hinfo->run.qout_frames, removed);
+
+	/* Release the packet loss probability table (BER) */
+	if (hinfo->ber_p) {
+		FREE(hinfo->ber_p, M_NG_PIPE);
+		hinfo->ber_p = NULL;	/* do not leave a dangling pointer */
+	}
+
+	mtx_unlock(&ng_pipe_giant);
+
+	return (0);
+}
+
+/*
+ * Module event handler.
+ *
+ * MOD_LOAD creates the descriptor zone (sized for the larger of
+ * struct ngp_hdr and struct ngp_fifo), the global mutex and the list of
+ * active hooks, then starts the polling callout.  MOD_UNLOAD drains the
+ * callout before destroying the zone and the mutex.  Any other event
+ * yields EOPNOTSUPP.
+ */
+static int
+ngp_modevent(module_t mod, int type, void *unused)
+{
+	int error = 0;
+
+	switch (type) {
+	case MOD_LOAD:
+		ngp_zone = uma_zcreate("ng_pipe", max(sizeof(struct ngp_hdr),
+		    sizeof (struct ngp_fifo)), NULL, NULL, NULL, NULL,
+		    UMA_ALIGN_PTR, 0);
+		if (ngp_zone == NULL)
+			panic("ng_pipe: couldn't allocate descriptor zone");
+
+		mtx_init(&ng_pipe_giant, "ng_pipe_giant", NULL, MTX_DEF);
+		LIST_INIT(&active_head);
+		callout_init(&polling_timer, CALLOUT_MPSAFE);
+		callout_reset(&polling_timer, 1, &pipe_scheduler, NULL);
+		break;
+	case MOD_UNLOAD:
+		callout_drain(&polling_timer);
+		uma_zdestroy(ngp_zone);
+		mtx_destroy(&ng_pipe_giant);
+		break;
+	default:
+		error = EOPNOTSUPP;
+		break;
+	}
+
+	return (error);
+}
--- /dev/null 2008-02-27 21:11:00.000000000 +0100
+++ src/sys/netgraph/ng_pipe.h 2007-10-05 12:27:01.000000000 +0200
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2004, 2007 University of Zagreb
+ * Copyright (c) 2007 FreeBSD Foundation
+ *
+ * This software was developed by the University of Zagreb and the
+ * FreeBSD Foundation under sponsorship by the Stichting NLnet and the
+ * FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1.
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _NETGRAPH_PIPE_H_ +#define _NETGRAPH_PIPE_H_ + +/* Node type name and magic cookie */ +#define NG_PIPE_NODE_TYPE "pipe" +#define NGM_PIPE_COOKIE 200708191 + +/* Hook names */ +#define NG_PIPE_HOOK_UPPER "upper" +#define NG_PIPE_HOOK_LOWER "lower" + +#define MAX_FSIZE 16384 /* Largest supported frame size, in bytes, for BER */ +#define MAX_OHSIZE 256 /* Largest supported dummy-framing size, in bytes */ + +/* Statistics structure for one hook */ +struct ng_pipe_hookstat { + u_int64_t fwd_octets; + u_int64_t fwd_frames; + u_int64_t in_disc_octets; + u_int64_t in_disc_frames; + u_int64_t out_disc_octets; + u_int64_t out_disc_frames; +}; + +/* Keep this in sync with the above structure definition */ +#define NG_PIPE_HOOKSTAT_INFO { \ + { "FwdOctets", &ng_parse_uint64_type }, \ + { "FwdFrames", &ng_parse_uint64_type }, \ + { "queueDropOctets", &ng_parse_uint64_type }, \ + { "queueDropFrames", &ng_parse_uint64_type }, \ + { "delayDropOctets", &ng_parse_uint64_type }, \ + { "delayDropFrames", &ng_parse_uint64_type }, \ + { NULL }, \ +} + +/* Statistics structure returned by NGM_PIPE_GET_STATS */ +struct ng_pipe_stats { + struct ng_pipe_hookstat downstream; + struct ng_pipe_hookstat upstream; +}; + +/* Keep this in sync with the above structure definition */ +#define NG_PIPE_STATS_INFO(hstype) { \ + { "downstream", (hstype) }, \ + { "upstream", (hstype) }, \ + { NULL }, \ +} + +/* Runtime structure for one hook */ +struct ng_pipe_hookrun { + u_int32_t fifo_queues; + u_int32_t qin_octets; + u_int32_t qin_frames; + u_int32_t qout_octets; + u_int32_t qout_frames; +}; + +/* Keep this in sync with the above structure definition */ +#define NG_PIPE_HOOKRUN_INFO { \ + { "queues", &ng_parse_uint32_type }, \ + { "queuedOctets", &ng_parse_uint32_type }, \ + { "queuedFrames", &ng_parse_uint32_type }, \ + { "delayedOctets", &ng_parse_uint32_type }, \ + { "delayedFrames", &ng_parse_uint32_type }, \ + { NULL }, \ +} + +/* Runtime structure returned by NGM_PIPE_GET_RUN */ 
+struct ng_pipe_run { + struct ng_pipe_hookrun downstream; + struct ng_pipe_hookrun upstream; +}; + +/* Keep this in sync with the above structure definition */ +#define NG_PIPE_RUN_INFO(hstype) { \ + { "downstream", (hstype) }, \ + { "upstream", (hstype) }, \ + { NULL }, \ +} + +/* Config structure for one hook */ +struct ng_pipe_hookcfg { + u_int64_t bandwidth; + u_int64_t ber; + u_int32_t qin_size_limit; + u_int32_t qout_size_limit; + u_int32_t duplicate; + u_int32_t fifo; + u_int32_t drr; + u_int32_t wfq; + u_int32_t droptail; + u_int32_t drophead; +}; + +/* Keep this in sync with the above structure definition */ +#define NG_PIPE_HOOKCFG_INFO { \ + { "bandwidth", &ng_parse_uint64_type }, \ + { "BER", &ng_parse_uint64_type }, \ + { "queuelen", &ng_parse_uint32_type }, \ + { "delaylen", &ng_parse_uint32_type }, \ + { "duplicate", &ng_parse_uint32_type }, \ + { "fifo", &ng_parse_uint32_type }, \ + { "drr", &ng_parse_uint32_type }, \ + { "wfq", &ng_parse_uint32_type }, \ + { "droptail", &ng_parse_uint32_type }, \ + { "drophead", &ng_parse_uint32_type }, \ + { NULL }, \ +} + +/* Config structure returned by NGM_PIPE_GET_CFG */ +struct ng_pipe_cfg { + u_int64_t bandwidth; + u_int64_t delay; + u_int32_t header_offset; + u_int32_t overhead; + struct ng_pipe_hookcfg downstream; + struct ng_pipe_hookcfg upstream; +}; + +/* Keep this in sync with the above structure definition */ +#define NG_PIPE_CFG_INFO(hstype) { \ + { "bandwidth", &ng_parse_uint64_type }, \ + { "delay", &ng_parse_uint64_type }, \ + { "header_offset", &ng_parse_uint32_type }, \ + { "overhead", &ng_parse_uint32_type }, \ + { "downstream", (hstype) }, \ + { "upstream", (hstype) }, \ + { NULL }, \ +} + +/* Netgraph commands */ +enum { + NGM_PIPE_GET_STATS=1, /* get stats */ + NGM_PIPE_CLR_STATS, /* clear stats */ + NGM_PIPE_GETCLR_STATS, /* atomically get and clear stats */ + NGM_PIPE_GET_RUN, /* get current runtime status */ + NGM_PIPE_GET_CFG, /* get configurable parameters */ + NGM_PIPE_SET_CFG, /* set 
configurable parameters */ +}; + +#endif /* _NETGRAPH_PIPE_H_ */ --- /dev/null 2008-02-27 21:11:00.000000000 +0100 +++ src/sys/netgraph/ng_wormhole.c 2007-10-22 18:06:39.000000000 +0200 @@ -0,0 +1,451 @@ +/*- + * Copyright (c) 2007 University of Zagreb + * Copyright (c) 2007 FreeBSD Foundation + * + * This software was developed by the University of Zagreb and the + * FreeBSD Foundation under sponsorship by the Stichting NLnet and the + * FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * XXX RCS tag goes here + */ + +/* + * A "worm" node can be used to establish a datapath between independent + * netgraph address spaces, i.e. between two virtual network stacks. 
A + * wormhole path is defined by a pair of wormhole nodes each residing in + * a different stack instance. Each node accepts only a single + * arbitrarily named hook. Once a wormhole datapath is established, all + * data messages received on the local hook will be forwarded to the + * hook connected to the remote node, and vice versa. + * + * "worm" nodes understand two node-specific messages: "peer" and + * "status". The "peer" message is used to specify the remote + * endpoint in form of "remote_worm_node_name@remote_vnet_name", or + * to fetch the current peering configuration if invoked without + * arguments. Both involved nodes must configure their peerings before + * the datapath will be established. The "status" command can be used + * to check the current state of the wormhole path, which can be one of + * unconfigured, pending or active. + * + * NB while the vnet addressing space is currently flat, it is reasonable + * to expect that this could change in the nearest future, which may be + * reflected in the addressing model for ng_wormhole datapaths. 
+ * + * The following example shows how a netgraph path can be established + * between two network stack instances, "1" and "2": + * + * #!/bin/csh + * + * foreach vi (1 2) + * vimage -c $vi + * vimage $vi ngctl mkpeer eiface ether ether + * vimage $vi ngctl mkpeer ngeth0: worm ether ether + * vimage $vi ifconfig ngeth0 ether 40:0:0:0:0:$vi + * vimage $vi ifconfig ngeth0 10.0.0.$vi/24 + * end + * vimage 1 ngctl msg worm0: peer worm0@2 + * vimage 2 ngctl msg worm0: peer worm0@1 + * + */ + +#include "opt_vimage.h" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +struct ng_wormhole; +typedef struct ng_wormhole_priv *priv_p; + +#define NG_WORMHOLE_NODE_TYPE "worm" +#define NGM_WORMHOLE_COOKIE 20070806 + +static int ng_wormhole_mod_event(module_t, int, void *); +static ng_constructor_t ng_wormhole_constructor; +static ng_shutdown_t ng_wormhole_shutdown; +static ng_newhook_t ng_wormhole_newhook; +static ng_disconnect_t ng_wormhole_disconnect; +static ng_rcvdata_t ng_wormhole_rcvdata; +static ng_rcvmsg_t ng_wormhole_rcvmsg; +static vnet_attach_fn ng_wormhole_iattach; +static vnet_detach_fn ng_wormhole_idetach; +static void ng_wormhole_update_status(priv_p); +static ng_parse_t ng_wormhole_peer_parse; +static ng_unparse_t ng_wormhole_peer_unparse; +static ng_unparse_t ng_wormhole_status_unparse; + +/* Node state */ +enum { + NG_WORMHOLE_UNCONFIGURED = 0, + NG_WORMHOLE_PENDING, + NG_WORMHOLE_ACTIVE +}; + +/* Netgraph commands */ +enum { + NGM_WORMHOLE_PEER = 1, + NGM_WORMHOLE_STATUS +}; + +static const struct ng_parse_type ng_wormhole_peer_type = { + .parse = &ng_wormhole_peer_parse, + .unparse = &ng_wormhole_peer_unparse, +}; + +static const struct ng_parse_type ng_wormhole_status_type = { + .unparse = &ng_wormhole_status_unparse, +}; + +static const struct ng_cmdlist ng_wormhole_cmds[] = { + { + .cookie = NGM_WORMHOLE_COOKIE, + .cmd = NGM_WORMHOLE_PEER, + .name = "peer", + .mesgType = &ng_wormhole_peer_type, + 
.respType = &ng_wormhole_peer_type, + }, + { + .cookie = NGM_WORMHOLE_COOKIE, + .cmd = NGM_WORMHOLE_STATUS, + .name = "status", + .respType = &ng_wormhole_status_type, + }, + { 0 } +}; + +static struct ng_type typestruct = { + .version = NG_ABI_VERSION, + .name = NG_WORMHOLE_NODE_TYPE, + .mod_event = ng_wormhole_mod_event, + .constructor = ng_wormhole_constructor, + .rcvmsg = ng_wormhole_rcvmsg, + .shutdown = ng_wormhole_shutdown, + .newhook = ng_wormhole_newhook, + .rcvdata = ng_wormhole_rcvdata, + .disconnect = ng_wormhole_disconnect, + .cmdlist = ng_wormhole_cmds +}; +NETGRAPH_INIT(ng_wormhole, &typestruct); + +VNET_MOD_DECLARE_STATELESS(NG_WORMHOLE, ng_wormhole, ng_wormhole_iattach, + ng_wormhole_idetach, NETGRAPH) + +struct ng_wormhole_priv { + int status; /* NG_WORMHOLE_UNCONFIGURED, _PENDING or _ACTIVE */ + priv_p remote_priv; /* peer's private data; NULL while no peer is set */ + struct vnet *vnet; /* curvnet at the time the node was constructed */ + hook_p hook; /* the node's single hook; NULL when none is attached */ + node_p node; /* the netgraph node this state belongs to */ + LIST_ENTRY(ng_wormhole_priv) all_wormholes_le; + int unit; /* unit number taken from V_ng_wormhole_unit; part of the node name */ +}; + +LIST_HEAD(, ng_wormhole_priv) all_wormholes_head; +/* XXX need a lock around the above list */ + +static int +ng_wormhole_constructor(node_p node) +{ + INIT_VNET_NETGRAPH(curvnet); + priv_p priv; + char buf[NG_NODESIZ]; + + MALLOC(priv, priv_p, sizeof(*priv), M_NETGRAPH, M_ZERO | M_NOWAIT); + if (priv == NULL) + return (ENOMEM); + + NG_NODE_SET_PRIVATE(node, priv); + priv->unit = alloc_unr(V_ng_wormhole_unit); /* NOTE(review): result is used unchecked -- confirm alloc_unr() cannot fail here */ + snprintf(buf, NG_NODESIZ, "%s%d", typestruct.name, priv->unit); + if (ng_name_node(node, buf) != 0) + log(LOG_WARNING, "%s: can't acquire netgraph name\n", buf); /* a naming failure is only logged; construction continues */ + priv->vnet = curvnet; + priv->node = node; + priv->hook = NULL; + priv->status = NG_WORMHOLE_UNCONFIGURED; + LIST_INSERT_HEAD(&all_wormholes_head, priv, all_wormholes_le); + return (0); +} + +static int +ng_wormhole_newhook(node_p node, hook_p hook, const char *name) +{ + priv_p priv = NG_NODE_PRIVATE(node); + + if (priv->hook) + return(EBUSY); /* a hook is already attached; only one is accepted */ + priv->hook = hook; + ng_wormhole_update_status(priv); /* a new hook may complete the path */ + return (0); +} + +static int +ng_wormhole_disconnect(hook_p hook) +{ + priv_p priv = 
NG_NODE_PRIVATE(hook->hk_node); + + priv->hook = NULL; + ng_wormhole_update_status(priv); + return (0); +} + +static int +ng_wormhole_rcvmsg(node_p node, item_p item, hook_p lasthook) +{ + priv_p priv = NG_NODE_PRIVATE(node); + priv_p *remote_priv; + struct ng_mesg *resp = NULL; + struct ng_mesg *msg; + int error = 0; + + NGI_GET_MSG(item, msg); + switch (msg->header.typecookie) { + case NGM_WORMHOLE_COOKIE: + switch (msg->header.cmd) { + case NGM_WORMHOLE_PEER: + remote_priv = (priv_p *) &msg->data; /* payload is a raw priv_p as written by ng_wormhole_peer_parse(); NOTE(review): the pointer is trusted as-is, so a hand-built binary message could supply an arbitrary address -- confirm and validate against all_wormholes_head */ + if (msg->header.arglen >= sizeof(priv_p) && *remote_priv) { /* never read the payload unless it is large enough to hold a pointer; a short or empty payload is treated as a query */ + if (*remote_priv == priv) + error = EINVAL; /* a node cannot peer with itself */ + else + priv->remote_priv = *remote_priv; + /* XXX drop all wormhole lock */ + ng_wormhole_update_status(priv); + } else { + NG_MKRESPONSE(resp, msg, + sizeof(priv->remote_priv), M_NOWAIT); /* query: report the currently configured peer */ + if (resp == NULL) + error = ENOMEM; + else + bcopy(&priv->remote_priv, resp->data, + sizeof(priv->remote_priv)); + } + + break; + case NGM_WORMHOLE_STATUS: + NG_MKRESPONSE(resp, msg, + sizeof(priv->status), M_NOWAIT); + if (resp == NULL) + error = ENOMEM; + else + bcopy(&priv->status, resp->data, + sizeof(priv->status)); + break; + default: + error = EINVAL; + break; + } + break; + default: + error = EINVAL; + break; + } + NG_RESPOND_MSG(error, node, item, resp); + NG_FREE_MSG(msg); + return (error); +} + +static int +ng_wormhole_peer_parse(const struct ng_parse_type *type, + const char *s, int *off, const u_char *const start, + u_char *const buf, int *buflen) +{ + char node_name_buf[NG_NODESIZ]; + char *t; + int len; + int error = 0; + priv_p *remote_priv = (priv_p *)buf; + + *buflen = sizeof(priv_p); /* NOTE(review): the incoming *buflen is overwritten without being compared to sizeof(priv_p) -- confirm buf is always large enough */ + + while (isspace(s[*off])) + (*off)++; + if (strlen(&s[*off]) == 0) { + /* XXX to drop or not to drop the lock? 
*/ + *remote_priv = NULL; + return (error); + } + if ((t = index(s + *off, '@')) == NULL) + return (EINVAL); + if ((len = t - (s + *off)) > sizeof(node_name_buf) - 1) + return (EINVAL); + strncpy(node_name_buf, s + *off, len); + node_name_buf[len] = '\0'; + *off += len + 1; /* vnet name should be in &s[*off] now */ + + /* XXX should lock all wormhole list here */ + LIST_FOREACH(*remote_priv, &all_wormholes_head, all_wormholes_le) + if (strcmp((*remote_priv)->node->nd_name, node_name_buf) == 0 && + strcmp(vnet_name((*remote_priv)->vnet), &s[*off]) == 0) + break; + if (*remote_priv) { + /* XXX should return with the lock held, drop it in rcvmsg */ + } else { + error = ENOENT; + /* XXX should unlock the all wormholes list now */ + } + return (error); +} + +static int +ng_wormhole_peer_unparse(const struct ng_parse_type *type, + const u_char *data, int *off, char *cbuf, int cbuflen) +{ + const priv_p *remote_priv = (const priv_p *)(data + *off); + + if (*remote_priv) { + /* XXX lock all wormhole list; check whether remote exists */ + snprintf(cbuf, cbuflen, "%s@%s", + (*remote_priv)->node->nd_name, + vnet_name((*remote_priv)->vnet)); + *off += sizeof(*remote_priv); + } /* NOTE(review): with a NULL peer neither cbuf nor *off is touched -- presumably the caller pre-zeroes cbuf; confirm */ + return (0); +} + +static int +ng_wormhole_status_unparse(const struct ng_parse_type *type, + const u_char *data, int *off, char *cbuf, int cbuflen) +{ + const int *status = (const int *)(data + *off); + + switch (*status) { + case NG_WORMHOLE_UNCONFIGURED: + snprintf(cbuf, cbuflen, "unconfigured"); + break; + case NG_WORMHOLE_PENDING: + snprintf(cbuf, cbuflen, "pending"); + break; + case NG_WORMHOLE_ACTIVE: + snprintf(cbuf, cbuflen, "active"); + break; + default: + return (EINVAL); /* the value comes from message data, so reject an unknown status instead of panicking the kernel */ + } + *off += sizeof(*status); + return (0); +} + +static void +ng_wormhole_update_status(priv_p priv) +{ + priv_p remote_priv; + + /* XXX lock / unlock the all wormhole list while doing this */ + remote_priv = priv->remote_priv; + if (remote_priv == NULL) + priv->status = NG_WORMHOLE_UNCONFIGURED; + else 
if (remote_priv->remote_priv != priv) + priv->status = NG_WORMHOLE_PENDING; /* the peer does not point back at this node */ + else if (remote_priv->hook == NULL || priv->hook == NULL) + priv->status = remote_priv->status = NG_WORMHOLE_PENDING; /* mutually peered, but a hook is missing on one side */ + else + priv->status = remote_priv->status = NG_WORMHOLE_ACTIVE; +} + +static int +ng_wormhole_rcvdata(hook_p hook, item_p item) +{ + priv_p priv = NG_NODE_PRIVATE(NG_HOOK_NODE(hook)); + int error = 0; + priv_p remote_priv = priv->remote_priv; + struct mbuf *m; + + if (priv->status != NG_WORMHOLE_ACTIVE) { + NG_FREE_ITEM(item); /* no active path: the data is dropped */ + error = ENOTCONN; + } else { + m = NGI_M(item); + m->m_flags |= M_REMOTE_VNET; + CURVNET_SET_QUIET(remote_priv->vnet); /* forward with the peer's vnet as curvnet */ + NG_FWD_ITEM_HOOK(error, item, remote_priv->hook); + CURVNET_RESTORE(); + } + return (error); +} + +static int +ng_wormhole_shutdown(node_p node) +{ + priv_p priv = NG_NODE_PRIVATE(node); + INIT_VNET_NETGRAPH(priv->vnet); + + LIST_REMOVE(priv, all_wormholes_le); + free_unr(V_ng_wormhole_unit, priv->unit); + FREE(priv, M_NETGRAPH); /* NOTE(review): nothing here clears remote_priv (or resets status) in nodes that still point at this priv, so ng_wormhole_update_status() and ng_wormhole_rcvdata() on a peer look able to dereference freed memory -- confirm and fix */ + NG_NODE_SET_PRIVATE(node, NULL); + NG_NODE_UNREF(node); + return (0); +} + +static int +ng_wormhole_mod_event(module_t mod, int event, void *data) +{ + int error = 0; + + switch (event) { + case MOD_LOAD: + vnet_mod_register(&vnet_ng_wormhole_modinfo); + break; + case MOD_UNLOAD: + vnet_mod_deregister(&vnet_ng_wormhole_modinfo); + break; + default: + error = EOPNOTSUPP; + break; + } + return (error); +} + +static int ng_wormhole_iattach(const void *unused) +{ + INIT_VNET_NETGRAPH(curvnet); + + V_ng_wormhole_unit = new_unrhdr(0, 0xffff, NULL); /* unit numbers 0..0xffff for this vnet's worm nodes */ + return (0); +} + +static int ng_wormhole_idetach(const void *unused) +{ + INIT_VNET_NETGRAPH(curvnet); + node_p node; + + do { /* rescan V_ng_nodelist from the head after each removal request */ + LIST_FOREACH(node, &V_ng_nodelist, nd_nodes) + if (node->nd_type == &typestruct) { + ng_rmnode_self(node); + break; + } + } while (node != NULL); /* node is NULL only when a full scan found no worm node */ + delete_unrhdr(V_ng_wormhole_unit); + return (0); +} --- /dev/null 2008-02-27 21:11:00.000000000 +0100 +++ src/sys/netgraph/vnetgraph.h 2007-10-05 12:27:01.000000000 +0200 @@ -0,0 +1,65 @@ 
+/*- + * Copyright (c) 2006 University of Zagreb + * Copyright (c) 2006 FreeBSD Foundation + * + * This software was developed by the University of Zagreb and the + * FreeBSD Foundation under sponsorship by the Stichting NLnet and the + * FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * XXX RCS tag goes here + */ + +#ifndef _NETGRAPH_VNETGRAPH_H_ +#define _NETGRAPH_VNETGRAPH_H_ + +#include + +#define INIT_VNET_NETGRAPH(vnet) \ + INIT_FROM_VNET(vnet, VNET_MOD_NETGRAPH, \ + struct vnet_netgraph, vnet_netgraph) + +#define VNET_NETGRAPH(sym) VSYM(vnet_netgraph, sym) + +#define NG_ID_HASH_SIZE 32 /* most systems won't need even this many */ + +#ifdef VIMAGE +struct vnet_netgraph { /* netgraph state kept per vnet; members are reached through the V_ macros below */ + LIST_HEAD(, ng_node) _ng_ID_hash[NG_ID_HASH_SIZE]; + LIST_HEAD(, ng_node) _ng_nodelist; + ng_ID_t _nextID; + struct unrhdr *_ng_iface_unit; + struct unrhdr *_ng_eiface_unit; + struct unrhdr *_ng_wormhole_unit; +}; +#endif + +/* Symbol translation macros */ +#define V_ng_ID_hash VNET_NETGRAPH(ng_ID_hash) +#define V_ng_nodelist VNET_NETGRAPH(ng_nodelist) +#define V_nextID VNET_NETGRAPH(nextID) +#define V_ng_iface_unit VNET_NETGRAPH(ng_iface_unit) +#define V_ng_eiface_unit VNET_NETGRAPH(ng_eiface_unit) +#define V_ng_wormhole_unit VNET_NETGRAPH(ng_wormhole_unit) + +#endif /* !_NETGRAPH_VNETGRAPH_H_ */ --- /u/marko/p4/head/src/sys/netinet/icmp_var.h 2007-08-31 03:47:59.000000000 +0200 +++ src/sys/netinet/icmp_var.h 2007-10-05 12:27:01.000000000 +0200 @@ -74,7 +74,9 @@ #ifdef _KERNEL SYSCTL_DECL(_net_inet_icmp); +#ifndef VIMAGE extern struct icmpstat icmpstat; /* icmp statistics */ +#endif extern int badport_bandlim(int); #define BANDLIM_UNLIMITED -1 #define BANDLIM_ICMP_UNREACH 0 --- /u/marko/p4/head/src/sys/netinet/if_ether.c 2008-01-04 13:50:39.000000000 +0100 +++ src/sys/netinet/if_ether.c 2008-01-14 19:23:52.000000000 +0100 @@ -41,6 +41,7 @@ #include "opt_inet.h" #include "opt_mac.h" #include "opt_carp.h" +#include "opt_vimage.h" #include #include @@ -49,9 +50,12 @@ #include #include #include +#include #include #include +#include +#include #include #include #include @@ -60,6 +64,7 @@ #include #include +#include #include #include #include @@ -80,10 +85,12 @@ SYSCTL_NODE(_net_link_ether, PF_INET, inet, CTLFLAG_RW, 0, ""); /* timer values */ -static int arpt_keep = 
(20*60); /* once resolved, good for 20 more minutes */ +#ifndef VIMAGE +static int arpt_keep; +#endif -SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_age, CTLFLAG_RW, - &arpt_keep, 0, "ARP entry lifetime in seconds"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_link_ether_inet, OID_AUTO, max_age, + CTLFLAG_RW, arpt_keep, 0, "ARP entry lifetime in seconds"); #define rt_expire rt_rmx.rmx_expire @@ -96,20 +103,25 @@ }; static struct ifqueue arpintrq; -static int arp_allocated; -static int arp_maxtries = 5; -static int useloopback = 1; /* use loopback interface for local traffic */ -static int arp_proxyall = 0; +#ifndef VIMAGE +static int arp_maxtries; +static int useloopback; +static int arp_proxyall; +#endif -SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxtries, CTLFLAG_RW, - &arp_maxtries, 0, "ARP resolution attempts before returning error"); -SYSCTL_INT(_net_link_ether_inet, OID_AUTO, useloopback, CTLFLAG_RW, - &useloopback, 0, "Use the loopback interface for local traffic"); -SYSCTL_INT(_net_link_ether_inet, OID_AUTO, proxyall, CTLFLAG_RW, - &arp_proxyall, 0, "Enable proxy ARP for all suitable requests"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_link_ether_inet, OID_AUTO, maxtries, + CTLFLAG_RW, arp_maxtries, 0, + "ARP resolution attempts before returning error"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_link_ether_inet, OID_AUTO, useloopback, + CTLFLAG_RW, useloopback, 0, + "Use the loopback interface for local traffic"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_link_ether_inet, OID_AUTO, proxyall, + CTLFLAG_RW, arp_proxyall, 0, + "Enable proxy ARP for all suitable requests"); static void arp_init(void); +static int arp_iattach(const void *); static void arp_rtrequest(int, struct rtentry *, struct rt_addrinfo *); static void arprequest(struct ifnet *, struct in_addr *, struct in_addr *, u_char *); @@ -121,6 +133,8 @@ static void in_arpinput(struct mbuf *); #endif +VNET_MOD_DECLARE_STATELESS(ARP, arp, arp_iattach, NULL, INET) + /* * Timeout routine. 
*/ @@ -138,7 +152,9 @@ */ RT_UNLOCK(rt); + CURVNET_SET(rt->rt_ifp->if_vnet); rtrequest(RTM_DELETE, rt_key(rt), NULL, rt_mask(rt), 0, NULL); + CURVNET_RESTORE(); } /* @@ -147,6 +163,8 @@ static void arp_rtrequest(int req, struct rtentry *rt, struct rt_addrinfo *info) { + INIT_VNET_NET(curvnet); + INIT_VNET_INET(curvnet); struct sockaddr *gate; struct llinfo_arp *la; static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK}; @@ -213,7 +231,6 @@ log(LOG_DEBUG, "%s: malloc failed\n", __func__); break; } - arp_allocated++; /* * We are storing a route entry outside of radix tree. So, * it can be found and accessed by other means than radix @@ -248,7 +265,7 @@ } #endif - TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) { + TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (ia->ia_ifp == rt->rt_ifp && SIN(rt_key(rt))->sin_addr.s_addr == (IA_SIN(ia))->sin_addr.s_addr) @@ -268,9 +285,9 @@ rt->rt_expire = 0; bcopy(IF_LLADDR(rt->rt_ifp), LLADDR(SDL(gate)), SDL(gate)->sdl_alen = rt->rt_ifp->if_addrlen); - if (useloopback) { - rt->rt_ifp = loif; - rt->rt_rmx.rmx_mtu = loif->if_mtu; + if (V_useloopback) { + rt->rt_ifp = V_loif; + rt->rt_rmx.rmx_mtu = V_loif->if_mtu; } /* @@ -358,6 +375,7 @@ arpresolve(struct ifnet *ifp, struct rtentry *rt0, struct mbuf *m, struct sockaddr *dst, u_char *desten) { + INIT_VNET_INET(ifp->if_vnet); struct llinfo_arp *la = NULL; struct rtentry *rt = NULL; struct sockaddr_dl *sdl; @@ -468,7 +486,7 @@ * if we have already sent arp_maxtries ARP requests. Retransmit the * ARP request, but not faster than one request per second. */ - if (la->la_asked < arp_maxtries) + if (la->la_asked < V_arp_maxtries) error = EWOULDBLOCK; /* First request. */ else error = (rt == rt0) ? EHOSTDOWN : EHOSTUNREACH; @@ -589,7 +607,8 @@ sin.sin_len = sizeof(struct sockaddr_in); sin.sin_family = AF_INET; sin.sin_addr.s_addr = 0; - + INIT_VNET_INET(ifp->if_vnet); + if (ifp->if_bridge) bridged = 1; @@ -644,7 +663,7 @@ /* * If bridging, fall back to using any inet address. 
*/ - if (!bridged || (ia = TAILQ_FIRST(&in_ifaddrhead)) == NULL) + if (!bridged || (ia = TAILQ_FIRST(&V_in_ifaddrhead)) == NULL) goto drop; match: if (!enaddr) @@ -780,11 +799,11 @@ th->rcf = trld->trld_rcf; } if (rt->rt_expire) { - rt->rt_expire = time_uptime + arpt_keep; - callout_reset(&la->la_timer, hz * arpt_keep, arptimer, rt); + rt->rt_expire = time_uptime + V_arpt_keep; + callout_reset(&la->la_timer, hz * V_arpt_keep, arptimer, rt); } la->la_asked = 0; - la->la_preempt = arp_maxtries; + la->la_preempt = V_arp_maxtries; hold = la->la_hold; la->la_hold = NULL; RT_UNLOCK(rt); @@ -803,7 +822,7 @@ if (rt == NULL) { struct sockaddr_in sin; - if (!arp_proxyall) + if (!V_arp_proxyall) goto drop; bzero(&sin, sizeof sin); @@ -971,12 +990,30 @@ ifa->ifa_flags |= RTF_CLONING; } +static int +arp_iattach(const void *unused) +{ + INIT_VNET_INET(curvnet); /* the V_ variables below are set for curvnet */ + + V_arpt_keep = (20*60); /* once resolved, good for 20 more minutes */ + V_arp_maxtries = 5; + V_useloopback = 1; /* use loopback interface for local traffic */ + V_arp_proxyall = 0; + + return (0); +} + static void arp_init(void) { - +#ifdef VIMAGE + vnet_mod_register(&vnet_arp_modinfo); +#else + arp_iattach(NULL); +#endif arpintrq.ifq_maxlen = 50; mtx_init(&arpintrq.ifq_mtx, "arp_inq", NULL, MTX_DEF); netisr_register(NETISR_ARP, arpintr, &arpintrq, NETISR_MPSAFE); } + SYSINIT(arp, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, arp_init, 0); --- /u/marko/p4/head/src/sys/netinet/igmp.c 2007-10-29 17:17:43.000000000 +0100 +++ src/sys/netinet/igmp.c 2007-12-10 11:26:11.000000000 +0100 @@ -48,6 +48,7 @@ __FBSDID("$FreeBSD: src/sys/netinet/igmp.c,v 1.56 2007/10/28 15:55:21 rwatson Exp $"); #include "opt_mac.h" +#include "opt_vimage.h" #include #include @@ -57,10 +58,13 @@ #include #include #include +#include +#include #include #include +#include #include #include #include @@ -79,10 +83,12 @@ static struct router_info *find_rti(struct ifnet *ifp); static void igmp_sendpkt(struct in_multi *, int, unsigned long); +#ifndef VIMAGE 
static struct igmpstat igmpstat; +#endif -SYSCTL_STRUCT(_net_inet_igmp, IGMPCTL_STATS, stats, CTLFLAG_RW, &igmpstat, - igmpstat, ""); +SYSCTL_V_STRUCT(V_NET, vnet_inet, _net_inet_igmp, IGMPCTL_STATS, + stats, CTLFLAG_RW, igmpstat, igmpstat, ""); /* * igmp_mtx protects all mutable global variables in igmp.c, as well as the @@ -92,7 +98,9 @@ * when accessed via an in_multi read-only. */ static struct mtx igmp_mtx; +#ifndef VIMAGE static SLIST_HEAD(, router_info) router_info_head; +#endif static int igmp_timers_are_running; /* @@ -115,8 +123,12 @@ void igmp_init(void) { + INIT_VNET_INET(curvnet); struct ipoption *ra; +#ifdef VIMAGE + if (IS_DEFAULT_VNET(curvnet)) { +#endif /* * To avoid byte-swapping the same value over and over again. */ @@ -138,17 +150,22 @@ router_alert->m_len = sizeof(ra->ipopt_dst) + ra->ipopt_list[1]; mtx_init(&igmp_mtx, "igmp_mtx", NULL, MTX_DEF); - SLIST_INIT(&router_info_head); +#ifdef VIMAGE + } +#endif + + SLIST_INIT(&V_router_info_head); } static struct router_info * find_rti(struct ifnet *ifp) { + INIT_VNET_INET(ifp->if_vnet); struct router_info *rti; mtx_assert(&igmp_mtx, MA_OWNED); IGMP_PRINTF("[igmp.c, _find_rti] --> entering \n"); - SLIST_FOREACH(rti, &router_info_head, rti_list) { + SLIST_FOREACH(rti, &V_router_info_head, rti_list) { if (rti->rti_ifp == ifp) { IGMP_PRINTF( "[igmp.c, _find_rti] --> found old entry \n"); @@ -163,7 +180,7 @@ rti->rti_ifp = ifp; rti->rti_type = IGMP_V2_ROUTER; rti->rti_time = 0; - SLIST_INSERT_HEAD(&router_info_head, rti, rti_list); + SLIST_INSERT_HEAD(&V_router_info_head, rti, rti_list); IGMP_PRINTF("[igmp.c, _find_rti] --> created an entry \n"); return (rti); } @@ -182,8 +199,9 @@ struct in_multistep step; struct router_info *rti; int timer; /** timer value in the igmp query header **/ + INIT_VNET_INET(ifp->if_vnet); - ++igmpstat.igps_rcv_total; + ++V_igmpstat.igps_rcv_total; ip = mtod(m, struct ip *); igmplen = ip->ip_len; @@ -192,14 +210,14 @@ * Validate lengths. 
*/ if (igmplen < IGMP_MINLEN) { - ++igmpstat.igps_rcv_tooshort; + ++V_igmpstat.igps_rcv_tooshort; m_freem(m); return; } minlen = iphlen + IGMP_MINLEN; if ((m->m_flags & M_EXT || m->m_len < minlen) && (m = m_pullup(m, minlen)) == 0) { - ++igmpstat.igps_rcv_tooshort; + ++V_igmpstat.igps_rcv_tooshort; return; } @@ -210,7 +228,7 @@ m->m_len -= iphlen; igmp = mtod(m, struct igmp *); if (in_cksum(m, igmplen)) { - ++igmpstat.igps_rcv_badsum; + ++V_igmpstat.igps_rcv_badsum; m_freem(m); return; } @@ -235,7 +253,7 @@ */ switch (igmp->igmp_type) { case IGMP_MEMBERSHIP_QUERY: - ++igmpstat.igps_rcv_queries; + ++V_igmpstat.igps_rcv_queries; if (ifp->if_flags & IFF_LOOPBACK) break; @@ -262,7 +280,7 @@ if (ip->ip_dst.s_addr != igmp_all_hosts_group || igmp->igmp_group.s_addr != 0) { - ++igmpstat.igps_rcv_badqueries; + ++V_igmpstat.igps_rcv_badqueries; m_freem(m); return; } @@ -273,7 +291,7 @@ if (igmp->igmp_group.s_addr != 0 && !IN_MULTICAST(ntohl(igmp->igmp_group.s_addr))) { - ++igmpstat.igps_rcv_badqueries; + ++V_igmpstat.igps_rcv_badqueries; m_freem(m); return; } @@ -321,13 +339,13 @@ ip->ip_src.s_addr == IA_SIN(ia)->sin_addr.s_addr) break; - ++igmpstat.igps_rcv_reports; + ++V_igmpstat.igps_rcv_reports; if (ifp->if_flags & IFF_LOOPBACK) break; if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr))) { - ++igmpstat.igps_rcv_badreports; + ++V_igmpstat.igps_rcv_badreports; m_freem(m); return; } @@ -354,7 +372,7 @@ IN_LOOKUP_MULTI(igmp->igmp_group, ifp, inm); if (inm != NULL) { inm->inm_timer = 0; - ++igmpstat.igps_rcv_ourreports; + ++V_igmpstat.igps_rcv_ourreports; inm->inm_state = IGMP_OTHERMEMBER; } IN_MULTI_UNLOCK(); @@ -422,6 +440,8 @@ IN_MULTI_LOCK(); igmp_timers_are_running = 0; + VNET_ITERLOOP_BEGIN(); + INIT_VNET_INET(vnet_iter); IN_FIRST_MULTI(step, inm); while (inm != NULL) { if (inm->inm_timer == 0) { @@ -434,6 +454,7 @@ } IN_NEXT_MULTI(step, inm); } + VNET_ITERLOOP_END(); IN_MULTI_UNLOCK(); } @@ -444,13 +465,16 @@ IGMP_PRINTF("[igmp.c,_slowtimo] -- > entering \n"); 
mtx_lock(&igmp_mtx); - SLIST_FOREACH(rti, &router_info_head, rti_list) { + VNET_ITERLOOP_BEGIN() + INIT_VNET_INET(vnet_iter); + SLIST_FOREACH(rti, &V_router_info_head, rti_list) { if (rti->rti_type == IGMP_V1_ROUTER) { rti->rti_time++; if (rti->rti_time >= IGMP_AGE_THRESHOLD) rti->rti_type = IGMP_V2_ROUTER; } } + VNET_ITERLOOP_END() mtx_unlock(&igmp_mtx); IGMP_PRINTF("[igmp.c,_slowtimo] -- > exiting \n"); } @@ -458,6 +482,8 @@ static void igmp_sendpkt(struct in_multi *inm, int type, unsigned long addr) { + INIT_VNET_NET(curvnet); + INIT_VNET_INET(curvnet); struct mbuf *m; struct igmp *igmp; struct ip *ip; @@ -469,7 +495,7 @@ if (m == NULL) return; - m->m_pkthdr.rcvif = loif; + m->m_pkthdr.rcvif = V_loif; #ifdef MAC mac_netinet_igmp_send(inm->inm_ifp, m); #endif @@ -501,12 +527,12 @@ * Request loopback of the report if we are acting as a multicast * router, so that the process-level routing daemon can hear it. */ - imo.imo_multicast_loop = (ip_mrouter != NULL); + imo.imo_multicast_loop = (V_ip_mrouter != NULL); /* * XXX: Do we have to worry about reentrancy here? Don't think so. 
*/ ip_output(m, router_alert, &igmprt, 0, &imo, NULL); - ++igmpstat.igps_snd_reports; + ++V_igmpstat.igps_snd_reports; } --- /u/marko/p4/head/src/sys/netinet/in.c 2008-01-28 23:53:52.000000000 +0100 +++ src/sys/netinet/in.c 2008-02-27 11:49:02.000000000 +0100 @@ -34,6 +34,7 @@ __FBSDID("$FreeBSD: src/sys/netinet/in.c,v 1.103 2008/01/24 08:14:38 bz Exp $"); #include "opt_carp.h" +#include "opt_vimage.h" #include #include @@ -43,11 +44,14 @@ #include #include #include +#include +#include #include #include #include +#include #include #include #include @@ -65,16 +69,19 @@ struct in_ifaddr *, struct sockaddr_in *, int); static void in_purgemaddrs(struct ifnet *); -static int subnetsarelocal = 0; -SYSCTL_INT(_net_inet_ip, OID_AUTO, subnets_are_local, CTLFLAG_RW, - &subnetsarelocal, 0, "Treat all subnets as directly connected"); -static int sameprefixcarponly = 0; -SYSCTL_INT(_net_inet_ip, OID_AUTO, same_prefix_carp_only, CTLFLAG_RW, - &sameprefixcarponly, 0, - "Refuse to create same prefixes on different interfaces"); - +#ifndef VIMAGE +static int subnetsarelocal; +static int sameprefixcarponly; extern struct inpcbinfo ripcbinfo; extern struct inpcbinfo udbinfo; +#endif + +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, subnets_are_local, + CTLFLAG_RW, subnetsarelocal, 0, + "Treat all subnets as directly connected"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, same_prefix_carp_only, + CTLFLAG_RW, sameprefixcarponly, 0, + "Refuse to create same prefixes on different interfaces"); /* * Return 1 if an internet address is for a ``local'' host @@ -85,15 +92,16 @@ int in_localaddr(struct in_addr in) { + INIT_VNET_INET(curvnet); register u_long i = ntohl(in.s_addr); register struct in_ifaddr *ia; - if (subnetsarelocal) { - TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) + if (V_subnetsarelocal) { + TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) if ((i & ia->ia_netmask) == ia->ia_net) return (1); } else { - TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) + 
TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) if ((i & ia->ia_subnetmask) == ia->ia_subnet) return (1); } @@ -107,6 +115,7 @@ int in_localip(struct in_addr in) { + INIT_VNET_INET(curvnet); struct in_ifaddr *ia; LIST_FOREACH(ia, INADDR_HASH(in.s_addr), ia_hash) { @@ -199,6 +208,7 @@ in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { + INIT_VNET_INET(curvnet); /* so and ifp can be 0 ! */ register struct ifreq *ifr = (struct ifreq *)data; register struct in_ifaddr *ia = 0, *iap; register struct ifaddr *ifa; @@ -328,7 +338,7 @@ } ia->ia_ifp = ifp; - TAILQ_INSERT_TAIL(&in_ifaddrhead, ia, ia_link); + TAILQ_INSERT_TAIL(&V_in_ifaddrhead, ia, ia_link); splx(s); iaIsNew = 1; } @@ -492,7 +502,7 @@ */ s = splnet(); TAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifa_link); - TAILQ_REMOVE(&in_ifaddrhead, ia, ia_link); + TAILQ_REMOVE(&V_in_ifaddrhead, ia, ia_link); if (ia->ia_addr.sin_family == AF_INET) { LIST_REMOVE(ia, ia_hash); /* @@ -707,6 +717,7 @@ in_ifinit(struct ifnet *ifp, struct in_ifaddr *ia, struct sockaddr_in *sin, int scrub) { + INIT_VNET_INET(ifp->if_vnet); register u_long i = ntohl(sin->sin_addr.s_addr); struct sockaddr_in oldaddr; int s = splimp(), flags = RTF_UP, error = 0; @@ -801,6 +812,7 @@ static int in_addprefix(struct in_ifaddr *target, int flags) { + INIT_VNET_INET(curvnet); struct in_ifaddr *ia; struct in_addr prefix, mask, p, m; int error; @@ -814,7 +826,7 @@ prefix.s_addr &= mask.s_addr; } - TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) { + TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (rtinitflags(ia)) { p = ia->ia_addr.sin_addr; @@ -835,7 +847,7 @@ * interface address, we are done here. 
*/ if (ia->ia_flags & IFA_ROUTE) { - if (sameprefixcarponly && + if (V_sameprefixcarponly && target->ia_ifp->if_type != IFT_CARP && ia->ia_ifp->if_type != IFT_CARP) return (EEXIST); @@ -861,6 +873,7 @@ static int in_scrubprefix(struct in_ifaddr *target) { + INIT_VNET_INET(curvnet); struct in_ifaddr *ia; struct in_addr prefix, mask, p; int error; @@ -876,7 +889,7 @@ prefix.s_addr &= mask.s_addr; } - TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) { + TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (rtinitflags(ia)) p = ia->ia_dstaddr.sin_addr; else { @@ -967,6 +980,8 @@ static void in_purgemaddrs(struct ifnet *ifp) { + INIT_VNET_INET(ifp->if_vnet); + struct in_multi *inm; struct in_multi *oinm; @@ -975,7 +990,7 @@ #endif IFF_LOCKGIANT(ifp); IN_MULTI_LOCK(); - LIST_FOREACH_SAFE(inm, &in_multihead, inm_link, oinm) { + LIST_FOREACH_SAFE(inm, &V_in_multihead, inm_link, oinm) { if (inm->inm_ifp == ifp) in_delmulti_locked(inm); } @@ -989,8 +1004,9 @@ void in_ifdetach(struct ifnet *ifp) { + INIT_VNET_INET(ifp->if_vnet); - in_pcbpurgeif0(&ripcbinfo, ifp); - in_pcbpurgeif0(&udbinfo, ifp); + in_pcbpurgeif0(&V_ripcbinfo, ifp); + in_pcbpurgeif0(&V_udbinfo, ifp); in_purgemaddrs(ifp); } --- /u/marko/p4/head/src/sys/netinet/in_gif.c 2007-10-16 13:53:37.000000000 +0200 +++ src/sys/netinet/in_gif.c 2007-10-22 18:06:40.000000000 +0200 @@ -35,6 +35,7 @@ #include "opt_mrouting.h" #include "opt_inet.h" #include "opt_inet6.h" +#include "opt_vimage.h" #include #include @@ -45,12 +46,13 @@ #include #include #include - #include +#include #include #include +#include #include #include #include @@ -85,13 +87,16 @@ .pr_usrreqs = &rip_usrreqs }; -static int ip_gif_ttl = GIF_TTL; -SYSCTL_INT(_net_inet_ip, IPCTL_GIF_TTL, gifttl, CTLFLAG_RW, - &ip_gif_ttl, 0, ""); +#ifndef VIMAGE +int ip_gif_ttl; +#endif +SYSCTL_V_INT(V_NET, vnet_gif, _net_inet_ip, IPCTL_GIF_TTL, gifttl, + CTLFLAG_RW, ip_gif_ttl, 0, ""); int in_gif_output(struct ifnet *ifp, int family, struct mbuf *m) { + INIT_VNET_GIF(ifp->if_vnet); 
struct gif_softc *sc = ifp->if_softc; struct sockaddr_in *dst = (struct sockaddr_in *)&sc->gif_ro.ro_dst; struct sockaddr_in *sin_src = (struct sockaddr_in *)sc->gif_psrc; @@ -176,7 +181,7 @@ } iphdr.ip_p = proto; /* version will be set in ip_output() */ - iphdr.ip_ttl = ip_gif_ttl; + iphdr.ip_ttl = V_ip_gif_ttl; iphdr.ip_len = m->m_pkthdr.len + sizeof(struct ip); ip_ecn_ingress((ifp->if_flags & IFF_LINK1) ? ECN_ALLOWED : ECN_NOCARE, &iphdr.ip_tos, &tos); @@ -239,6 +244,7 @@ void in_gif_input(struct mbuf *m, int off) { + INIT_VNET_INET(curvnet); struct ifnet *gifp = NULL; struct gif_softc *sc; struct ip *ip; @@ -252,14 +258,14 @@ sc = (struct gif_softc *)encap_getarg(m); if (sc == NULL) { m_freem(m); - ipstat.ips_nogif++; + V_ipstat.ips_nogif++; return; } gifp = GIF2IFP(sc); if (gifp == NULL || (gifp->if_flags & IFF_UP) == 0) { m_freem(m); - ipstat.ips_nogif++; + V_ipstat.ips_nogif++; return; } @@ -319,7 +325,7 @@ break; default: - ipstat.ips_nogif++; + V_ipstat.ips_nogif++; m_freem(m); return; } @@ -333,6 +339,7 @@ static int gif_validate4(const struct ip *ip, struct gif_softc *sc, struct ifnet *ifp) { + INIT_VNET_INET(curvnet); struct sockaddr_in *src, *dst; struct in_ifaddr *ia4; @@ -352,7 +359,7 @@ return 0; } /* reject packets with broadcast on source */ - TAILQ_FOREACH(ia4, &in_ifaddrhead, ia_link) { + TAILQ_FOREACH(ia4, &V_in_ifaddrhead, ia_link) { if ((ia4->ia_ifa.ifa_ifp->if_flags & IFF_BROADCAST) == 0) continue; if (ip->ip_src.s_addr == ia4->ia_broadaddr.sin_addr.s_addr) --- /u/marko/p4/head/src/sys/netinet/in_gif.h 2007-08-31 03:48:00.000000000 +0200 +++ src/sys/netinet/in_gif.h 2007-10-05 12:27:02.000000000 +0200 @@ -35,6 +35,9 @@ #define GIF_TTL 30 +#ifndef VIMAGE +extern int ip_gif_ttl; +#endif struct gif_softc; void in_gif_input(struct mbuf *, int); int in_gif_output(struct ifnet *, int, struct mbuf *); --- /u/marko/p4/head/src/sys/netinet/in_mcast.c 2007-11-07 23:37:16.000000000 +0100 +++ src/sys/netinet/in_mcast.c 2007-10-22 18:06:41.000000000 
+0200 @@ -39,6 +39,8 @@ #include __FBSDID("$FreeBSD: src/sys/netinet/in_mcast.c,v 1.3 2007/08/06 22:06:36 csjp Exp $"); +#include "opt_vimage.h" + #include #include #include @@ -48,11 +50,14 @@ #include #include #include +#include #include #include #include +#include +#include #include #include #include @@ -85,7 +90,9 @@ * ip_output() to send IGMP packets while holding the lock; this probably is * not quite desirable. */ +#ifndef VIMAGE struct in_multihead in_multihead; /* XXX BSS initialization */ +#endif struct mtx in_multi_mtx; MTX_SYSINIT(in_multi_mtx, &in_multi_mtx, "in_multi_mtx", MTX_DEF | MTX_RECURSE); @@ -312,6 +319,7 @@ struct in_multi * in_addmulti(struct in_addr *ap, struct ifnet *ifp) { + INIT_VNET_INET(ifp->if_vnet); struct in_multi *inm; inm = NULL; @@ -373,7 +381,7 @@ ninm->inm_ifma = ifma; ninm->inm_refcount = 1; ifma->ifma_protospec = ninm; - LIST_INSERT_HEAD(&in_multihead, ninm, inm_link); + LIST_INSERT_HEAD(&V_in_multihead, ninm, inm_link); igmp_joingroup(ninm); @@ -464,6 +472,8 @@ static int inp_change_source_filter(struct inpcb *inp, struct sockopt *sopt) { + INIT_VNET_NET(curvnet); + INIT_VNET_INET(curvnet); struct group_source_req gsr; sockunion_t *gsa, *ssa; struct ifnet *ifp; @@ -532,7 +542,7 @@ ssa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); - if (gsr.gsr_interface == 0 || if_index < gsr.gsr_interface) + if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(gsr.gsr_interface); @@ -753,6 +763,7 @@ static int inp_get_source_filters(struct inpcb *inp, struct sockopt *sopt) { + INIT_VNET_NET(curvnet); struct __msfilterreq msfr; sockunion_t *gsa; struct ifnet *ifp; @@ -776,7 +787,7 @@ if (error) return (error); - if (msfr.msfr_ifindex == 0 || if_index < msfr.msfr_ifindex) + if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex) return (EINVAL); ifp = ifnet_byindex(msfr.msfr_ifindex); @@ -850,6 +861,7 @@ int inp_getmoptions(struct inpcb *inp, struct sockopt *sopt) { + 
INIT_VNET_INET(curvnet); struct ip_mreqn mreqn; struct ip_moptions *imo; struct ifnet *ifp; @@ -956,6 +968,8 @@ static int inp_join_group(struct inpcb *inp, struct sockopt *sopt) { + INIT_VNET_NET(curvnet); + INIT_VNET_INET(curvnet); struct group_source_req gsr; sockunion_t *gsa, *ssa; struct ifnet *ifp; @@ -1036,7 +1050,7 @@ } else { struct in_ifaddr *ia; struct ifnet *mfp = NULL; - TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) { + TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { mfp = ia->ia_ifp; if (!(mfp->if_flags & IFF_LOOPBACK) && (mfp->if_flags & IFF_MULTICAST)) { @@ -1089,7 +1103,7 @@ /* * Obtain the ifp. */ - if (gsr.gsr_interface == 0 || if_index < gsr.gsr_interface) + if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(gsr.gsr_interface); @@ -1211,6 +1225,8 @@ static int inp_leave_group(struct inpcb *inp, struct sockopt *sopt) { + INIT_VNET_NET(curvnet); + INIT_VNET_INET(curvnet); struct group_source_req gsr; struct ip_mreq_source mreqs; sockunion_t *gsa, *ssa; @@ -1298,7 +1314,7 @@ return (EINVAL); } - if (gsr.gsr_interface == 0 || if_index < gsr.gsr_interface) + if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(gsr.gsr_interface); @@ -1399,6 +1415,7 @@ static int inp_set_multicast_if(struct inpcb *inp, struct sockopt *sopt) { + INIT_VNET_NET(curvnet); struct in_addr addr; struct ip_mreqn mreqn; struct ifnet *ifp; @@ -1415,7 +1432,7 @@ if (error) return (error); - if (mreqn.imr_ifindex < 0 || if_index < mreqn.imr_ifindex) + if (mreqn.imr_ifindex < 0 || V_if_index < mreqn.imr_ifindex) return (EINVAL); if (mreqn.imr_ifindex == 0) { @@ -1467,6 +1484,7 @@ static int inp_set_source_filters(struct inpcb *inp, struct sockopt *sopt) { + INIT_VNET_NET(curvnet); struct __msfilterreq msfr; sockunion_t *gsa; struct ifnet *ifp; @@ -1496,7 +1514,7 @@ gsa->sin.sin_port = 0; /* ignore port */ - if (msfr.msfr_ifindex == 0 || if_index < msfr.msfr_ifindex) + if 
(msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex) return (EADDRNOTAVAIL); ifp = ifnet_byindex(msfr.msfr_ifindex); @@ -1829,12 +1847,14 @@ static struct ifnet * ip_multicast_if(struct in_addr *a) { + INIT_VNET_NET(curvnet); + INIT_VNET_INET(curvnet); int ifindex; struct ifnet *ifp; if (ntohl(a->s_addr) >> 24 == 0) { ifindex = ntohl(a->s_addr) & 0xffffff; - if (ifindex < 0 || if_index < ifindex) + if (ifindex < 0 || V_if_index < ifindex) return NULL; ifp = ifnet_byindex(ifindex); } else --- /u/marko/p4/head/src/sys/netinet/in_pcb.c 2007-12-27 19:32:24.000000000 +0100 +++ src/sys/netinet/in_pcb.c 2008-01-14 19:23:52.000000000 +0100 @@ -38,6 +38,7 @@ #include "opt_ipsec.h" #include "opt_inet6.h" #include "opt_mac.h" +#include "opt_vimage.h" #include #include @@ -52,6 +53,7 @@ #include #include #include +#include #ifdef DDB #include @@ -59,10 +61,12 @@ #include +#include #include #include #include +#include #include #include #include @@ -74,7 +78,7 @@ #include #include #endif /* INET6 */ - +#include #ifdef IPSEC #include @@ -83,50 +87,60 @@ #include +#ifndef VIMAGE /* * These configure the range of local port addresses assigned to * "unspecified" outgoing connections/packets/whatever. */ -int ipport_lowfirstauto = IPPORT_RESERVED - 1; /* 1023 */ -int ipport_lowlastauto = IPPORT_RESERVEDSTART; /* 600 */ -int ipport_firstauto = IPPORT_HIFIRSTAUTO; /* 49152 */ -int ipport_lastauto = IPPORT_HILASTAUTO; /* 65535 */ -int ipport_hifirstauto = IPPORT_HIFIRSTAUTO; /* 49152 */ -int ipport_hilastauto = IPPORT_HILASTAUTO; /* 65535 */ +int ipport_lowfirstauto; +int ipport_lowlastauto; +int ipport_firstauto; +int ipport_lastauto; +int ipport_hifirstauto; +int ipport_hilastauto; /* * Reserved ports accessible only to root. There are significant * security considerations that must be accounted for when changing these, * but the security benefits can be great. Please be careful. 
*/ -int ipport_reservedhigh = IPPORT_RESERVED - 1; /* 1023 */ -int ipport_reservedlow = 0; +int ipport_reservedhigh; +int ipport_reservedlow; /* Variables dealing with random ephemeral port allocation. */ -int ipport_randomized = 1; /* user controlled via sysctl */ -int ipport_randomcps = 10; /* user controlled via sysctl */ -int ipport_randomtime = 45; /* user controlled via sysctl */ -int ipport_stoprandom = 0; /* toggled by ipport_tick */ +int ipport_randomized; +int ipport_randomcps; +int ipport_randomtime; +int ipport_stoprandom; int ipport_tcpallocs; int ipport_tcplastcount; +#endif #define RANGECHK(var, min, max) \ if ((var) < (min)) { (var) = (min); } \ else if ((var) > (max)) { (var) = (max); } static int +#ifndef VIMAGE sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS) +#else +sysctl_net_ipport_check(SYSCTL_HANDLER_V_ARGS) +#endif { +#ifdef VIMAGE + INIT_VNET_INET(curvnet); + SYSCTL_RESOLVE_V_ARG1(); +#endif int error; - error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); + error = sysctl_handle_int(oidp, arg1, arg2, req); if (error == 0) { - RANGECHK(ipport_lowfirstauto, 1, IPPORT_RESERVED - 1); - RANGECHK(ipport_lowlastauto, 1, IPPORT_RESERVED - 1); - RANGECHK(ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX); - RANGECHK(ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX); - RANGECHK(ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX); - RANGECHK(ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX); + RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1); + RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1); + RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX); + RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX); + RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX); + RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX); } return (error); } @@ -135,30 +149,37 @@ SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0, "IP Ports"); -SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, CTLTYPE_INT|CTLFLAG_RW, - 
&ipport_lowfirstauto, 0, &sysctl_net_ipport_check, "I", ""); -SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, CTLTYPE_INT|CTLFLAG_RW, - &ipport_lowlastauto, 0, &sysctl_net_ipport_check, "I", ""); -SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, CTLTYPE_INT|CTLFLAG_RW, - &ipport_firstauto, 0, &sysctl_net_ipport_check, "I", ""); -SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, CTLTYPE_INT|CTLFLAG_RW, - &ipport_lastauto, 0, &sysctl_net_ipport_check, "I", ""); -SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, CTLTYPE_INT|CTLFLAG_RW, - &ipport_hifirstauto, 0, &sysctl_net_ipport_check, "I", ""); -SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, CTLTYPE_INT|CTLFLAG_RW, - &ipport_hilastauto, 0, &sysctl_net_ipport_check, "I", ""); -SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh, - CTLFLAG_RW|CTLFLAG_SECURE, &ipport_reservedhigh, 0, ""); -SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow, - CTLFLAG_RW|CTLFLAG_SECURE, &ipport_reservedlow, 0, ""); -SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized, CTLFLAG_RW, - &ipport_randomized, 0, "Enable random port allocation"); -SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomcps, CTLFLAG_RW, - &ipport_randomcps, 0, "Maximum number of random port " - "allocations before switching to a sequental one"); -SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtime, CTLFLAG_RW, - &ipport_randomtime, 0, "Minimum time to keep sequental port " - "allocation before switching to a random one"); +SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, + lowfirst, CTLTYPE_INT|CTLFLAG_RW, ipport_lowfirstauto, 0, + &sysctl_net_ipport_check, "I", ""); +SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, + lowlast, CTLTYPE_INT|CTLFLAG_RW, ipport_lowlastauto, 0, + &sysctl_net_ipport_check, "I", ""); +SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, + first, CTLTYPE_INT|CTLFLAG_RW, ipport_firstauto, 0, + &sysctl_net_ipport_check, "I", ""); +SYSCTL_V_PROC(V_NET, 
vnet_inet, _net_inet_ip_portrange, OID_AUTO, + last, CTLTYPE_INT|CTLFLAG_RW, ipport_lastauto, 0, + &sysctl_net_ipport_check, "I", ""); +SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, + hifirst, CTLTYPE_INT|CTLFLAG_RW, ipport_hifirstauto, 0, + &sysctl_net_ipport_check, "I", ""); +SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, + hilast, CTLTYPE_INT|CTLFLAG_RW, ipport_hilastauto, 0, + &sysctl_net_ipport_check, "I", ""); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, + reservedhigh, CTLFLAG_RW|CTLFLAG_SECURE, ipport_reservedhigh, 0, ""); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, reservedlow, + CTLFLAG_RW|CTLFLAG_SECURE, ipport_reservedlow, 0, ""); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, randomized, + CTLFLAG_RW, ipport_randomized, 0, "Enable random port allocation"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, randomcps, + CTLFLAG_RW, ipport_randomcps, 0, "Maximum number of random port " + "allocations before switching to a sequental one"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, randomtime, + CTLFLAG_RW, ipport_randomtime, 0, + "Minimum time to keep sequental port " + "allocation before switching to a random one"); /* * in_pcb.c: manage the Protocol Control Blocks. 
@@ -175,6 +196,9 @@ int in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) { +#ifdef INET6 + INIT_VNET_INET6(curvnet); +#endif struct inpcb *inp; int error; @@ -207,7 +231,7 @@ #ifdef INET6 if (INP_SOCKAF(so) == AF_INET6) { inp->inp_vflag |= INP_IPV6PROTO; - if (ip6_v6only) + if (V_ip6_v6only) inp->inp_flags |= IN6P_IPV6_V6ONLY; } #endif @@ -215,7 +239,7 @@ pcbinfo->ipi_count++; so->so_pcb = (caddr_t)inp; #ifdef INET6 - if (ip6_auto_flowlabel) + if (V_ip6_auto_flowlabel) inp->inp_flags |= IN6P_AUTOFLOWLABEL; #endif INP_LOCK(inp); @@ -268,6 +292,7 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, u_short *lportp, struct ucred *cred) { + INIT_VNET_INET(inp->inp_vnet); struct socket *so = inp->inp_socket; unsigned short *lastport; struct sockaddr_in *sin; @@ -281,7 +306,7 @@ INP_INFO_WLOCK_ASSERT(pcbinfo); INP_LOCK_ASSERT(inp); - if (TAILQ_EMPTY(&in_ifaddrhead)) /* XXX broken! */ + if (TAILQ_EMPTY(&V_in_ifaddrhead)) /* XXX broken! */ return (EADDRNOTAVAIL); laddr.s_addr = *laddrp; if (nam != NULL && laddr.s_addr != INADDR_ANY) @@ -332,8 +357,8 @@ struct tcptw *tw; /* GROSS */ - if (ntohs(lport) <= ipport_reservedhigh && - ntohs(lport) >= ipport_reservedlow && + if (ntohs(lport) <= V_ipport_reservedhigh && + ntohs(lport) >= V_ipport_reservedlow && priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0)) return (EACCES); @@ -401,20 +426,20 @@ return (EINVAL); if (inp->inp_flags & INP_HIGHPORT) { - first = ipport_hifirstauto; /* sysctl */ - last = ipport_hilastauto; + first = V_ipport_hifirstauto; /* sysctl */ + last = V_ipport_hilastauto; lastport = &pcbinfo->ipi_lasthi; } else if (inp->inp_flags & INP_LOWPORT) { error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0); if (error) return error; - first = ipport_lowfirstauto; /* 1023 */ - last = ipport_lowlastauto; /* 600 */ + first = V_ipport_lowfirstauto; /* 1023 */ + last = V_ipport_lowlastauto; /* 600 */ lastport = &pcbinfo->ipi_lastlow; } else { - first = ipport_firstauto; /* 
sysctl */ - last = ipport_lastauto; + first = V_ipport_firstauto; /* sysctl */ + last = V_ipport_lastauto; lastport = &pcbinfo->ipi_lastport; } /* @@ -423,8 +448,8 @@ * use random port allocation only if the user allows it AND * ipport_tick() allows it. */ - if (ipport_randomized && - (!ipport_stoprandom || pcbinfo == &udbinfo)) + if (V_ipport_randomized && + (!V_ipport_stoprandom || pcbinfo == &V_udbinfo)) dorandom = 1; else dorandom = 0; @@ -435,8 +460,8 @@ if (first == last) dorandom = 0; /* Make sure to not include UDP packets in the count. */ - if (pcbinfo != &udbinfo) - ipport_tcpallocs++; + if (pcbinfo != &V_udbinfo) + V_ipport_tcpallocs++; /* * Simple check to ensure all ports are not used up causing * a deadlock here. @@ -556,6 +581,7 @@ in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp, struct inpcb **oinpp, struct ucred *cred) { + INIT_VNET_INET(inp->inp_vnet); struct sockaddr_in *sin = (struct sockaddr_in *)nam; struct in_ifaddr *ia; struct sockaddr_in sa; @@ -591,7 +617,7 @@ if (error) return (error); } - if (!TAILQ_EMPTY(&in_ifaddrhead)) { + if (!TAILQ_EMPTY(&V_in_ifaddrhead)) { /* * If the destination address is INADDR_ANY, * use the primary local address. @@ -600,12 +626,12 @@ * choose the broadcast address for that interface. 
*/ if (faddr.s_addr == INADDR_ANY) - faddr = IA_SIN(TAILQ_FIRST(&in_ifaddrhead))->sin_addr; + faddr = IA_SIN(TAILQ_FIRST(&V_in_ifaddrhead))->sin_addr; else if (faddr.s_addr == (u_long)INADDR_BROADCAST && - (TAILQ_FIRST(&in_ifaddrhead)->ia_ifp->if_flags & + (TAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags & IFF_BROADCAST)) faddr = satosin(&TAILQ_FIRST( - &in_ifaddrhead)->ia_broadaddr)->sin_addr; + &V_in_ifaddrhead)->ia_broadaddr)->sin_addr; } if (laddr.s_addr == INADDR_ANY) { ia = (struct in_ifaddr *)0; @@ -650,7 +676,7 @@ imo = inp->inp_moptions; if (imo->imo_multicast_ifp != NULL) { ifp = imo->imo_multicast_ifp; - TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) + TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) if (ia->ia_ifp == ifp) break; if (ia == 0) @@ -1213,13 +1239,15 @@ void ipport_tick(void *xtp) { - - if (ipport_tcpallocs <= ipport_tcplastcount + ipport_randomcps) { - if (ipport_stoprandom > 0) - ipport_stoprandom--; + VNET_ITERLOOP_BEGIN(); + INIT_VNET_INET(curvnet); + if (V_ipport_tcpallocs <= V_ipport_tcplastcount + V_ipport_randomcps) { + if (V_ipport_stoprandom > 0) + V_ipport_stoprandom--; } else - ipport_stoprandom = ipport_randomtime; - ipport_tcplastcount = ipport_tcpallocs; + V_ipport_stoprandom = V_ipport_randomtime; + V_ipport_tcplastcount = V_ipport_tcpallocs; + VNET_ITERLOOP_END(); callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL); } --- /u/marko/p4/head/src/sys/netinet/in_pcb.h 2007-12-27 19:32:25.000000000 +0100 +++ src/sys/netinet/in_pcb.h 2008-01-14 19:23:52.000000000 +0100 @@ -189,6 +189,8 @@ #define in6p_lport inp_lport /* for KAME src sync over BSD*'s */ #define in6p_fport inp_fport /* for KAME src sync over BSD*'s */ #define in6p_ppcb inp_ppcb /* for KAME src sync over BSD*'s */ + +#define inp_vnet inp_pcbinfo->ipi_vnet }; /* * The range of the generation count, as used in this implementation, is 9e19. 
@@ -270,7 +272,8 @@ * vimage 1 * general use 1 */ - void *ipi_pspare[2]; + struct vnet *ipi_vnet; + void *ipi_pspare[1]; }; #define INP_LOCK_INIT(inp, d, t) \ @@ -355,6 +358,7 @@ #define INP_CHECK_SOCKAF(so, af) (INP_SOCKAF(so) == af) #ifdef _KERNEL +#ifndef VIMAGE extern int ipport_reservedhigh; extern int ipport_reservedlow; extern int ipport_lowfirstauto; @@ -363,6 +367,11 @@ extern int ipport_lastauto; extern int ipport_hifirstauto; extern int ipport_hilastauto; +extern int ipport_randomized; +extern int ipport_randomcps; +extern int ipport_randomtime; +extern int ipport_stoprandom; +#endif extern struct callout ipport_tick_callout; void in_pcbpurgeif0(struct inpcbinfo *, struct ifnet *); --- /u/marko/p4/head/src/sys/netinet/in_proto.c 2007-10-16 13:53:37.000000000 +0200 +++ src/sys/netinet/in_proto.c 2007-10-22 18:06:41.000000000 +0200 @@ -39,6 +39,7 @@ #include "opt_pf.h" #include "opt_carp.h" #include "opt_sctp.h" +#include "opt_vimage.h" #include #include @@ -120,6 +121,9 @@ .pr_ctlinput = udp_ctlinput, .pr_ctloutput = ip_ctloutput, .pr_init = udp_init, +#ifdef VIMAGE + .pr_destroy = udp_destroy, +#endif .pr_usrreqs = &udp_usrreqs }, { @@ -131,6 +135,9 @@ .pr_ctlinput = tcp_ctlinput, .pr_ctloutput = tcp_ctloutput, .pr_init = tcp_init, +#ifdef VIMAGE + .pr_destroy = tcp_destroy, +#endif .pr_slowtimo = tcp_slowtimo, .pr_drain = tcp_drain, .pr_usrreqs = &tcp_usrreqs @@ -341,11 +348,15 @@ .pr_input = rip_input, .pr_ctloutput = rip_ctloutput, .pr_init = rip_init, +#ifdef VIMAGE + .pr_destroy = rip_destroy, +#endif .pr_usrreqs = &rip_usrreqs }, }; extern int in_inithead(void **, int); +extern int in_detachhead(void **, int); struct domain inetdomain = { .dom_family = AF_INET, @@ -353,6 +364,9 @@ .dom_protosw = inetsw, .dom_protoswNPROTOSW = &inetsw[sizeof(inetsw)/sizeof(inetsw[0])], .dom_rtattach = in_inithead, +#ifdef VIMAGE + .dom_rtdetach = in_detachhead, +#endif .dom_rtoffset = 32, .dom_maxrtkey = sizeof(struct sockaddr_in) }; --- 
/u/marko/p4/head/src/sys/netinet/in_rmx.c 2008-02-27 18:29:07.000000000 +0100 +++ src/sys/netinet/in_rmx.c 2008-02-27 17:59:33.000000000 +0100 @@ -43,6 +43,8 @@ #include __FBSDID("$FreeBSD: src/sys/netinet/in_rmx.c,v 1.58 2008/02/07 11:26:52 glebius Exp $"); +#include "opt_vimage.h" + #include #include #include @@ -51,14 +53,20 @@ #include #include #include +#include +#include #include #include +#include #include #include #include -extern int in_inithead(void **head, int off); +int in_inithead(void **head, int off); +#ifdef VIMAGE +int in_detachhead(void **head, int off); +#endif #define RTPRF_OURS RTF_PROTO3 /* set on routes we manage */ @@ -149,18 +157,23 @@ return rn; } -static int rtq_reallyold = 60*60; /* one hour is "really old" */ -SYSCTL_INT(_net_inet_ip, IPCTL_RTEXPIRE, rtexpire, CTLFLAG_RW, - &rtq_reallyold, 0, "Default expiration time on dynamically learned routes"); +#ifndef VIMAGE +static int rtq_reallyold; +static int rtq_minreallyold; +static int rtq_toomany; +#endif -static int rtq_minreallyold = 10; /* never automatically crank down to less */ -SYSCTL_INT(_net_inet_ip, IPCTL_RTMINEXPIRE, rtminexpire, CTLFLAG_RW, - &rtq_minreallyold, 0, +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_RTEXPIRE, rtexpire, + CTLFLAG_RW, rtq_reallyold, 0, + "Default expiration time on dynamically learned routes"); + +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_RTMINEXPIRE, + rtminexpire, CTLFLAG_RW, rtq_minreallyold, 0, "Minimum time to attempt to hold onto dynamically learned routes"); -static int rtq_toomany = 128; /* 128 cached routes is "too many" */ -SYSCTL_INT(_net_inet_ip, IPCTL_RTMAXCACHE, rtmaxcache, CTLFLAG_RW, - &rtq_toomany, 0, "Upper limit on dynamically learned routes"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_RTMAXCACHE, + rtmaxcache, CTLFLAG_RW, rtq_toomany, 0, + "Upper limit on dynamically learned routes"); /* * On last reference drop, mark the route as belong to us so that it can be @@ -169,6 +182,7 @@ static void 
in_clsroute(struct radix_node *rn, struct radix_node_head *head) { + INIT_VNET_INET(curvnet); struct rtentry *rt = (struct rtentry *)rn; RT_LOCK_ASSERT(rt); @@ -189,9 +203,9 @@ * If rtq_reallyold is 0, just delete the route without * waiting for a timeout cycle to kill it. */ - if (rtq_reallyold != 0) { + if (V_rtq_reallyold != 0) { rt->rt_flags |= RTPRF_OURS; - rt->rt_rmx.rmx_expire = time_uptime + rtq_reallyold; + rt->rt_rmx.rmx_expire = time_uptime + V_rtq_reallyold; } else { rtexpunge(rt); } @@ -214,6 +228,7 @@ static int in_rtqkill(struct radix_node *rn, void *rock) { + INIT_VNET_INET(curvnet); struct rtqk_arg *ap = rock; struct rtentry *rt = (struct rtentry *)rn; int err; @@ -237,9 +252,9 @@ } else { if (ap->updating && (rt->rt_rmx.rmx_expire - time_uptime > - rtq_reallyold)) { + V_rtq_reallyold)) { rt->rt_rmx.rmx_expire = - time_uptime + rtq_reallyold; + time_uptime + V_rtq_reallyold; } ap->nextstop = lmin(ap->nextstop, rt->rt_rmx.rmx_expire); @@ -250,20 +265,25 @@ } #define RTQ_TIMEOUT 60*10 /* run no less than once every ten minutes */ -static int rtq_timeout = RTQ_TIMEOUT; +#ifndef VIMAGE +static int rtq_timeout; static struct callout rtq_timer; +#endif static void in_rtqtimo(void *rock) { - struct radix_node_head *rnh = rock; + CURVNET_SET_QUIET((struct vnet *) rock); + INIT_VNET_NET((struct vnet *) rock); + INIT_VNET_INET((struct vnet *) rock); + struct radix_node_head *rnh = V_rt_tables[AF_INET]; struct rtqk_arg arg; struct timeval atv; static time_t last_adjusted_timeout = 0; arg.found = arg.killed = 0; arg.rnh = rnh; - arg.nextstop = time_uptime + rtq_timeout; + arg.nextstop = time_uptime + V_rtq_timeout; arg.draining = arg.updating = 0; RADIX_NODE_HEAD_LOCK(rnh); rnh->rnh_walktree(rnh, in_rtqkill, &arg); @@ -277,18 +297,18 @@ * than once in rtq_timeout seconds, to keep from cranking down too * hard. 
*/ - if ((arg.found - arg.killed > rtq_toomany) && - (time_uptime - last_adjusted_timeout >= rtq_timeout) && - rtq_reallyold > rtq_minreallyold) { - rtq_reallyold = 2 * rtq_reallyold / 3; - if (rtq_reallyold < rtq_minreallyold) { - rtq_reallyold = rtq_minreallyold; + if ((arg.found - arg.killed > V_rtq_toomany) && + (time_uptime - last_adjusted_timeout >= V_rtq_timeout) && + V_rtq_reallyold > V_rtq_minreallyold) { + V_rtq_reallyold = 2 * V_rtq_reallyold / 3; + if (V_rtq_reallyold < V_rtq_minreallyold) { + V_rtq_reallyold = V_rtq_minreallyold; } last_adjusted_timeout = time_uptime; #ifdef DIAGNOSTIC log(LOG_DEBUG, "in_rtqtimo: adjusted rtq_reallyold to %d\n", - rtq_reallyold); + V_rtq_reallyold); #endif arg.found = arg.killed = 0; arg.updating = 1; @@ -299,13 +319,16 @@ atv.tv_usec = 0; atv.tv_sec = arg.nextstop - time_uptime; - callout_reset(&rtq_timer, tvtohz(&atv), in_rtqtimo, rock); + callout_reset(&V_rtq_timer, tvtohz(&atv), in_rtqtimo, rock); + CURVNET_RESTORE(); } void in_rtqdrain(void) { - struct radix_node_head *rnh = rt_tables[AF_INET]; + VNET_ITERLOOP_BEGIN(); + INIT_VNET_NET(vnet_iter); + struct radix_node_head *rnh = V_rt_tables[AF_INET]; struct rtqk_arg arg; arg.found = arg.killed = 0; @@ -316,6 +339,7 @@ RADIX_NODE_HEAD_LOCK(rnh); rnh->rnh_walktree(rnh, in_rtqkill, &arg); RADIX_NODE_HEAD_UNLOCK(rnh); + VNET_ITERLOOP_END(); } /* @@ -324,23 +348,40 @@ int in_inithead(void **head, int off) { + INIT_VNET_NET(curvnet); + INIT_VNET_INET(curvnet); struct radix_node_head *rnh; if (!rn_inithead(head, off)) return 0; - if (head != (void **)&rt_tables[AF_INET]) /* BOGUS! */ + if (head != (void **)&V_rt_tables[AF_INET]) /* BOGUS! 
*/ return 1; /* only do this for the real routing table */ + V_rtq_reallyold = 60*60; /* one hour is "really old" */ + V_rtq_minreallyold = 10; /* never automatically crank down to less */ + V_rtq_toomany = 128; /* 128 cached routes is "too many" */ + V_rtq_timeout = RTQ_TIMEOUT; rnh = *head; rnh->rnh_addaddr = in_addroute; rnh->rnh_matchaddr = in_matroute; rnh->rnh_close = in_clsroute; - callout_init(&rtq_timer, CALLOUT_MPSAFE); - in_rtqtimo(rnh); /* kick off timeout first time */ + callout_init(&V_rtq_timer, CALLOUT_MPSAFE); + in_rtqtimo(curvnet); /* kick off timeout first time */ return 1; } +#ifdef VIMAGE +int +in_detachhead(void **head, int off) +{ + INIT_VNET_INET(curvnet); + + callout_drain(&V_rtq_timer); + return 1; +} +#endif + /* * This zaps old routes when the interface goes down or interface * address is deleted. In the latter case, it deletes static routes @@ -382,13 +423,14 @@ int in_ifadown(struct ifaddr *ifa, int delete) { + INIT_VNET_NET(curvnet); struct in_ifadown_arg arg; struct radix_node_head *rnh; if (ifa->ifa_addr->sa_family != AF_INET) return 1; - rnh = rt_tables[AF_INET]; + rnh = V_rt_tables[AF_INET]; arg.ifa = ifa; arg.del = delete; RADIX_NODE_HEAD_LOCK(rnh); --- /u/marko/p4/head/src/sys/netinet/in_var.h 2007-08-31 03:48:00.000000000 +0200 +++ src/sys/netinet/in_var.h 2007-10-05 12:27:02.000000000 +0200 @@ -84,20 +84,33 @@ /* * Hash table for IP addresses. 
*/ -extern LIST_HEAD(in_ifaddrhashhead, in_ifaddr) *in_ifaddrhashtbl; -extern TAILQ_HEAD(in_ifaddrhead, in_ifaddr) in_ifaddrhead; +LIST_HEAD(in_ifaddrhashhead, in_ifaddr); +TAILQ_HEAD(in_ifaddrhead, in_ifaddr); +#ifndef VIMAGE +extern struct in_ifaddrhashhead *in_ifaddrhashtbl; +extern struct in_ifaddrhead in_ifaddrhead; extern u_long in_ifaddrhmask; /* mask for hash table */ +#endif -#define INADDR_NHASH_LOG2 9 -#define INADDR_NHASH (1 << INADDR_NHASH_LOG2) -#define INADDR_HASHVAL(x) fnv_32_buf((&(x)), sizeof(x), FNV1_32_INIT) -#define INADDR_HASH(x) \ - (&in_ifaddrhashtbl[INADDR_HASHVAL(x) & in_ifaddrhmask]) +/* + * IP datagram reassembly. + */ +#define IPREASS_NHASH_LOG2 6 +#define IPREASS_NHASH (1 << IPREASS_NHASH_LOG2) +#define IPREASS_HMASK (IPREASS_NHASH - 1) +#define IPREASS_HASH(x,y) \ + (((((x) & 0xF) | ((((x) >> 8) & 0xF) << 4)) ^ (y)) & IPREASS_HMASK) /* * Macro for finding the internet address structure (in_ifaddr) * corresponding to one of our IP addresses (in_addr). */ +#define INADDR_NHASH_LOG2 9 +#define INADDR_NHASH (1 << INADDR_NHASH_LOG2) +#define INADDR_HASHVAL(x) fnv_32_buf((&(x)), sizeof(x), FNV1_32_INIT) +#define INADDR_HASH(x) \ + (&V_in_ifaddrhashtbl[INADDR_HASHVAL(x) & V_in_ifaddrhmask]) + #define INADDR_TO_IFADDR(addr, ia) \ /* struct in_addr addr; */ \ /* struct in_ifaddr *ia; */ \ @@ -130,7 +143,7 @@ /* struct ifnet *ifp; */ \ /* struct in_ifaddr *ia; */ \ { \ - for ((ia) = TAILQ_FIRST(&in_ifaddrhead); \ + for ((ia) = TAILQ_FIRST(&V_in_ifaddrhead); \ (ia) != NULL && (ia)->ia_ifp != (ifp); \ (ia) = TAILQ_NEXT((ia), ia_link)) \ continue; \ @@ -218,7 +231,11 @@ SYSCTL_DECL(_net_inet_raw); #endif -extern LIST_HEAD(in_multihead, in_multi) in_multihead; +LIST_HEAD(in_multihead, in_multi); + +#ifndef VIMAGE +extern struct in_multihead in_multihead; +#endif /* * Lock macros for IPv4 layer multicast address lists. 
IPv4 lock goes @@ -283,7 +300,7 @@ /* struct in_multi *inm; */ \ do { \ IN_MULTI_LOCK_ASSERT(); \ - (step).i_inm = LIST_FIRST(&in_multihead); \ + (step).i_inm = LIST_FIRST(&V_in_multihead); \ IN_NEXT_MULTI((step), (inm)); \ } while(0) --- /u/marko/p4/head/src/sys/netinet/ip6.h 2007-08-31 03:48:00.000000000 +0200 +++ src/sys/netinet/ip6.h 2007-10-05 12:27:02.000000000 +0200 @@ -275,24 +275,24 @@ if (((m)->m_flags & M_LOOP) && \ ((m)->m_len < (off) + (hlen)) && \ (((m) = m_pullup((m), (off) + (hlen))) == NULL)) { \ - ip6stat.ip6s_exthdrtoolong++; \ + V_ip6stat.ip6s_exthdrtoolong++; \ return ret; \ } else if ((m)->m_flags & M_EXT) { \ if ((m)->m_len < (off) + (hlen)) { \ - ip6stat.ip6s_exthdrtoolong++; \ + V_ip6stat.ip6s_exthdrtoolong++; \ m_freem(m); \ return ret; \ } \ } else { \ if ((m)->m_len < (off) + (hlen)) { \ - ip6stat.ip6s_exthdrtoolong++; \ + V_ip6stat.ip6s_exthdrtoolong++; \ m_freem(m); \ return ret; \ } \ } \ } else { \ if ((m)->m_len < (off) + (hlen)) { \ - ip6stat.ip6s_tooshort++; \ + V_ip6stat.ip6s_tooshort++; \ in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated); \ m_freem(m); \ return ret; \ --- /u/marko/p4/head/src/sys/netinet/ip_fw.h 2008-01-28 23:53:52.000000000 +0100 +++ src/sys/netinet/ip_fw.h 2008-02-27 11:49:06.000000000 +0100 @@ -28,6 +28,9 @@ #ifndef _IPFW2_H #define _IPFW2_H +#include +#include + /* * The kernel representation of ipfw rules is made of a list of * 'instructions' (for all practical purposes equivalent to BPF @@ -546,6 +549,34 @@ */ #ifdef _KERNEL +/* + * Data structure to cache our ucred related + * information. This structure only gets used if + * the user specified UID/GID based constraints in + * a firewall rule. 
+ */ +struct ip_fw_ugid { + gid_t fw_groups[NGROUPS]; + int fw_ngroups; + uid_t fw_uid; + int fw_prid; +}; + +#define IPFW_TABLES_MAX 128 +struct ip_fw_chain { + struct ip_fw *rules; /* list of rules */ + struct ip_fw *reap; /* list of rules to reap */ + LIST_HEAD(, cfg_nat) nat; /* list of nat entries */ + struct radix_node_head *tables[IPFW_TABLES_MAX]; + struct rwlock rwmtx; +}; + +struct table_entry { + struct radix_node rn[2]; + struct sockaddr_in addr, mask; + u_int32_t value; +}; + /* Return values from ipfw_chk() */ enum { IP_FW_PASS = 0, @@ -615,16 +646,103 @@ typedef int ip_fw_ctl_t(struct sockopt *); extern ip_fw_ctl_t *ip_fw_ctl_ptr; + +#ifndef VIMAGE extern int fw_one_pass; extern int fw_enable; #ifdef INET6 extern int fw6_enable; #endif +#endif /* For kernel ipfw_ether and ipfw_bridge. */ typedef int ip_fw_chk_t(struct ip_fw_args *args); extern ip_fw_chk_t *ip_fw_chk_ptr; #define IPFW_LOADED (ip_fw_chk_ptr != NULL) +/* + * Stack virtualization support. + */ +#ifdef VIMAGE +struct vnet_ipfw { + int _fw_one_pass; + int _fw_enable; + int _fw6_enable; + + u_int32_t _set_disable; + int _fw_deny_unknown_exthdrs; + int _fw_verbose; + int _verbose_limit; + int _fw_debug; + int _autoinc_step; + + ipfw_dyn_rule **_ipfw_dyn_v; + struct ip_fw_chain _layer3_chain; + u_int32_t _dyn_buckets; + u_int32_t _curr_dyn_buckets; + + u_int32_t _dyn_ack_lifetime; + u_int32_t _dyn_syn_lifetime; + u_int32_t _dyn_fin_lifetime; + u_int32_t _dyn_rst_lifetime; + u_int32_t _dyn_udp_lifetime; + u_int32_t _dyn_short_lifetime; + u_int32_t _dyn_keepalive_interval; + u_int32_t _dyn_keepalive_period; + u_int32_t _dyn_keepalive; + u_int32_t _static_count; + u_int32_t _static_len; + u_int32_t _dyn_count; + u_int32_t _dyn_max; + + u_int64_t _norule_counter; + + struct callout _ipfw_timeout; +}; +#endif + +/* + * Symbol translation macros + */ + +#define INIT_VNET_IPFW(vnet) \ + INIT_FROM_VNET(vnet, VNET_MOD_IPFW, struct vnet_ipfw, vnet_ipfw) + +#define VNET_IPFW(sym) VSYM(vnet_ipfw, sym) + 
+#define V_fw_one_pass VNET_IPFW(fw_one_pass) +#define V_fw_enable VNET_IPFW(fw_enable) +#define V_fw6_enable VNET_IPFW(fw6_enable) + +#define V_set_disable VNET_IPFW(set_disable) +#define V_fw_deny_unknown_exthdrs VNET_IPFW(fw_deny_unknown_exthdrs) +#define V_fw_verbose VNET_IPFW(fw_verbose) +#define V_verbose_limit VNET_IPFW(verbose_limit) + +#define V_fw_debug VNET_IPFW(fw_debug) +#define V_autoinc_step VNET_IPFW(autoinc_step) + +#define V_ipfw_dyn_v VNET_IPFW(ipfw_dyn_v) +#define V_layer3_chain VNET_IPFW(layer3_chain) +#define V_dyn_buckets VNET_IPFW(dyn_buckets) +#define V_curr_dyn_buckets VNET_IPFW(curr_dyn_buckets) + +#define V_dyn_ack_lifetime VNET_IPFW(dyn_ack_lifetime) +#define V_dyn_syn_lifetime VNET_IPFW(dyn_syn_lifetime) +#define V_dyn_fin_lifetime VNET_IPFW(dyn_fin_lifetime) +#define V_dyn_rst_lifetime VNET_IPFW(dyn_rst_lifetime) +#define V_dyn_udp_lifetime VNET_IPFW(dyn_udp_lifetime) +#define V_dyn_short_lifetime VNET_IPFW(dyn_short_lifetime) +#define V_dyn_keepalive_interval VNET_IPFW(dyn_keepalive_interval) +#define V_dyn_keepalive_period VNET_IPFW(dyn_keepalive_period) +#define V_dyn_keepalive VNET_IPFW(dyn_keepalive) +#define V_static_count VNET_IPFW(static_count) +#define V_static_len VNET_IPFW(static_len) +#define V_dyn_count VNET_IPFW(dyn_count) +#define V_dyn_max VNET_IPFW(dyn_max) + +#define V_norule_counter VNET_IPFW(norule_counter) +#define V_ipfw_timeout VNET_IPFW(ipfw_timeout) + #endif /* _KERNEL */ #endif /* _IPFW2_H */ --- /u/marko/p4/head/src/sys/netinet/ip_divert.c 2007-10-29 17:17:43.000000000 +0100 +++ src/sys/netinet/ip_divert.c 2007-12-10 11:26:11.000000000 +0100 @@ -34,6 +34,7 @@ #include "opt_inet.h" #include "opt_ipfw.h" #include "opt_mac.h" +#include "opt_vimage.h" #ifndef INET #error "IPDIVERT requires INET." 
#endif @@ -61,6 +62,7 @@ #include +#include #include #include #include --- /u/marko/p4/head/src/sys/netinet/ip_dummynet.c 2008-02-27 18:29:08.000000000 +0100 +++ src/sys/netinet/ip_dummynet.c 2008-01-14 19:23:52.000000000 +0100 @@ -26,7 +26,7 @@ */ #include -__FBSDID("$FreeBSD: src/sys/netinet/ip_dummynet.c,v 1.115 2008/02/27 13:52:33 dwmalone Exp $"); +__FBSDID("$FreeBSD: src/sys/netinet/ip_dummynet.c,v 1.114 2007/12/25 09:36:51 oleg Exp $"); #define DUMMYNET_DEBUG @@ -98,9 +98,6 @@ static int pipe_expire = 1 ; /* expire queue if empty */ static int dn_max_ratio = 16 ; /* max queues/buckets ratio */ -static long pipe_slot_limit = 100; /* Foot shooting limit for pipe queues. */ -static long pipe_byte_limit = 1024 * 1024; - static int red_lookup_depth = 256; /* RED - default lookup table depth */ static int red_avg_pkt_size = 512; /* RED - default medium packet size */ static int red_max_pkt_size = 1500; /* RED - default max packet size */ @@ -201,10 +198,6 @@ SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_drop, CTLFLAG_RD, &io_pkt_drop, 0, "Number of packets dropped by dummynet."); -SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_slot_limit, - CTLFLAG_RW, &pipe_slot_limit, 0, "Upper limit in slots for pipe queue."); -SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_byte_limit, - CTLFLAG_RW, &pipe_byte_limit, 0, "Upper limit in bytes for pipe queue."); #endif #ifdef DUMMYNET_DEBUG @@ -1699,12 +1692,12 @@ x->plr = src->plr; x->flow_mask = src->flow_mask; if (x->flags_fs & DN_QSIZE_IS_BYTES) { - if (x->qsize > pipe_byte_limit) + if (x->qsize > 1024 * 1024) x->qsize = 1024 * 1024; } else { if (x->qsize == 0) x->qsize = 50; - if (x->qsize > pipe_slot_limit) + if (x->qsize > 100) x->qsize = 50; } /* Configuring RED. 
*/ --- /u/marko/p4/head/src/sys/netinet/ip_fastfwd.c 2007-10-16 13:53:37.000000000 +0200 +++ src/sys/netinet/ip_fastfwd.c 2007-10-22 18:06:41.000000000 +0200 @@ -78,6 +78,7 @@ #include "opt_ipfw.h" #include "opt_ipstealth.h" +#include "opt_vimage.h" #include #include @@ -87,7 +88,9 @@ #include #include #include +#include +#include #include #include #include @@ -95,6 +98,7 @@ #include #include +#include #include #include #include @@ -105,13 +109,16 @@ #include -static int ipfastforward_active = 0; -SYSCTL_INT(_net_inet_ip, OID_AUTO, fastforwarding, CTLFLAG_RW, - &ipfastforward_active, 0, "Enable fast IP forwarding"); +#ifndef VIMAGE +static int ipfastforward_active; +#endif +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, fastforwarding, + CTLFLAG_RW, ipfastforward_active, 0, "Enable fast IP forwarding"); static struct sockaddr_in * ip_findroute(struct route *ro, struct in_addr dest, struct mbuf *m) { + INIT_VNET_INET(curvnet); struct sockaddr_in *dst; struct rtentry *rt; @@ -135,8 +142,8 @@ if (rt->rt_flags & RTF_GATEWAY) dst = (struct sockaddr_in *)rt->rt_gateway; } else { - ipstat.ips_noroute++; - ipstat.ips_cantforward++; + V_ipstat.ips_noroute++; + V_ipstat.ips_cantforward++; if (rt) RTFREE(rt); icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0); @@ -155,6 +162,7 @@ struct mbuf * ip_fastforward(struct mbuf *m) { + INIT_VNET_INET(curvnet); struct ip *ip; struct mbuf *m0 = NULL; struct route ro; @@ -171,7 +179,7 @@ /* * Are we active and forwarding packets? */ - if (!ipfastforward_active || !ipforwarding) + if (!V_ipfastforward_active || !V_ipforwarding) return m; M_ASSERTVALID(m); @@ -187,7 +195,7 @@ * Is entire packet big enough? 
*/ if (m->m_pkthdr.len < sizeof(struct ip)) { - ipstat.ips_tooshort++; + V_ipstat.ips_tooshort++; goto drop; } @@ -196,7 +204,7 @@ */ if (m->m_len < sizeof (struct ip) && (m = m_pullup(m, sizeof (struct ip))) == NULL) { - ipstat.ips_toosmall++; + V_ipstat.ips_toosmall++; return NULL; /* mbuf already free'd */ } @@ -206,7 +214,7 @@ * Is it IPv4? */ if (ip->ip_v != IPVERSION) { - ipstat.ips_badvers++; + V_ipstat.ips_badvers++; goto drop; } @@ -215,12 +223,12 @@ */ hlen = ip->ip_hl << 2; if (hlen < sizeof(struct ip)) { /* minimum header length */ - ipstat.ips_badlen++; + V_ipstat.ips_badlen++; goto drop; } if (hlen > m->m_len) { if ((m = m_pullup(m, hlen)) == NULL) { - ipstat.ips_badhlen++; + V_ipstat.ips_badhlen++; return NULL; /* mbuf already free'd */ } ip = mtod(m, struct ip *); @@ -238,7 +246,7 @@ sum = in_cksum(m, hlen); } if (sum) { - ipstat.ips_badsum++; + V_ipstat.ips_badsum++; goto drop; } @@ -253,7 +261,7 @@ * Is IP length longer than packet we have got? */ if (m->m_pkthdr.len < ip_len) { - ipstat.ips_tooshort++; + V_ipstat.ips_tooshort++; goto drop; } @@ -273,7 +281,7 @@ */ if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { - ipstat.ips_badaddr++; + V_ipstat.ips_badaddr++; goto drop; } @@ -331,7 +339,7 @@ if (in_localip(ip->ip_dst)) return m; - ipstat.ips_total++; + V_ipstat.ips_total++; /* * Step 3: incoming packet firewall processing @@ -513,7 +521,7 @@ */ if ((ifp->if_snd.ifq_len + ip->ip_len / ifp->if_mtu + 1) >= ifp->if_snd.ifq_maxlen) { - ipstat.ips_odropped++; + V_ipstat.ips_odropped++; /* would send source quench here but that is depreciated */ goto drop; } @@ -552,7 +560,7 @@ * Handle EMSGSIZE with icmp reply needfrag for TCP MTU discovery */ if (ip->ip_off & IP_DF) { - ipstat.ips_cantfrag++; + V_ipstat.ips_cantfrag++; icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0, mtu); goto consumed; @@ -590,16 +598,16 @@ m_freem(m); } } else - 
ipstat.ips_fragmented++; + V_ipstat.ips_fragmented++; } } if (error != 0) - ipstat.ips_odropped++; + V_ipstat.ips_odropped++; else { ro.ro_rt->rt_rmx.rmx_pksent++; - ipstat.ips_forward++; - ipstat.ips_fastforward++; + V_ipstat.ips_forward++; + V_ipstat.ips_fastforward++; } consumed: RTFREE(ro.ro_rt); --- /u/marko/p4/head/src/sys/netinet/ip_fw2.c 2008-02-27 18:29:08.000000000 +0100 +++ src/sys/netinet/ip_fw2.c 2008-02-27 17:59:48.000000000 +0100 @@ -45,6 +45,7 @@ #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" +#include "opt_vimage.h" #include #include @@ -64,6 +65,9 @@ #include #include #include +#include + +#include #include #include #include @@ -71,6 +75,7 @@ #define IPFW_INTERNAL /* Access to protected data structures in ip_fw.h. */ +#include #include #include #include @@ -110,6 +115,11 @@ #include +static int vnet_ipfw_iattach(const void *); +static int vnet_ipfw_idetach(const void *); + +VNET_MOD_DECLARE(IPFW, ipfw, vnet_ipfw_iattach, vnet_ipfw_idetach, INET, NULL) + /* * set_disable contains one bit per set value (0..31). * If the bit is set, all rules with the corresponding set @@ -118,36 +128,18 @@ * and CANNOT be disabled. * Rules in set RESVD_SET can only be deleted explicitly. */ +#ifndef VIMAGE static u_int32_t set_disable; static int fw_verbose; static int verbose_limit; static struct callout ipfw_timeout; +#endif + static uma_zone_t ipfw_dyn_rule_zone; #define IPFW_DEFAULT_RULE 65535 -/* - * Data structure to cache our ucred related - * information. This structure only gets used if - * the user specified UID/GID based constraints in - * a firewall rule. 
- */ -struct ip_fw_ugid { - gid_t fw_groups[NGROUPS]; - int fw_ngroups; - uid_t fw_uid; - int fw_prid; -}; - -#define IPFW_TABLES_MAX 128 -struct ip_fw_chain { - struct ip_fw *rules; /* list of rules */ - struct ip_fw *reap; /* list of rules to reap */ - LIST_HEAD(, cfg_nat) nat; /* list of nat entries */ - struct radix_node_head *tables[IPFW_TABLES_MAX]; - struct rwlock rwmtx; -}; #define IPFW_LOCK_INIT(_chain) \ rw_init(&(_chain)->rwmtx, "IPFW static rules") #define IPFW_LOCK_DESTROY(_chain) rw_destroy(&(_chain)->rwmtx) @@ -161,40 +153,42 @@ /* * list of rules for layer 3 */ +#ifndef VIMAGE static struct ip_fw_chain layer3_chain; +#endif MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's"); MALLOC_DEFINE(M_IPFW_TBL, "ipfw_tbl", "IpFw tables"); -struct table_entry { - struct radix_node rn[2]; - struct sockaddr_in addr, mask; - u_int32_t value; -}; - -static int fw_debug = 1; -static int autoinc_step = 100; /* bounded to 1..1000 in add_rule() */ +#ifndef VIMAGE +static int fw_debug; +static int autoinc_step; +#endif +#ifdef VIMAGE +extern int ipfw_chg_hook(SYSCTL_HANDLER_V_ARGS); +#else extern int ipfw_chg_hook(SYSCTL_HANDLER_ARGS); +#endif #ifdef SYSCTL_NODE SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall"); -SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, enable, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &fw_enable, 0, +SYSCTL_V_PROC(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, enable, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, fw_enable, 0, ipfw_chg_hook, "I", "Enable ipfw"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step, CTLFLAG_RW, - &autoinc_step, 0, "Rule number autincrement step"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, one_pass, - CTLFLAG_RW | CTLFLAG_SECURE3, - &fw_one_pass, 0, +SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, autoinc_step, + CTLFLAG_RW, autoinc_step, 0, "Rule number autincrement step"); +SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, one_pass, + CTLFLAG_RW | CTLFLAG_SECURE3, fw_one_pass, 0, 
"Only do a single pass through ipfw when using dummynet(4)"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, debug, CTLFLAG_RW, - &fw_debug, 0, "Enable printing of debug ip_fw statements"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose, +SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, debug, CTLFLAG_RW, + fw_debug, 0, "Enable printing of debug ip_fw statements"); +SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, verbose, CTLFLAG_RW | CTLFLAG_SECURE3, - &fw_verbose, 0, "Log matches to ipfw rules"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_RW, - &verbose_limit, 0, "Set upper limit of matches of ipfw rules logged"); + fw_verbose, 0, "Log matches to ipfw rules"); +SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, verbose_limit, + CTLFLAG_RW, + verbose_limit, 0, "Set upper limit of matches of ipfw rules logged"); /* * Description of dynamic rules. @@ -232,9 +226,11 @@ * obey the 'randomized match', and we do not do multiple * passes through the firewall. XXX check the latter!!! */ +#ifndef VIMAGE static ipfw_dyn_rule **ipfw_dyn_v = NULL; -static u_int32_t dyn_buckets = 256; /* must be power of 2 */ -static u_int32_t curr_dyn_buckets = 256; /* must be power of 2 */ +static u_int32_t dyn_buckets; +static u_int32_t curr_dyn_buckets; +#endif static struct mtx ipfw_dyn_mtx; /* mutex guarding dynamic rules */ #define IPFW_DYN_LOCK_INIT() \ @@ -247,12 +243,14 @@ /* * Timeouts for various events in handing dynamic rules. 
*/ -static u_int32_t dyn_ack_lifetime = 300; -static u_int32_t dyn_syn_lifetime = 20; -static u_int32_t dyn_fin_lifetime = 1; -static u_int32_t dyn_rst_lifetime = 1; -static u_int32_t dyn_udp_lifetime = 10; -static u_int32_t dyn_short_lifetime = 5; +#ifndef VIMAGE +static u_int32_t dyn_ack_lifetime; +static u_int32_t dyn_syn_lifetime; +static u_int32_t dyn_fin_lifetime; +static u_int32_t dyn_rst_lifetime; +static u_int32_t dyn_udp_lifetime; +static u_int32_t dyn_short_lifetime; +#endif /* * Keepalives are sent if dyn_keepalive is set. They are sent every @@ -261,57 +259,68 @@ * dyn_rst_lifetime and dyn_fin_lifetime should be strictly lower * than dyn_keepalive_period. */ - -static u_int32_t dyn_keepalive_interval = 20; -static u_int32_t dyn_keepalive_period = 5; -static u_int32_t dyn_keepalive = 1; /* do send keepalives */ +#ifndef VIMAGE +static u_int32_t dyn_keepalive_interval; +static u_int32_t dyn_keepalive_period; +static u_int32_t dyn_keepalive; static u_int32_t static_count; /* # of static rules */ static u_int32_t static_len; /* size in bytes of static rules */ -static u_int32_t dyn_count; /* # of dynamic rules */ -static u_int32_t dyn_max = 4096; /* max # of dynamic rules */ +static u_int32_t dyn_count; /* # of dynamic rules */ +static u_int32_t dyn_max; /* max # of dynamic rules */ +#endif -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_buckets, CTLFLAG_RW, - &dyn_buckets, 0, "Number of dyn. buckets"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets, CTLFLAG_RD, - &curr_dyn_buckets, 0, "Current Number of dyn. buckets"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_count, CTLFLAG_RD, - &dyn_count, 0, "Number of dyn. rules"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_max, CTLFLAG_RW, - &dyn_max, 0, "Max number of dyn. rules"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count, CTLFLAG_RD, - &static_count, 0, "Number of static rules"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, CTLFLAG_RW, - &dyn_ack_lifetime, 0, "Lifetime of dyn. 
rules for acks"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, CTLFLAG_RW, - &dyn_syn_lifetime, 0, "Lifetime of dyn. rules for syn"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, CTLFLAG_RW, - &dyn_fin_lifetime, 0, "Lifetime of dyn. rules for fin"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, CTLFLAG_RW, - &dyn_rst_lifetime, 0, "Lifetime of dyn. rules for rst"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, CTLFLAG_RW, - &dyn_udp_lifetime, 0, "Lifetime of dyn. rules for UDP"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, CTLFLAG_RW, - &dyn_short_lifetime, 0, "Lifetime of dyn. rules for other situations"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, CTLFLAG_RW, - &dyn_keepalive, 0, "Enable keepalives for dyn. rules"); +SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, dyn_buckets, + CTLFLAG_RW, dyn_buckets, 0, "Number of dyn. buckets"); +SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, curr_dyn_buckets, + CTLFLAG_RD, curr_dyn_buckets, 0, "Current Number of dyn. buckets"); +SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, dyn_count, + CTLFLAG_RD, dyn_count, 0, "Number of dyn. rules"); +SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, dyn_max, + CTLFLAG_RW, dyn_max, 0, "Max number of dyn. rules"); +SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, static_count, + CTLFLAG_RD, static_count, 0, "Number of static rules"); +SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, + CTLFLAG_RW, dyn_ack_lifetime, 0, "Lifetime of dyn. rules for acks"); +SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, + CTLFLAG_RW, dyn_syn_lifetime, 0, "Lifetime of dyn. rules for syn"); +SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, + CTLFLAG_RW, dyn_fin_lifetime, 0, "Lifetime of dyn. 
rules for fin"); +SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, + CTLFLAG_RW, dyn_rst_lifetime, 0, "Lifetime of dyn. rules for rst"); +SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, + CTLFLAG_RW, dyn_udp_lifetime, 0, "Lifetime of dyn. rules for UDP"); +SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, + CTLFLAG_RW, dyn_short_lifetime, 0, + "Lifetime of dyn. rules for other situations"); +SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, dyn_keepalive, + CTLFLAG_RW, dyn_keepalive, 0, "Enable keepalives for dyn. rules"); + +#ifndef VIMAGE +static int fw_deny_unknown_exthdrs; +#endif #ifdef INET6 /* * IPv6 specific variables */ -SYSCTL_DECL(_net_inet6_ip6); -static struct sysctl_ctx_list ip6_fw_sysctl_ctx; -static struct sysctl_oid *ip6_fw_sysctl_tree; +SYSCTL_DECL(_net_inet6_ip6); +SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, fw, CTLFLAG_RW | CTLFLAG_SECURE, + 0, "Firewall"); +SYSCTL_V_PROC(V_NET, vnet_ipfw, _net_inet6_ip6_fw, OID_AUTO, enable, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, fw6_enable, 0, + ipfw_chg_hook, "I", "Enable ipfw+6"); +SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet6_ip6_fw, OID_AUTO, + deny_unknown_exthdrs, CTLFLAG_RW | CTLFLAG_SECURE, + fw_deny_unknown_exthdrs, 0, + "Deny packets with unknown IPv6 Extension Headers"); #endif /* INET6 */ #endif /* SYSCTL_NODE */ #ifdef IPFIREWALL_NAT MODULE_DEPEND(ipfw, libalias, 1, 1, 1); #endif -static int fw_deny_unknown_exthdrs = 1; - /* * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T @@ -581,12 +590,13 @@ static int search_ip6_addr_net (struct in6_addr * ip6_addr) { + INIT_VNET_NET(curvnet); struct ifnet *mdc; struct ifaddr *mdc2; struct in6_ifaddr *fdm; struct in6_addr copia; - TAILQ_FOREACH(mdc, &ifnet, if_link) + TAILQ_FOREACH(mdc, &V_ifnet, if_link) TAILQ_FOREACH(mdc2, &mdc->if_addrlist, ifa_list) { if (mdc2->ifa_addr->sa_family == AF_INET6) { fdm = (struct in6_ifaddr *)mdc2; @@ -647,6 +657,7 @@ 
return 1; } + static __inline int hash_packet6(struct ipfw_flow_id *id) { @@ -757,7 +768,9 @@ #endif /* INET6 */ +#ifndef VIMAGE static u_int64_t norule_counter; /* counter for ipfw_log(NULL...) */ +#endif #define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0 #define SNP(buf) buf, sizeof(buf) @@ -771,6 +784,7 @@ struct mbuf *m, struct ifnet *oif, u_short offset, uint32_t tablearg, struct ip *ip) { + INIT_VNET_IPFW(curvnet); struct ether_header *eh = args->eh; char *action; int limit_reached = 0; @@ -780,11 +794,11 @@ proto[0] = '\0'; if (f == NULL) { /* bogus pkt */ - if (verbose_limit != 0 && norule_counter >= verbose_limit) + if (V_verbose_limit != 0 && V_norule_counter >= V_verbose_limit) return; - norule_counter++; - if (norule_counter == verbose_limit) - limit_reached = verbose_limit; + V_norule_counter++; + if (V_norule_counter == V_verbose_limit) + limit_reached = V_verbose_limit; action = "Refuse"; } else { /* O_LOG is the first action, find the real one */ ipfw_insn *cmd = ACTION_PTR(f); @@ -1037,6 +1051,7 @@ static __inline int hash_packet(struct ipfw_flow_id *id) { + INIT_VNET_IPFW(curvnet); u_int32_t i; #ifdef INET6 @@ -1045,7 +1060,7 @@ else #endif /* INET6 */ i = (id->dst_ip) ^ (id->src_ip) ^ (id->dst_port) ^ (id->src_port); - i &= (curr_dyn_buckets - 1); + i &= (V_curr_dyn_buckets - 1); return i; } @@ -1063,12 +1078,12 @@ q->parent->count--; \ DEB(printf("ipfw: unlink entry 0x%08x %d -> 0x%08x %d, %d left\n",\ (q->id.src_ip), (q->id.src_port), \ - (q->id.dst_ip), (q->id.dst_port), dyn_count-1 ); ) \ + (q->id.dst_ip), (q->id.dst_port), V_dyn_count-1 ); ) \ if (prev != NULL) \ prev->next = q = q->next; \ else \ head = q = q->next; \ - dyn_count--; \ + V_dyn_count--; \ uma_zfree(ipfw_dyn_rule_zone, old_q); } #define TIME_LEQ(a,b) ((int)((a)-(b)) <= 0) @@ -1088,6 +1103,7 @@ static void remove_dyn_rule(struct ip_fw *rule, ipfw_dyn_rule *keep_me) { + INIT_VNET_IPFW(curvnet); static u_int32_t last_remove = 0; #define FORCE (keep_me 
== NULL) @@ -1097,7 +1113,7 @@ IPFW_DYN_LOCK_ASSERT(); - if (ipfw_dyn_v == NULL || dyn_count == 0) + if (V_ipfw_dyn_v == NULL || V_dyn_count == 0) return; /* do not expire more than once per second, it is useless */ if (!FORCE && last_remove == time_uptime) @@ -1110,8 +1126,8 @@ * them in a second pass. */ next_pass: - for (i = 0 ; i < curr_dyn_buckets ; i++) { - for (prev=NULL, q = ipfw_dyn_v[i] ; q ; ) { + for (i = 0 ; i < V_curr_dyn_buckets ; i++) { + for (prev=NULL, q = V_ipfw_dyn_v[i] ; q ; ) { /* * Logic can become complex here, so we split tests. */ @@ -1138,7 +1154,7 @@ goto next; } if (q->dyn_type != O_LIMIT_PARENT || !q->count) { - UNLINK_DYN_RULE(prev, ipfw_dyn_v[i], q); + UNLINK_DYN_RULE(prev, V_ipfw_dyn_v[i], q); continue; } next: @@ -1158,6 +1174,7 @@ lookup_dyn_rule_locked(struct ipfw_flow_id *pkt, int *match_direction, struct tcphdr *tcp) { + INIT_VNET_IPFW(curvnet); /* * stateful ipfw extensions. * Lookup into dynamic session queue @@ -1171,14 +1188,14 @@ IPFW_DYN_LOCK_ASSERT(); - if (ipfw_dyn_v == NULL) + if (V_ipfw_dyn_v == NULL) goto done; /* not found */ i = hash_packet( pkt ); - for (prev=NULL, q = ipfw_dyn_v[i] ; q != NULL ; ) { + for (prev=NULL, q = V_ipfw_dyn_v[i] ; q != NULL ; ) { if (q->dyn_type == O_LIMIT_PARENT && q->count) goto next; if (TIME_LEQ( q->expire, time_uptime)) { /* expire entry */ - UNLINK_DYN_RULE(prev, ipfw_dyn_v[i], q); + UNLINK_DYN_RULE(prev, V_ipfw_dyn_v[i], q); continue; } if (pkt->proto == q->id.proto && @@ -1228,8 +1245,8 @@ if ( prev != NULL) { /* found and not in front */ prev->next = q->next; - q->next = ipfw_dyn_v[i]; - ipfw_dyn_v[i] = q; + q->next = V_ipfw_dyn_v[i]; + V_ipfw_dyn_v[i] = q; } if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */ u_char flags = pkt->flags & (TH_FIN|TH_SYN|TH_RST); @@ -1239,7 +1256,7 @@ q->state |= (dir == MATCH_FORWARD ) ? 
flags : (flags << 8); switch (q->state) { case TH_SYN: /* opening */ - q->expire = time_uptime + dyn_syn_lifetime; + q->expire = time_uptime + V_dyn_syn_lifetime; break; case BOTH_SYN: /* move to established */ @@ -1262,13 +1279,13 @@ } } } - q->expire = time_uptime + dyn_ack_lifetime; + q->expire = time_uptime + V_dyn_ack_lifetime; break; case BOTH_SYN | BOTH_FIN: /* both sides closed */ - if (dyn_fin_lifetime >= dyn_keepalive_period) - dyn_fin_lifetime = dyn_keepalive_period - 1; - q->expire = time_uptime + dyn_fin_lifetime; + if (V_dyn_fin_lifetime >= V_dyn_keepalive_period) + V_dyn_fin_lifetime = V_dyn_keepalive_period - 1; + q->expire = time_uptime + V_dyn_fin_lifetime; break; default: @@ -1280,16 +1297,16 @@ if ( (q->state & ((TH_RST << 8)|TH_RST)) == 0) printf("invalid state: 0x%x\n", q->state); #endif - if (dyn_rst_lifetime >= dyn_keepalive_period) - dyn_rst_lifetime = dyn_keepalive_period - 1; - q->expire = time_uptime + dyn_rst_lifetime; + if (V_dyn_rst_lifetime >= V_dyn_keepalive_period) + V_dyn_rst_lifetime = V_dyn_keepalive_period - 1; + q->expire = time_uptime + V_dyn_rst_lifetime; break; } } else if (pkt->proto == IPPROTO_UDP) { - q->expire = time_uptime + dyn_udp_lifetime; + q->expire = time_uptime + V_dyn_udp_lifetime; } else { /* other protocols */ - q->expire = time_uptime + dyn_short_lifetime; + q->expire = time_uptime + V_dyn_short_lifetime; } done: if (match_direction) @@ -1314,6 +1331,7 @@ static void realloc_dynamic_table(void) { + INIT_VNET_IPFW(curvnet); IPFW_DYN_LOCK_ASSERT(); /* @@ -1322,21 +1340,21 @@ * default to 1024. 
*/ - if (dyn_buckets > 65536) - dyn_buckets = 1024; - if ((dyn_buckets & (dyn_buckets-1)) != 0) { /* not a power of 2 */ - dyn_buckets = curr_dyn_buckets; /* reset */ + if (V_dyn_buckets > 65536) + V_dyn_buckets = 1024; + if ((V_dyn_buckets & (V_dyn_buckets-1)) != 0) { /* not a power of 2 */ + V_dyn_buckets = V_curr_dyn_buckets; /* reset */ return; } - curr_dyn_buckets = dyn_buckets; - if (ipfw_dyn_v != NULL) - free(ipfw_dyn_v, M_IPFW); + V_curr_dyn_buckets = V_dyn_buckets; + if (V_ipfw_dyn_v != NULL) + free(V_ipfw_dyn_v, M_IPFW); for (;;) { - ipfw_dyn_v = malloc(curr_dyn_buckets * sizeof(ipfw_dyn_rule *), + V_ipfw_dyn_v = malloc(V_curr_dyn_buckets * sizeof(ipfw_dyn_rule *), M_IPFW, M_NOWAIT | M_ZERO); - if (ipfw_dyn_v != NULL || curr_dyn_buckets <= 2) + if (V_ipfw_dyn_v != NULL || V_curr_dyn_buckets <= 2) break; - curr_dyn_buckets /= 2; + V_curr_dyn_buckets /= 2; } } @@ -1353,15 +1371,16 @@ static ipfw_dyn_rule * add_dyn_rule(struct ipfw_flow_id *id, u_int8_t dyn_type, struct ip_fw *rule) { + INIT_VNET_IPFW(curvnet); ipfw_dyn_rule *r; int i; IPFW_DYN_LOCK_ASSERT(); - if (ipfw_dyn_v == NULL || - (dyn_count == 0 && dyn_buckets != curr_dyn_buckets)) { + if (V_ipfw_dyn_v == NULL || + (V_dyn_count == 0 && V_dyn_buckets != V_curr_dyn_buckets)) { realloc_dynamic_table(); - if (ipfw_dyn_v == NULL) + if (V_ipfw_dyn_v == NULL) return NULL; /* failed ! 
*/ } i = hash_packet(id); @@ -1383,21 +1402,21 @@ } r->id = *id; - r->expire = time_uptime + dyn_syn_lifetime; + r->expire = time_uptime + V_dyn_syn_lifetime; r->rule = rule; r->dyn_type = dyn_type; r->pcnt = r->bcnt = 0; r->count = 0; r->bucket = i; - r->next = ipfw_dyn_v[i]; - ipfw_dyn_v[i] = r; - dyn_count++; + r->next = V_ipfw_dyn_v[i]; + V_ipfw_dyn_v[i] = r; + V_dyn_count++; DEB(printf("ipfw: add dyn entry ty %d 0x%08x %d -> 0x%08x %d, total %d\n", dyn_type, (r->id.src_ip), (r->id.src_port), (r->id.dst_ip), (r->id.dst_port), - dyn_count ); ) + V_dyn_count ); ) return r; } @@ -1408,15 +1427,16 @@ static ipfw_dyn_rule * lookup_dyn_parent(struct ipfw_flow_id *pkt, struct ip_fw *rule) { + INIT_VNET_IPFW(curvnet); ipfw_dyn_rule *q; int i; IPFW_DYN_LOCK_ASSERT(); - if (ipfw_dyn_v) { + if (V_ipfw_dyn_v) { int is_v6 = IS_IP6_FLOW_ID(pkt); i = hash_packet( pkt ); - for (q = ipfw_dyn_v[i] ; q != NULL ; q=q->next) + for (q = V_ipfw_dyn_v[i] ; q != NULL ; q=q->next) if (q->dyn_type == O_LIMIT_PARENT && rule== q->rule && pkt->proto == q->id.proto && @@ -1433,7 +1453,7 @@ pkt->dst_ip == q->id.dst_ip) ) ) { - q->expire = time_uptime + dyn_short_lifetime; + q->expire = time_uptime + V_dyn_short_lifetime; DEB(printf("ipfw: lookup_dyn_parent found 0x%p\n",q);) return q; } @@ -1451,6 +1471,7 @@ install_state(struct ip_fw *rule, ipfw_insn_limit *cmd, struct ip_fw_args *args, uint32_t tablearg) { + INIT_VNET_IPFW(curvnet); static int last_log; ipfw_dyn_rule *q; struct in_addr da; @@ -1480,11 +1501,11 @@ return (0); } - if (dyn_count >= dyn_max) + if (V_dyn_count >= V_dyn_max) /* Run out of slots, try to remove any expired rule. */ remove_dyn_rule(NULL, (ipfw_dyn_rule *)1); - if (dyn_count >= dyn_max) { + if (V_dyn_count >= V_dyn_max) { if (last_log != time_uptime) { last_log = time_uptime; printf("ipfw: %s: Too many dynamic rules\n", __func__); @@ -1545,7 +1566,7 @@ /* See if we can remove some expired rule. 
*/ remove_dyn_rule(rule, parent); if (parent->count >= conn_limit) { - if (fw_verbose && last_log != time_uptime) { + if (V_fw_verbose && last_log != time_uptime) { last_log = time_uptime; #ifdef INET6 /* @@ -1611,6 +1632,7 @@ send_pkt(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t seq, u_int32_t ack, int flags) { + INIT_VNET_INET(curvnet); struct mbuf *m; struct ip *ip; struct tcphdr *tcp; @@ -1687,7 +1709,7 @@ /* * now fill fields left out earlier */ - ip->ip_ttl = ip_defttl; + ip->ip_ttl = V_ip_defttl; ip->ip_len = m->m_pkthdr.len; m->m_flags |= M_SKIP_FIREWALL; return (m); @@ -1777,6 +1799,7 @@ add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, uint8_t mlen, uint32_t value) { + INIT_VNET_IPFW(curvnet); struct radix_node_head *rnh; struct table_entry *ent; @@ -1790,14 +1813,14 @@ ent->addr.sin_len = ent->mask.sin_len = 8; ent->mask.sin_addr.s_addr = htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0); ent->addr.sin_addr.s_addr = addr & ent->mask.sin_addr.s_addr; - IPFW_WLOCK(&layer3_chain); + IPFW_WLOCK(&V_layer3_chain); if (rnh->rnh_addaddr(&ent->addr, &ent->mask, rnh, (void *)ent) == NULL) { - IPFW_WUNLOCK(&layer3_chain); + IPFW_WUNLOCK(&V_layer3_chain); free(ent, M_IPFW_TBL); return (EEXIST); } - IPFW_WUNLOCK(&layer3_chain); + IPFW_WUNLOCK(&V_layer3_chain); return (0); } @@ -1981,6 +2004,7 @@ u_int16_t src_port, struct ip_fw_ugid *ugp, int *lookup, struct inpcb *inp) { + INIT_VNET_INET(curvnet); struct inpcbinfo *pi; int wildcard; struct inpcb *pcb; @@ -2008,10 +2032,10 @@ return (0); if (proto == IPPROTO_TCP) { wildcard = 0; - pi = &tcbinfo; + pi = &V_tcbinfo; } else if (proto == IPPROTO_UDP) { wildcard = INPLOOKUP_WILDCARD; - pi = &udbinfo; + pi = &V_udbinfo; } else return 0; match = 0; @@ -2069,9 +2093,9 @@ struct cfg_nat *ptr; struct ifaddr *ifa; - IPFW_WLOCK(&layer3_chain); + IPFW_WLOCK(&V_layer3_chain); /* Check every nat entry... 
*/ - LIST_FOREACH(ptr, &layer3_chain.nat, _next) { + LIST_FOREACH(ptr, &V_layer3_chain.nat, _next) { /* ...using nic 'ifp->if_xname' as dynamic alias address. */ if (strncmp(ptr->if_name, ifp->if_xname, IF_NAMESIZE) == 0) { mtx_lock(&ifp->if_addr_mtx); @@ -2087,7 +2111,7 @@ mtx_unlock(&ifp->if_addr_mtx); } } - IPFW_WUNLOCK(&layer3_chain); + IPFW_WUNLOCK(&V_layer3_chain); } static void @@ -2095,8 +2119,8 @@ { struct ip_fw *rule; - IPFW_WLOCK_ASSERT(&layer3_chain); - for (rule = layer3_chain.rules; rule; rule = rule->next) { + IPFW_WLOCK_ASSERT(&V_layer3_chain); + for (rule = V_layer3_chain.rules; rule; rule = rule->next) { ipfw_insn_nat *cmd = (ipfw_insn_nat *)ACTION_PTR(rule); if (cmd->o.opcode != O_NAT) continue; @@ -2110,19 +2134,19 @@ { struct cfg_nat *ptr; - LIST_FOREACH(ptr, &layer3_chain.nat, _next) + LIST_FOREACH(ptr, &V_layer3_chain.nat, _next) if (ptr->id == i) return(ptr); return (NULL); } #define HOOK_NAT(b, p) do { \ - IPFW_WLOCK_ASSERT(&layer3_chain); \ + IPFW_WLOCK_ASSERT(&V_layer3_chain); \ LIST_INSERT_HEAD(b, p, _next); \ } while (0) #define UNHOOK_NAT(p) do { \ - IPFW_WLOCK_ASSERT(&layer3_chain); \ + IPFW_WLOCK_ASSERT(&V_layer3_chain); \ LIST_REMOVE(p, _next); \ } while (0) @@ -2276,6 +2300,9 @@ int ipfw_chk(struct ip_fw_args *args) { + INIT_VNET_INET(curvnet); + INIT_VNET_IPFW(curvnet); + /* * Local variables holding state during the processing of a packet: * @@ -2378,7 +2405,7 @@ */ int dyn_dir = MATCH_UNKNOWN; ipfw_dyn_rule *q = NULL; - struct ip_fw_chain *chain = &layer3_chain; + struct ip_fw_chain *chain = &V_layer3_chain; struct m_tag *mtag; /* @@ -2481,7 +2508,7 @@ printf("IPFW2: IPV6 - Unknown Routing " "Header type(%d)\n", ((struct ip6_rthdr *)ulp)->ip6r_type); - if (fw_deny_unknown_exthdrs) + if (V_fw_deny_unknown_exthdrs) return (IP_FW_DENY); break; } @@ -2505,7 +2532,7 @@ if (offset == 0) { printf("IPFW2: IPV6 - Invalid Fragment " "Header\n"); - if (fw_deny_unknown_exthdrs) + if (V_fw_deny_unknown_exthdrs) return (IP_FW_DENY); break; } 
@@ -2577,7 +2604,7 @@ default: printf("IPFW2: IPV6 - Unknown Extension " "Header(%d), ext_hd=%x\n", proto, ext_hd); - if (fw_deny_unknown_exthdrs) + if (V_fw_deny_unknown_exthdrs) return (IP_FW_DENY); PULLUP_TO(hlen, ulp, struct ip6_ext); break; @@ -2658,7 +2685,7 @@ * XXX should not happen here, but optimized out in * the caller. */ - if (fw_one_pass) { + if (V_fw_one_pass) { IPFW_RUNLOCK(chain); return (IP_FW_PASS); } @@ -2703,7 +2730,7 @@ int l, cmdlen, skip_or; /* skip rest of OR block */ again: - if (set_disable & (1 << f->set) ) + if (V_set_disable & (1 << f->set) ) continue; skip_or = 0; @@ -3089,7 +3116,7 @@ } case O_LOG: - if (fw_verbose) + if (V_fw_verbose) ipfw_log(f, hlen, args, m, oif, offset, tablearg, ip); match = 1; @@ -3689,7 +3716,7 @@ return (retval); pullup_failed: - if (fw_verbose) + if (V_fw_verbose) printf("ipfw: pullup failed\n"); return (IP_FW_DENY); } @@ -3717,6 +3744,7 @@ static int add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule) { + INIT_VNET_IPFW(curvnet); struct ip_fw *rule, *f, *prev; int l = RULESIZE(input_rule); @@ -3747,10 +3775,10 @@ * If rulenum is 0, find highest numbered rule before the * default rule, and add autoinc_step */ - if (autoinc_step < 1) - autoinc_step = 1; - else if (autoinc_step > 1000) - autoinc_step = 1000; + if (V_autoinc_step < 1) + V_autoinc_step = 1; + else if (V_autoinc_step > 1000) + V_autoinc_step = 1000; if (rule->rulenum == 0) { /* * locate the highest numbered rule before default @@ -3760,8 +3788,8 @@ break; rule->rulenum = f->rulenum; } - if (rule->rulenum < IPFW_DEFAULT_RULE - autoinc_step) - rule->rulenum += autoinc_step; + if (rule->rulenum < IPFW_DEFAULT_RULE - V_autoinc_step) + rule->rulenum += V_autoinc_step; input_rule->rulenum = rule->rulenum; } @@ -3782,11 +3810,11 @@ } flush_rule_ptrs(chain); done: - static_count++; - static_len += l; + V_static_count++; + V_static_len += l; IPFW_WUNLOCK(chain); DEB(printf("ipfw: installed rule %d, static count now %d\n", - rule->rulenum, 
static_count);) + rule->rulenum, V_static_count);) return (0); } @@ -3802,6 +3830,7 @@ remove_rule(struct ip_fw_chain *chain, struct ip_fw *rule, struct ip_fw *prev) { + INIT_VNET_IPFW(curvnet); struct ip_fw *n; int l = RULESIZE(rule); @@ -3815,8 +3844,8 @@ chain->rules = n; else prev->next = n; - static_count--; - static_len -= l; + V_static_count--; + V_static_len -= l; rule->next = chain->reap; chain->reap = rule; @@ -4016,6 +4045,7 @@ static int zero_entry(struct ip_fw_chain *chain, u_int32_t arg, int log_only) { + INIT_VNET_IPFW(curvnet); struct ip_fw *rule; char *msg; @@ -4030,7 +4060,7 @@ IPFW_WLOCK(chain); if (rulenum == 0) { - norule_counter = 0; + V_norule_counter = 0; for (rule = chain->rules; rule; rule = rule->next) { /* Skip rules from another set. */ if (cmd == 1 && rule->set != set) @@ -4064,7 +4094,7 @@ } IPFW_WUNLOCK(chain); - if (fw_verbose) + if (V_fw_verbose) log(LOG_SECURITY | LOG_NOTICE, msg, rulenum); return (0); } @@ -4365,6 +4395,7 @@ static size_t ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space) { + INIT_VNET_IPFW(curvnet); char *bp = buf; char *ep = bp + space; struct ip_fw *rule; @@ -4389,20 +4420,21 @@ * in a wild attempt to keep the ABI the same. * Why do we do this on EVERY rule? 
*/ - bcopy(&set_disable, &(((struct ip_fw *)bp)->next_rule), - sizeof(set_disable)); + bcopy(&V_set_disable, + &(((struct ip_fw *)bp)->next_rule), + sizeof(V_set_disable)); if (((struct ip_fw *)bp)->timestamp) ((struct ip_fw *)bp)->timestamp += boot_seconds; bp += i; } } IPFW_RUNLOCK(chain); - if (ipfw_dyn_v) { + if (V_ipfw_dyn_v) { ipfw_dyn_rule *p, *last = NULL; IPFW_DYN_LOCK(); - for (i = 0 ; i < curr_dyn_buckets; i++) - for (p = ipfw_dyn_v[i] ; p != NULL; p = p->next) { + for (i = 0 ; i < V_curr_dyn_buckets; i++) + for (p = V_ipfw_dyn_v[i] ; p != NULL; p = p->next) { if (bp + sizeof *p <= ep) { ipfw_dyn_rule *dst = (ipfw_dyn_rule *)bp; @@ -4445,6 +4477,7 @@ static int ipfw_ctl(struct sockopt *sopt) { + INIT_VNET_IPFW(curvnet); #define RULE_MAXSIZE (256*sizeof(u_int32_t)) int error; size_t size; @@ -4481,9 +4514,9 @@ * change between calculating the size and returning the * data in which case we'll just return what fits. */ - size = static_len; /* size of static rules */ - if (ipfw_dyn_v) /* add size of dyn.rules */ - size += (dyn_count * sizeof(ipfw_dyn_rule)); + size = V_static_len; /* size of static rules */ + if (V_ipfw_dyn_v) /* add size of dyn.rules */ + size += (V_dyn_count * sizeof(ipfw_dyn_rule)); /* * XXX todo: if the user passes a short length just to know @@ -4492,7 +4525,7 @@ */ buf = malloc(size, M_TEMP, M_WAITOK); error = sooptcopyout(sopt, buf, - ipfw_getrules(&layer3_chain, buf, size)); + ipfw_getrules(&V_layer3_chain, buf, size)); free(buf, M_TEMP); break; @@ -4510,12 +4543,12 @@ * the old list without the need for a lock. 
*/ - IPFW_WLOCK(&layer3_chain); - layer3_chain.reap = NULL; - free_chain(&layer3_chain, 0 /* keep default rule */); - rule = layer3_chain.reap; - layer3_chain.reap = NULL; - IPFW_WUNLOCK(&layer3_chain); + IPFW_WLOCK(&V_layer3_chain); + V_layer3_chain.reap = NULL; + free_chain(&V_layer3_chain, 0 /* keep default rule */); + rule = V_layer3_chain.reap; + V_layer3_chain.reap = NULL; + IPFW_WUNLOCK(&V_layer3_chain); if (rule != NULL) reap_rules(rule); break; @@ -4527,7 +4560,7 @@ if (error == 0) error = check_ipfw_struct(rule, sopt->sopt_valsize); if (error == 0) { - error = add_rule(&layer3_chain, rule); + error = add_rule(&V_layer3_chain, rule); size = RULESIZE(rule); if (!error && sopt->sopt_dir == SOPT_GET) error = sooptcopyout(sopt, rule, size); @@ -4554,10 +4587,10 @@ break; size = sopt->sopt_valsize; if (size == sizeof(u_int32_t)) /* delete or reassign */ - error = del_entry(&layer3_chain, rulenum[0]); + error = del_entry(&V_layer3_chain, rulenum[0]); else if (size == 2*sizeof(u_int32_t)) /* set enable/disable */ - set_disable = - (set_disable | rulenum[0]) & ~rulenum[1] & + V_set_disable = + (V_set_disable | rulenum[0]) & ~rulenum[1] & ~(1<sopt_name == IP_FW_RESETLOG); break; @@ -4584,7 +4617,7 @@ sizeof(ent), sizeof(ent)); if (error) break; - error = add_table_entry(&layer3_chain, ent.tbl, + error = add_table_entry(&V_layer3_chain, ent.tbl, ent.addr, ent.masklen, ent.value); } break; @@ -4597,7 +4630,7 @@ sizeof(ent), sizeof(ent)); if (error) break; - error = del_table_entry(&layer3_chain, ent.tbl, + error = del_table_entry(&V_layer3_chain, ent.tbl, ent.addr, ent.masklen); } break; @@ -4610,9 +4643,9 @@ sizeof(tbl), sizeof(tbl)); if (error) break; - IPFW_WLOCK(&layer3_chain); - error = flush_table(&layer3_chain, tbl); - IPFW_WUNLOCK(&layer3_chain); + IPFW_WLOCK(&V_layer3_chain); + error = flush_table(&V_layer3_chain, tbl); + IPFW_WUNLOCK(&V_layer3_chain); } break; @@ -4623,9 +4656,9 @@ if ((error = sooptcopyin(sopt, &tbl, sizeof(tbl), sizeof(tbl)))) break; - 
IPFW_RLOCK(&layer3_chain); - error = count_table(&layer3_chain, tbl, &cnt); - IPFW_RUNLOCK(&layer3_chain); + IPFW_RLOCK(&V_layer3_chain); + error = count_table(&V_layer3_chain, tbl, &cnt); + IPFW_RUNLOCK(&V_layer3_chain); if (error) break; error = sooptcopyout(sopt, &cnt, sizeof(cnt)); @@ -4649,9 +4682,9 @@ } tbl->size = (size - sizeof(*tbl)) / sizeof(ipfw_table_entry); - IPFW_RLOCK(&layer3_chain); - error = dump_table(&layer3_chain, tbl); - IPFW_RUNLOCK(&layer3_chain); + IPFW_RLOCK(&V_layer3_chain); + error = dump_table(&V_layer3_chain, tbl); + IPFW_RUNLOCK(&V_layer3_chain); if (error) { free(tbl, M_TEMP); break; @@ -4675,20 +4708,20 @@ /* * Find/create nat rule. */ - IPFW_WLOCK(&layer3_chain); + IPFW_WLOCK(&V_layer3_chain); ptr = lookup_nat(ser_n->id); if (ptr == NULL) { /* New rule: allocate and init new instance. */ ptr = malloc(sizeof(struct cfg_nat), M_IPFW, M_NOWAIT | M_ZERO); if (ptr == NULL) { - IPFW_WUNLOCK(&layer3_chain); + IPFW_WUNLOCK(&V_layer3_chain); free(buf, M_IPFW); return (ENOSPC); } ptr->lib = LibAliasInit(NULL); if (ptr->lib == NULL) { - IPFW_WUNLOCK(&layer3_chain); + IPFW_WUNLOCK(&V_layer3_chain); free(ptr, M_IPFW); free(buf, M_IPFW); return (EINVAL); @@ -4699,7 +4732,7 @@ UNHOOK_NAT(ptr); flush_nat_ptrs(ser_n->id); } - IPFW_WUNLOCK(&layer3_chain); + IPFW_WUNLOCK(&V_layer3_chain); /* * Basic nat configuration. @@ -4725,9 +4758,9 @@ /* Add new entries. 
*/ add_redir_spool_cfg(&buf[(sizeof(struct cfg_nat))], ptr); free(buf, M_IPFW); - IPFW_WLOCK(&layer3_chain); - HOOK_NAT(&layer3_chain.nat, ptr); - IPFW_WUNLOCK(&layer3_chain); + IPFW_WLOCK(&V_layer3_chain); + HOOK_NAT(&V_layer3_chain.nat, ptr); + IPFW_WUNLOCK(&V_layer3_chain); } break; @@ -4737,16 +4770,16 @@ int i; error = sooptcopyin(sopt, &i, sizeof i, sizeof i); - IPFW_WLOCK(&layer3_chain); + IPFW_WLOCK(&V_layer3_chain); ptr = lookup_nat(i); if (ptr == NULL) { error = EINVAL; - IPFW_WUNLOCK(&layer3_chain); + IPFW_WUNLOCK(&V_layer3_chain); break; } UNHOOK_NAT(ptr); flush_nat_ptrs(i); - IPFW_WUNLOCK(&layer3_chain); + IPFW_WUNLOCK(&V_layer3_chain); del_redir_spool_cfg(ptr, &ptr->redir_chain); LibAliasUninit(ptr->lib); free(ptr, M_IPFW); @@ -4765,9 +4798,9 @@ off = sizeof(nat_cnt); data = malloc(NAT_BUF_LEN, M_IPFW, M_WAITOK | M_ZERO); - IPFW_RLOCK(&layer3_chain); + IPFW_RLOCK(&V_layer3_chain); /* Serialize all the data. */ - LIST_FOREACH(n, &layer3_chain.nat, _next) { + LIST_FOREACH(n, &V_layer3_chain.nat, _next) { nat_cnt++; if (off + SOF_NAT < NAT_BUF_LEN) { bcopy(n, &data[off], SOF_NAT); @@ -4796,12 +4829,12 @@ goto nospace; } bcopy(&nat_cnt, data, sizeof(nat_cnt)); - IPFW_RUNLOCK(&layer3_chain); + IPFW_RUNLOCK(&V_layer3_chain); error = sooptcopyout(sopt, data, NAT_BUF_LEN); free(data, M_IPFW); break; nospace: - IPFW_RUNLOCK(&layer3_chain); + IPFW_RUNLOCK(&V_layer3_chain); printf("serialized data buffer not big enough:" "please increase NAT_BUF_LEN\n"); free(data, M_IPFW); @@ -4818,16 +4851,16 @@ sof = LIBALIAS_BUF_SIZE; cnt = 0; - IPFW_RLOCK(&layer3_chain); + IPFW_RLOCK(&V_layer3_chain); size = i = 0; - LIST_FOREACH(ptr, &layer3_chain.nat, _next) { + LIST_FOREACH(ptr, &V_layer3_chain.nat, _next) { if (ptr->lib->logDesc == NULL) continue; cnt++; size = cnt * (sof + sizeof(int)); data = realloc(data, size, M_IPFW, M_NOWAIT | M_ZERO); if (data == NULL) { - IPFW_RUNLOCK(&layer3_chain); + IPFW_RUNLOCK(&V_layer3_chain); return (ENOSPC); } bcopy(&ptr->id, &data[i], 
sizeof(int)); @@ -4835,7 +4868,7 @@ bcopy(ptr->lib->logDesc, &data[i], sof); i += sof; } - IPFW_RUNLOCK(&layer3_chain); + IPFW_RUNLOCK(&V_layer3_chain); error = sooptcopyout(sopt, data, size); free(data, M_IPFW); } @@ -4865,13 +4898,16 @@ * every dyn_keepalive_period */ static void -ipfw_tick(void * __unused unused) +ipfw_tick(void *arg) { +#ifdef VIMAGE + struct vnet_ipfw *vnet_ipfw = arg; +#endif struct mbuf *m0, *m, *mnext, **mtailp; int i; ipfw_dyn_rule *q; - if (dyn_keepalive == 0 || ipfw_dyn_v == NULL || dyn_count == 0) + if (V_dyn_keepalive == 0 || V_ipfw_dyn_v == NULL || V_dyn_count == 0) goto done; /* @@ -4883,15 +4919,15 @@ m0 = NULL; mtailp = &m0; IPFW_DYN_LOCK(); - for (i = 0 ; i < curr_dyn_buckets ; i++) { - for (q = ipfw_dyn_v[i] ; q ; q = q->next ) { + for (i = 0 ; i < V_curr_dyn_buckets ; i++) { + for (q = V_ipfw_dyn_v[i] ; q ; q = q->next ) { if (q->dyn_type == O_LIMIT_PARENT) continue; if (q->id.proto != IPPROTO_TCP) continue; if ( (q->state & BOTH_SYN) != BOTH_SYN) continue; - if (TIME_LEQ( time_uptime+dyn_keepalive_interval, + if (TIME_LEQ( time_uptime + V_dyn_keepalive_interval, q->expire)) continue; /* too early */ if (TIME_LEQ(q->expire, time_uptime)) @@ -4914,37 +4950,40 @@ ip_output(m, NULL, NULL, 0, NULL, NULL); } done: - callout_reset(&ipfw_timeout, dyn_keepalive_period*hz, ipfw_tick, NULL); + callout_reset(&V_ipfw_timeout, V_dyn_keepalive_period * hz, + ipfw_tick, arg); } -int -ipfw_init(void) +static int vnet_ipfw_iattach(const void *unused) { + INIT_VNET_IPFW(curvnet); struct ip_fw default_rule; int error; -#ifdef INET6 - /* Setup IPv6 fw sysctl tree. 
*/ - sysctl_ctx_init(&ip6_fw_sysctl_ctx); - ip6_fw_sysctl_tree = SYSCTL_ADD_NODE(&ip6_fw_sysctl_ctx, - SYSCTL_STATIC_CHILDREN(_net_inet6_ip6), OID_AUTO, "fw", - CTLFLAG_RW | CTLFLAG_SECURE, 0, "Firewall"); - SYSCTL_ADD_PROC(&ip6_fw_sysctl_ctx, SYSCTL_CHILDREN(ip6_fw_sysctl_tree), - OID_AUTO, "enable", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, - &fw6_enable, 0, ipfw_chg_hook, "I", "Enable ipfw+6"); - SYSCTL_ADD_INT(&ip6_fw_sysctl_ctx, SYSCTL_CHILDREN(ip6_fw_sysctl_tree), - OID_AUTO, "deny_unknown_exthdrs", CTLFLAG_RW | CTLFLAG_SECURE, - &fw_deny_unknown_exthdrs, 0, - "Deny packets with unknown IPv6 Extension Headers"); -#endif + V_fw_debug = 1; + V_autoinc_step = 100; /* bounded to 1..1000 in add_rule() */ + V_dyn_buckets = 256; /* must be power of 2 */ + V_curr_dyn_buckets = 256; /* must be power of 2 */ + V_dyn_ack_lifetime = 300; + V_dyn_syn_lifetime = 20; + V_dyn_fin_lifetime = 1; + V_dyn_rst_lifetime = 1; + V_dyn_udp_lifetime = 10; + V_dyn_short_lifetime = 5; + V_dyn_keepalive_interval = 20; + V_dyn_keepalive_period = 5; + V_dyn_keepalive = 1; /* do send keepalives */ + V_dyn_max = 4096; /* max # of dynamic rules */ + V_fw_deny_unknown_exthdrs = 1; - layer3_chain.rules = NULL; - IPFW_LOCK_INIT(&layer3_chain); + V_layer3_chain.rules = NULL; + IPFW_LOCK_INIT(&V_layer3_chain); +#if 0 /* XXX Marko fix this! 
*/ ipfw_dyn_rule_zone = uma_zcreate("IPFW dynamic rule", sizeof(ipfw_dyn_rule), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); - IPFW_DYN_LOCK_INIT(); - callout_init(&ipfw_timeout, CALLOUT_MPSAFE); +#endif + callout_init(&V_ipfw_timeout, CALLOUT_MPSAFE); bzero(&default_rule, sizeof default_rule); @@ -4960,17 +4999,66 @@ #endif O_DENY; - error = add_rule(&layer3_chain, &default_rule); + error = add_rule(&V_layer3_chain, &default_rule); if (error != 0) { printf("ipfw2: error %u initializing default rule " "(support disabled)\n", error); IPFW_DYN_LOCK_DESTROY(); - IPFW_LOCK_DESTROY(&layer3_chain); + IPFW_LOCK_DESTROY(&V_layer3_chain); uma_zdestroy(ipfw_dyn_rule_zone); return (error); } - ip_fw_default_rule = layer3_chain.rules; + ip_fw_default_rule = V_layer3_chain.rules; + +#ifdef IPFIREWALL_VERBOSE + V_fw_verbose = 1; +#endif +#ifdef IPFIREWALL_VERBOSE_LIMIT + V_verbose_limit = IPFIREWALL_VERBOSE_LIMIT; +#endif + + error = init_tables(&V_layer3_chain); + if (error) { + panic("init_tables"); /* XXX Marko fix this ! */ + } +#ifdef VIMAGE + callout_reset(&V_ipfw_timeout, hz, ipfw_tick, (void *) vnet_ipfw); +#else + callout_reset(&V_ipfw_timeout, hz, ipfw_tick, NULL); +#endif + +#ifdef IPFIREWALL_NAT + LIST_INIT(&V_layer3_chain.nat); +#endif + + return 0; +} + +int +ipfw_init(void) +{ + ipfw_dyn_rule_zone = uma_zcreate("IPFW dynamic rule", + sizeof(ipfw_dyn_rule), NULL, NULL, NULL, NULL, + UMA_ALIGN_PTR, 0); + IPFW_DYN_LOCK_INIT(); + +#if 0 /* MARKO XXX */ + /* error = init_tables(&V_layer3_chain); moved to _iattach() */ + if (error) { + IPFW_DYN_LOCK_DESTROY(); + IPFW_LOCK_DESTROY(&V_layer3_chain); + uma_zdestroy(ipfw_dyn_rule_zone); + return (error); + } +#endif + +#ifdef VIMAGE + vnet_mod_register(&vnet_ipfw_modinfo); +#else + vnet_ipfw_iattach(NULL); +#endif + printf("ipfw2 " #ifdef INET6 "(+ipv6) " @@ -4988,78 +5076,82 @@ #else "loadable", #endif - default_rule.cmd[0].opcode == O_ACCEPT ? 
"accept" : "deny"); +#ifdef IPFIREWALL_DEFAULT_TO_ACCEPT + "accept" +#else + "deny" +#endif + ); #ifdef IPFIREWALL_VERBOSE - fw_verbose = 1; -#endif -#ifdef IPFIREWALL_VERBOSE_LIMIT - verbose_limit = IPFIREWALL_VERBOSE_LIMIT; -#endif - if (fw_verbose == 0) printf("disabled\n"); - else if (verbose_limit == 0) +#else +# ifndef IPFIREWALL_VERBOSE_LIMIT printf("unlimited\n"); - else +# else printf("limited to %d packets/entry by default\n", - verbose_limit); + IPFIREWALL_VERBOSE_LIMIT); +# endif +#endif - error = init_tables(&layer3_chain); - if (error) { - IPFW_DYN_LOCK_DESTROY(); - IPFW_LOCK_DESTROY(&layer3_chain); - uma_zdestroy(ipfw_dyn_rule_zone); - return (error); - } ip_fw_ctl_ptr = ipfw_ctl; ip_fw_chk_ptr = ipfw_chk; - callout_reset(&ipfw_timeout, hz, ipfw_tick, NULL); #ifdef IPFIREWALL_NAT - LIST_INIT(&layer3_chain.nat); ifaddr_event_tag = EVENTHANDLER_REGISTER(ifaddr_event, ifaddr_change, NULL, EVENTHANDLER_PRI_ANY); #endif return (0); } -void -ipfw_destroy(void) +static int vnet_ipfw_idetach(const void *unused) { + INIT_VNET_IPFW(curvnet); struct ip_fw *reap; #ifdef IPFIREWALL_NAT struct cfg_nat *ptr, *ptr_temp; #endif - ip_fw_chk_ptr = NULL; - ip_fw_ctl_ptr = NULL; - callout_drain(&ipfw_timeout); - IPFW_WLOCK(&layer3_chain); - flush_tables(&layer3_chain); + callout_drain(&V_ipfw_timeout); + IPFW_WLOCK(&V_layer3_chain); + flush_tables(&V_layer3_chain); #ifdef IPFIREWALL_NAT - LIST_FOREACH_SAFE(ptr, &layer3_chain.nat, _next, ptr_temp) { + LIST_FOREACH_SAFE(ptr, &V_layer3_chain.nat, _next, ptr_temp) { LIST_REMOVE(ptr, _next); del_redir_spool_cfg(ptr, &ptr->redir_chain); LibAliasUninit(ptr->lib); free(ptr, M_IPFW); } - EVENTHANDLER_DEREGISTER(ifaddr_event, ifaddr_event_tag); #endif - layer3_chain.reap = NULL; - free_chain(&layer3_chain, 1 /* kill default rule */); - reap = layer3_chain.reap, layer3_chain.reap = NULL; - IPFW_WUNLOCK(&layer3_chain); + V_layer3_chain.reap = NULL; + free_chain(&V_layer3_chain, 1 /* kill default rule */); + reap = 
V_layer3_chain.reap, V_layer3_chain.reap = NULL; + IPFW_WUNLOCK(&V_layer3_chain); if (reap != NULL) reap_rules(reap); - IPFW_DYN_LOCK_DESTROY(); - uma_zdestroy(ipfw_dyn_rule_zone); - if (ipfw_dyn_v != NULL) - free(ipfw_dyn_v, M_IPFW); - IPFW_LOCK_DESTROY(&layer3_chain); + IPFW_LOCK_DESTROY(&V_layer3_chain); + if (V_ipfw_dyn_v != NULL) + free(V_ipfw_dyn_v, M_IPFW); -#ifdef INET6 - /* Free IPv6 fw sysctl tree. */ - sysctl_ctx_free(&ip6_fw_sysctl_ctx); + return 0; +} + +void +ipfw_destroy(void) +{ + ip_fw_chk_ptr = NULL; + ip_fw_ctl_ptr = NULL; + +#ifdef VIMAGE + vnet_mod_deregister(&vnet_ipfw_modinfo); +#else + vnet_ipfw_idetach(NULL); +#endif + +#ifdef IPFIREWALL_NAT + EVENTHANDLER_DEREGISTER(ifaddr_event, ifaddr_event_tag); #endif + IPFW_DYN_LOCK_DESTROY(); + uma_zdestroy(ipfw_dyn_rule_zone); printf("IP firewall unloaded\n"); } --- /u/marko/p4/head/src/sys/netinet/ip_fw_pfil.c 2007-11-13 02:49:10.000000000 +0100 +++ src/sys/netinet/ip_fw_pfil.c 2007-12-10 11:26:11.000000000 +0100 @@ -36,6 +36,7 @@ #endif /* INET */ #endif /* KLD_MODULE */ #include "opt_inet6.h" +#include "opt_vimage.h" #include #include @@ -46,7 +47,9 @@ #include #include #include +#include #include +#include #include #include @@ -65,12 +68,18 @@ #include +#ifndef VIMAGE int fw_enable = 1; -#ifdef INET6 +# ifdef INET6 int fw6_enable = 1; +# endif #endif +#ifdef VIMAGE +int ipfw_chg_hook(SYSCTL_HANDLER_V_ARGS); +#else int ipfw_chg_hook(SYSCTL_HANDLER_ARGS); +#endif /* Dummynet hooks. 
*/ ip_dn_ruledel_t *ip_dn_ruledel_ptr = NULL; @@ -484,8 +493,16 @@ #endif /* INET6 */ int +#ifdef VIMAGE +ipfw_chg_hook(SYSCTL_HANDLER_V_ARGS) +#else ipfw_chg_hook(SYSCTL_HANDLER_ARGS) +#endif { +#ifdef VIMAGE + INIT_VNET_IPFW(curvnet); + SYSCTL_RESOLVE_V_ARG1(); +#endif int enable = *(int *)arg1; int error; @@ -498,14 +515,14 @@ if (enable == *(int *)arg1) return (0); - if (arg1 == &fw_enable) { + if (arg1 == &V_fw_enable) { if (enable) error = ipfw_hook(); else error = ipfw_unhook(); } #ifdef INET6 - if (arg1 == &fw6_enable) { + if (arg1 == &V_fw6_enable) { if (enable) error = ipfw6_hook(); else --- /u/marko/p4/head/src/sys/netinet/ip_icmp.c 2007-10-29 17:17:43.000000000 +0100 +++ src/sys/netinet/ip_icmp.c 2007-12-10 11:26:11.000000000 +0100 @@ -34,6 +34,7 @@ #include "opt_ipsec.h" #include "opt_mac.h" +#include "opt_vimage.h" #include #include @@ -43,11 +44,14 @@ #include #include #include +#include +#include #include #include #include +#include #include #include #include @@ -76,9 +80,11 @@ * host table maintenance routines. */ +#ifndef VIMAGE struct icmpstat icmpstat; -SYSCTL_STRUCT(_net_inet_icmp, ICMPCTL_STATS, stats, CTLFLAG_RW, - &icmpstat, icmpstat, ""); +#endif +SYSCTL_V_STRUCT(V_NET, vnet_inet, _net_inet_icmp, ICMPCTL_STATS, stats, + CTLFLAG_RW, icmpstat, icmpstat, ""); static int icmpmaskrepl = 0; SYSCTL_INT(_net_inet_icmp, ICMPCTL_MASKREPL, maskrepl, CTLFLAG_RW, @@ -143,6 +149,7 @@ void icmp_error(struct mbuf *n, int type, int code, n_long dest, int mtu) { + INIT_VNET_INET(curvnet); register struct ip *oip = mtod(n, struct ip *), *nip; register unsigned oiphlen = oip->ip_hl << 2; register struct icmp *icp; @@ -155,7 +162,7 @@ printf("icmp_error(%p, %x, %d)\n", oip, type, code); #endif if (type != ICMP_REDIRECT) - icmpstat.icps_error++; + V_icmpstat.icps_error++; /* * Don't send error: * if the original packet was encrypted. 
@@ -172,7 +179,7 @@ if (oip->ip_p == IPPROTO_ICMP && type != ICMP_REDIRECT && n->m_len >= oiphlen + ICMP_MINLEN && !ICMP_INFOTYPE(((struct icmp *)((caddr_t)oip + oiphlen))->icmp_type)) { - icmpstat.icps_oldicmp++; + V_icmpstat.icps_oldicmp++; goto freeit; } /* Drop if IP header plus 8 bytes is not contignous in first mbuf. */ @@ -228,7 +235,7 @@ m->m_len = ICMP_MINLEN + icmplen; icp = mtod(m, struct icmp *); - icmpstat.icps_outhist[type]++; + V_icmpstat.icps_outhist[type]++; icp->icmp_type = type; if (type == ICMP_REDIRECT) icp->icmp_gwaddr.s_addr = dest; @@ -287,6 +294,7 @@ void icmp_input(struct mbuf *m, int off) { + INIT_VNET_INET(curvnet); struct icmp *icp; struct in_ifaddr *ia; struct ip *ip = mtod(m, struct ip *); @@ -309,12 +317,12 @@ } #endif if (icmplen < ICMP_MINLEN) { - icmpstat.icps_tooshort++; + V_icmpstat.icps_tooshort++; goto freeit; } i = hlen + min(icmplen, ICMP_ADVLENMIN); if (m->m_len < i && (m = m_pullup(m, i)) == 0) { - icmpstat.icps_tooshort++; + V_icmpstat.icps_tooshort++; return; } ip = mtod(m, struct ip *); @@ -322,7 +330,7 @@ m->m_data += hlen; icp = mtod(m, struct icmp *); if (in_cksum(m, icmplen)) { - icmpstat.icps_checksum++; + V_icmpstat.icps_checksum++; goto freeit; } m->m_len += hlen; @@ -364,7 +372,7 @@ icmpgw.sin_len = sizeof(struct sockaddr_in); icmpgw.sin_family = AF_INET; - icmpstat.icps_inhist[icp->icmp_type]++; + V_icmpstat.icps_inhist[icp->icmp_type]++; code = icp->icmp_code; switch (icp->icmp_type) { @@ -429,7 +437,7 @@ */ if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) || icp->icmp_ip.ip_hl < (sizeof(struct ip) >> 2)) { - icmpstat.icps_badlen++; + V_icmpstat.icps_badlen++; goto freeit; } icp->icmp_ip.ip_len = ntohs(icp->icmp_ip.ip_len); @@ -452,13 +460,13 @@ break; badcode: - icmpstat.icps_badcode++; + V_icmpstat.icps_badcode++; break; case ICMP_ECHO: if (!icmpbmcastecho && (m->m_flags & (M_MCAST | M_BCAST)) != 0) { - icmpstat.icps_bmcastecho++; + V_icmpstat.icps_bmcastecho++; break; } icp->icmp_type = 
ICMP_ECHOREPLY; @@ -470,11 +478,11 @@ case ICMP_TSTAMP: if (!icmpbmcastecho && (m->m_flags & (M_MCAST | M_BCAST)) != 0) { - icmpstat.icps_bmcasttstamp++; + V_icmpstat.icps_bmcasttstamp++; break; } if (icmplen < ICMP_TSLEN) { - icmpstat.icps_badlen++; + V_icmpstat.icps_badlen++; break; } icp->icmp_type = ICMP_TSTAMPREPLY; @@ -523,8 +531,8 @@ } reflect: ip->ip_len += hlen; /* since ip_input deducts this */ - icmpstat.icps_reflect++; - icmpstat.icps_outhist[icp->icmp_type]++; + V_icmpstat.icps_reflect++; + V_icmpstat.icps_outhist[icp->icmp_type]++; icmp_reflect(m); return; @@ -548,13 +556,13 @@ * RFC1812 says we must ignore ICMP redirects if we * are acting as router. */ - if (drop_redirect || ipforwarding) + if (drop_redirect || V_ipforwarding) break; if (code > 3) goto badcode; if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) || icp->icmp_ip.ip_hl < (sizeof(struct ip) >> 2)) { - icmpstat.icps_badlen++; + V_icmpstat.icps_badlen++; break; } /* @@ -614,6 +622,7 @@ static void icmp_reflect(struct mbuf *m) { + INIT_VNET_INET(curvnet); struct ip *ip = mtod(m, struct ip *); struct ifaddr *ifa; struct ifnet *ifn; @@ -626,7 +635,7 @@ ((ntohl(ip->ip_src.s_addr) & IN_CLASSA_NET) != (IN_LOOPBACKNET << IN_CLASSA_NSHIFT))) { m_freem(m); /* Bad return address */ - icmpstat.icps_badaddr++; + V_icmpstat.icps_badaddr++; goto done; /* Ip_output() will check for broadcast */ } t = ip->ip_dst; @@ -694,7 +703,7 @@ ia = ip_rtaddr(ip->ip_dst); if (ia == NULL) { m_freem(m); - icmpstat.icps_noroute++; + V_icmpstat.icps_noroute++; goto done; } match: @@ -703,7 +712,7 @@ #endif t = IA_SIN(ia)->sin_addr; ip->ip_src = t; - ip->ip_ttl = ip_defttl; + ip->ip_ttl = V_ip_defttl; if (optlen > 0) { register u_char *cp; --- /u/marko/p4/head/src/sys/netinet/ip_input.c 2007-12-03 11:00:09.000000000 +0100 +++ src/sys/netinet/ip_input.c 2007-12-10 11:26:11.000000000 +0100 @@ -38,6 +38,7 @@ #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_carp.h" +#include "opt_vimage.h" #include #include 
@@ -51,7 +52,9 @@ #include #include #include +#include +#include #include #include #include @@ -60,6 +63,7 @@ #include #include +#include #include #include #include @@ -84,34 +88,38 @@ #include -int rsvp_on = 0; +#ifndef VIMAGE +int rsvp_on; +int ipforwarding; +static int ipsendredirects; +int ip_defttl; +static int ip_keepfaith; +static int ip_sendsourcequench; +int ip_do_randomid; +static int ip_checkinterface; +#endif -int ipforwarding = 0; -SYSCTL_INT(_net_inet_ip, IPCTL_FORWARDING, forwarding, CTLFLAG_RW, - &ipforwarding, 0, "Enable IP forwarding between interfaces"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_FORWARDING, + forwarding, CTLFLAG_RW, ipforwarding, 0, + "Enable IP forwarding between interfaces"); -static int ipsendredirects = 1; /* XXX */ -SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_RW, - &ipsendredirects, 0, "Enable sending IP redirects"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_SENDREDIRECTS, + redirect, CTLFLAG_RW, ipsendredirects, 0, + "Enable sending IP redirects"); -int ip_defttl = IPDEFTTL; -SYSCTL_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_RW, - &ip_defttl, 0, "Maximum TTL on IP packets"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_DEFTTL, + ttl, CTLFLAG_RW, ip_defttl, 0, "Maximum TTL on IP packets"); -static int ip_keepfaith = 0; -SYSCTL_INT(_net_inet_ip, IPCTL_KEEPFAITH, keepfaith, CTLFLAG_RW, - &ip_keepfaith, 0, +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_KEEPFAITH, + keepfaith, CTLFLAG_RW, ip_keepfaith, 0, "Enable packet capture for FAITH IPv4->IPv6 translater daemon"); -static int ip_sendsourcequench = 0; -SYSCTL_INT(_net_inet_ip, OID_AUTO, sendsourcequench, CTLFLAG_RW, - &ip_sendsourcequench, 0, +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, + sendsourcequench, CTLFLAG_RW, ip_sendsourcequench, 0, "Enable the transmission of source quench packets"); -int ip_do_randomid = 0; -SYSCTL_INT(_net_inet_ip, OID_AUTO, random_id, CTLFLAG_RW, - &ip_do_randomid, 0, - "Assign 
random ip_id values"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, random_id, + CTLFLAG_RW, ip_do_randomid, 0, "Assign random ip_id values"); /* * XXX - Setting ip_checkinterface mostly implements the receive side of @@ -126,9 +134,9 @@ * to the loopback interface instead of the interface where the * packets for those addresses are received. */ -static int ip_checkinterface = 0; -SYSCTL_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_RW, - &ip_checkinterface, 0, "Verify packet arrives on correct interface"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, + check_interface, CTLFLAG_RW, ip_checkinterface, 0, + "Verify packet arrives on correct interface"); struct pfil_head inet_pfil_hook; /* Packet filter hooks */ @@ -138,9 +146,11 @@ extern struct domain inetdomain; extern struct protosw inetsw[]; u_char ip_protox[IPPROTO_MAX]; +#ifndef VIMAGE struct in_ifaddrhead in_ifaddrhead; /* first inet address */ struct in_ifaddrhashhead *in_ifaddrhashtbl; /* inet addr hash table */ u_long in_ifaddrhmask; /* mask for hash table */ +#endif SYSCTL_INT(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_queue_maxlen, CTLFLAG_RW, &ipintrq.ifq_maxlen, 0, "Maximum size of the IP input queue"); @@ -148,22 +158,20 @@ &ipintrq.ifq_drops, 0, "Number of packets dropped from the IP input queue"); +#ifndef VIMAGE struct ipstat ipstat; -SYSCTL_STRUCT(_net_inet_ip, IPCTL_STATS, stats, CTLFLAG_RW, - &ipstat, ipstat, "IP statistics (struct ipstat, netinet/ip_var.h)"); - -/* - * IP datagram reassembly. 
- */ -#define IPREASS_NHASH_LOG2 6 -#define IPREASS_NHASH (1 << IPREASS_NHASH_LOG2) -#define IPREASS_HMASK (IPREASS_NHASH - 1) -#define IPREASS_HASH(x,y) \ - (((((x) & 0xF) | ((((x) >> 8) & 0xF) << 4)) ^ (y)) & IPREASS_HMASK) +#endif +SYSCTL_V_STRUCT(V_NET, vnet_inet, _net_inet_ip, IPCTL_STATS, stats, CTLFLAG_RW, + ipstat, ipstat, "IP statistics (struct ipstat, netinet/ip_var.h)"); -static uma_zone_t ipq_zone; -static TAILQ_HEAD(ipqhead, ipq) ipq[IPREASS_NHASH]; static struct mtx ipqlock; +#ifndef VIMAGE +static TAILQ_HEAD(ipqhead, ipq) ipq[IPREASS_NHASH]; +static uma_zone_t ipq_zone; +static int nipq; +static int maxnipq; +static int maxfragsperpacket; +#endif #define IPQ_LOCK() mtx_lock(&ipqlock) #define IPQ_UNLOCK() mtx_unlock(&ipqlock) @@ -173,14 +181,12 @@ static void maxnipq_update(void); static void ipq_zone_change(void *); -static int maxnipq; /* Administrative limit on # reass queues. */ -static int nipq = 0; /* Total # of reass queues */ -SYSCTL_INT(_net_inet_ip, OID_AUTO, fragpackets, CTLFLAG_RD, - &nipq, 0, "Current number of IPv4 fragment reassembly queue entries"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, fragpackets, + CTLFLAG_RD, nipq, 0, + "Current number of IPv4 fragment reassembly queue entries"); -static int maxfragsperpacket; -SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_RW, - &maxfragsperpacket, 0, +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, maxfragsperpacket, + CTLFLAG_RW, maxfragsperpacket, 0, "Maximum number of IPv4 fragments allowed per packet"); struct callout ipport_tick_callout; @@ -191,9 +197,11 @@ #endif #ifdef IPSTEALTH -int ipstealth = 0; -SYSCTL_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW, - &ipstealth, 0, "IP stealth mode, no TTL decrementation on forwarding"); +#ifndef VIMAGE +int ipstealth; +#endif +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW, + ipstealth, 0, "IP stealth mode, no TTL decrementation on forwarding"); #endif /* @@ -206,6 +214,19 @@ 
static void ip_freef(struct ipqhead *, struct ipq *); +#ifdef VIMAGE +static void vnet_inet_register(void); + +VNET_MOD_DECLARE(INET, inet, NULL, NULL, NET, NULL) + +static void vnet_inet_register() +{ + vnet_mod_register(&vnet_inet_modinfo); +} + +SYSINIT(inet, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST, vnet_inet_register, 0); +#endif + /* * IP initialization: fill in IP protocol switch table. * All protocols not implemented in kernel go to raw IP protocol handler. @@ -213,11 +234,58 @@ void ip_init(void) { + INIT_VNET_INET(curvnet); struct protosw *pr; int i; - TAILQ_INIT(&in_ifaddrhead); - in_ifaddrhashtbl = hashinit(INADDR_NHASH, M_IFADDR, &in_ifaddrhmask); + TAILQ_INIT(&V_in_ifaddrhead); + V_in_ifaddrhashtbl = hashinit(INADDR_NHASH, M_IFADDR, + &V_in_ifaddrhmask); + + /* Initialize IP reassembly queue. */ + for (i = 0; i < IPREASS_NHASH; i++) + TAILQ_INIT(&V_ipq[i]); + V_nipq = 0; + V_maxnipq = nmbclusters / 32; + V_maxfragsperpacket = 16; + V_ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL, + NULL, UMA_ALIGN_PTR, 0); + maxnipq_update(); + + V_ipsendredirects = 1; /* XXX */ + V_ip_defttl = IPDEFTTL; +#ifdef IPSTEALTH + V_ipstealth = 0; +#endif + + V_ipport_lowfirstauto = IPPORT_RESERVED - 1; /* 1023 */ + V_ipport_lowlastauto = IPPORT_RESERVEDSTART; /* 600 */ + V_ipport_firstauto = IPPORT_HIFIRSTAUTO; /* 49152 */ + V_ipport_lastauto = IPPORT_HILASTAUTO; /* 65535 */ + V_ipport_hifirstauto = IPPORT_HIFIRSTAUTO; /* 49152 */ + V_ipport_hilastauto = IPPORT_HILASTAUTO; /* 65535 */ + V_ipport_reservedhigh = IPPORT_RESERVED - 1; /* 1023 */ + V_ipport_reservedlow = 0; + V_ipport_randomized = 1; /* user controlled via sysctl */ + V_ipport_randomcps = 10; /* user controlled via sysctl */ + V_ipport_randomtime = 45; /* user controlled via sysctl */ + V_ipport_stoprandom = 0; /* toggled by ipport_tick */ + + V_rsvp_on = 0; + V_ipforwarding = 0; + V_ipsendredirects = 1; /* XXX */ + V_ip_defttl = IPDEFTTL; + V_ip_keepfaith = 0; + V_ip_sendsourcequench = 0; + 
V_ip_do_randomid = 0; + V_ip_checkinterface = 0; + +#ifdef VIMAGE + /* Skip initialization of globals for non-default instances. */ + if (!IS_DEFAULT_VNET(curvnet)) + return; +#endif + pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) panic("ip_init: PF_INET not found"); @@ -245,26 +313,17 @@ printf("%s: WARNING: unable to register pfil hook, " "error %d\n", __func__, i); - /* Initialize IP reassembly queue. */ - IPQ_LOCK_INIT(); - for (i = 0; i < IPREASS_NHASH; i++) - TAILQ_INIT(&ipq[i]); - maxnipq = nmbclusters / 32; - maxfragsperpacket = 16; - ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL, - NULL, UMA_ALIGN_PTR, 0); - maxnipq_update(); - /* Start ipport_tick. */ callout_init(&ipport_tick_callout, CALLOUT_MPSAFE); - ipport_tick(NULL); + callout_reset(&ipport_tick_callout, 1, ipport_tick, NULL); EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL, SHUTDOWN_PRI_DEFAULT); EVENTHANDLER_REGISTER(nmbclusters_change, ipq_zone_change, NULL, EVENTHANDLER_PRI_ANY); /* Initialize various other remaining things. 
*/ - ip_id = time_second & 0xffff; + IPQ_LOCK_INIT(); + V_ip_id = time_second & 0xffff; ipintrq.ifq_maxlen = ipqmaxlen; mtx_init(&ipintrq.ifq_mtx, "ip_inq", NULL, MTX_DEF); netisr_register(NETISR_IP, ip_input, &ipintrq, NETISR_MPSAFE); @@ -284,6 +343,7 @@ void ip_input(struct mbuf *m) { + INIT_VNET_INET(curvnet); struct ip *ip = NULL; struct in_ifaddr *ia = NULL; struct ifaddr *ifa; @@ -306,31 +366,31 @@ goto ours; } - ipstat.ips_total++; + V_ipstat.ips_total++; if (m->m_pkthdr.len < sizeof(struct ip)) goto tooshort; if (m->m_len < sizeof (struct ip) && (m = m_pullup(m, sizeof (struct ip))) == NULL) { - ipstat.ips_toosmall++; + V_ipstat.ips_toosmall++; return; } ip = mtod(m, struct ip *); if (ip->ip_v != IPVERSION) { - ipstat.ips_badvers++; + V_ipstat.ips_badvers++; goto bad; } hlen = ip->ip_hl << 2; if (hlen < sizeof(struct ip)) { /* minimum header length */ - ipstat.ips_badhlen++; + V_ipstat.ips_badhlen++; goto bad; } if (hlen > m->m_len) { if ((m = m_pullup(m, hlen)) == NULL) { - ipstat.ips_badhlen++; + V_ipstat.ips_badhlen++; return; } ip = mtod(m, struct ip *); @@ -340,7 +400,7 @@ if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { if ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) == 0) { - ipstat.ips_badaddr++; + V_ipstat.ips_badaddr++; goto bad; } } @@ -355,7 +415,7 @@ } } if (sum) { - ipstat.ips_badsum++; + V_ipstat.ips_badsum++; goto bad; } @@ -370,7 +430,7 @@ */ ip->ip_len = ntohs(ip->ip_len); if (ip->ip_len < hlen) { - ipstat.ips_badlen++; + V_ipstat.ips_badlen++; goto bad; } ip->ip_off = ntohs(ip->ip_off); @@ -383,7 +443,7 @@ */ if (m->m_pkthdr.len < ip->ip_len) { tooshort: - ipstat.ips_tooshort++; + V_ipstat.ips_tooshort++; goto bad; } if (m->m_pkthdr.len > ip->ip_len) { @@ -455,7 +515,7 @@ * anywhere else. Also checks if the rsvp daemon is running before * grabbing the packet. 
*/ - if (rsvp_on && ip->ip_p==IPPROTO_RSVP) + if (V_rsvp_on && ip->ip_p==IPPROTO_RSVP) goto ours; /* @@ -464,7 +524,7 @@ * we receive might be for us (and let the upper layers deal * with it). */ - if (TAILQ_EMPTY(&in_ifaddrhead) && + if (TAILQ_EMPTY(&V_in_ifaddrhead) && (m->m_flags & (M_MCAST|M_BCAST)) == 0) goto ours; @@ -486,7 +546,7 @@ * insert a workaround. If the packet got here, we already * checked with carp_iamatch() and carp_forus(). */ - checkif = ip_checkinterface && (ipforwarding == 0) && + checkif = V_ip_checkinterface && (V_ipforwarding == 0) && m->m_pkthdr.rcvif != NULL && ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) == 0) && #ifdef DEV_CARP @@ -534,13 +594,13 @@ } /* RFC 3927 2.7: Do not forward datagrams for 169.254.0.0/16. */ if (IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))) { - ipstat.ips_cantforward++; + V_ipstat.ips_cantforward++; m_freem(m); return; } if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { struct in_multi *inm; - if (ip_mrouter) { + if (V_ip_mrouter) { /* * If we are acting as a multicast router, all * incoming multicast packets are passed to the @@ -551,7 +611,7 @@ */ if (ip_mforward && ip_mforward(ip, m->m_pkthdr.rcvif, m, 0) != 0) { - ipstat.ips_cantforward++; + V_ipstat.ips_cantforward++; m_freem(m); return; } @@ -563,7 +623,7 @@ */ if (ip->ip_p == IPPROTO_IGMP) goto ours; - ipstat.ips_forward++; + V_ipstat.ips_forward++; } /* * See if we belong to the destination multicast group on the @@ -573,7 +633,7 @@ IN_LOOKUP_MULTI(ip->ip_dst, m->m_pkthdr.rcvif, inm); IN_MULTI_UNLOCK(); if (inm == NULL) { - ipstat.ips_notmember++; + V_ipstat.ips_notmember++; m_freem(m); return; } @@ -588,7 +648,7 @@ * FAITH(Firewall Aided Internet Translator) */ if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type == IFT_FAITH) { - if (ip_keepfaith) { + if (V_ip_keepfaith) { if (ip->ip_p == IPPROTO_TCP || ip->ip_p == IPPROTO_ICMP) goto ours; } @@ -599,8 +659,8 @@ /* * Not for us; forward if possible and desirable. 
*/ - if (ipforwarding == 0) { - ipstat.ips_cantforward++; + if (V_ipforwarding == 0) { + V_ipstat.ips_cantforward++; m_freem(m); } else { #ifdef IPSEC @@ -660,7 +720,7 @@ /* * Switch out to protocol's input routine. */ - ipstat.ips_delivered++; + V_ipstat.ips_delivered++; (*inetsw[ip_protox[ip->ip_p]].pr_input)(m, hlen); return; @@ -676,32 +736,34 @@ static void maxnipq_update(void) { + INIT_VNET_INET(curvnet); /* * -1 for unlimited allocation. */ - if (maxnipq < 0) - uma_zone_set_max(ipq_zone, 0); + if (V_maxnipq < 0) + uma_zone_set_max(V_ipq_zone, 0); /* * Positive number for specific bound. */ - if (maxnipq > 0) - uma_zone_set_max(ipq_zone, maxnipq); + if (V_maxnipq > 0) + uma_zone_set_max(V_ipq_zone, V_maxnipq); /* * Zero specifies no further fragment queue allocation -- set the * bound very low, but rely on implementation elsewhere to actually * prevent allocation and reclaim current queues. */ - if (maxnipq == 0) - uma_zone_set_max(ipq_zone, 1); + if (V_maxnipq == 0) + uma_zone_set_max(V_ipq_zone, 1); } static void ipq_zone_change(void *tag) { + INIT_VNET_INET(curvnet); - if (maxnipq > 0 && maxnipq < (nmbclusters / 32)) { - maxnipq = nmbclusters / 32; + if (V_maxnipq > 0 && V_maxnipq < (nmbclusters / 32)) { + V_maxnipq = nmbclusters / 32; maxnipq_update(); } } @@ -709,9 +771,10 @@ static int sysctl_maxnipq(SYSCTL_HANDLER_ARGS) { + INIT_VNET_INET(curvnet); int error, i; - i = maxnipq; + i = V_maxnipq; error = sysctl_handle_int(oidp, &i, 0, req); if (error || !req->newptr) return (error); @@ -722,7 +785,7 @@ */ if (i < -1) return (EINVAL); - maxnipq = i; + V_maxnipq = i; maxnipq_update(); return (0); } @@ -744,6 +807,7 @@ struct mbuf * ip_reass(struct mbuf *m) { + INIT_VNET_INET(curvnet); struct ip *ip; struct mbuf *p, *q, *nq, *t; struct ipq *fp = NULL; @@ -753,9 +817,9 @@ u_short hash; /* If maxnipq or maxfragsperpacket are 0, never accept fragments. 
*/ - if (maxnipq == 0 || maxfragsperpacket == 0) { - ipstat.ips_fragments++; - ipstat.ips_fragdropped++; + if (V_maxnipq == 0 || V_maxfragsperpacket == 0) { + V_ipstat.ips_fragments++; + V_ipstat.ips_fragdropped++; m_freem(m); return (NULL); } @@ -764,7 +828,7 @@ hlen = ip->ip_hl << 2; hash = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id); - head = &ipq[hash]; + head = &V_ipq[hash]; IPQ_LOCK(); /* @@ -787,7 +851,7 @@ * Attempt to trim the number of allocated fragment queues if it * exceeds the administrative limit. */ - if ((nipq > maxnipq) && (maxnipq > 0)) { + if ((V_nipq > V_maxnipq) && (V_maxnipq > 0)) { /* * drop something from the tail of the current queue * before proceeding further @@ -795,15 +859,16 @@ struct ipq *q = TAILQ_LAST(head, ipqhead); if (q == NULL) { /* gak */ for (i = 0; i < IPREASS_NHASH; i++) { - struct ipq *r = TAILQ_LAST(&ipq[i], ipqhead); + struct ipq *r = TAILQ_LAST(&V_ipq[i], ipqhead); if (r) { - ipstat.ips_fragtimeout += r->ipq_nfrags; - ip_freef(&ipq[i], r); + V_ipstat.ips_fragtimeout += + r->ipq_nfrags; + ip_freef(&V_ipq[i], r); break; } } } else { - ipstat.ips_fragtimeout += q->ipq_nfrags; + V_ipstat.ips_fragtimeout += q->ipq_nfrags; ip_freef(head, q); } } @@ -820,7 +885,7 @@ * that's a non-zero multiple of 8 bytes. */ if (ip->ip_len == 0 || (ip->ip_len & 0x7) != 0) { - ipstat.ips_toosmall++; /* XXX */ + V_ipstat.ips_toosmall++; /* XXX */ goto dropfrag; } m->m_flags |= M_FRAG; @@ -833,7 +898,7 @@ * Attempt reassembly; if it succeeds, proceed. * ip_reass() will return a different mbuf. */ - ipstat.ips_fragments++; + V_ipstat.ips_fragments++; m->m_pkthdr.header = ip; /* Previous ip_reass() started here. */ @@ -848,19 +913,19 @@ * If first fragment to arrive, create a reassembly queue. 
*/ if (fp == NULL) { - fp = uma_zalloc(ipq_zone, M_NOWAIT); + fp = uma_zalloc(V_ipq_zone, M_NOWAIT); if (fp == NULL) goto dropfrag; #ifdef MAC if (mac_ipq_init(fp, M_NOWAIT) != 0) { - uma_zfree(ipq_zone, fp); + uma_zfree(V_ipq_zone, fp); fp = NULL; goto dropfrag; } mac_ipq_create(m, fp); #endif TAILQ_INSERT_HEAD(head, fp, ipq_list); - nipq++; + V_nipq++; fp->ipq_nfrags = 1; fp->ipq_ttl = IPFRAGTTL; fp->ipq_p = ip->ip_p; @@ -944,7 +1009,7 @@ } nq = q->m_nextpkt; m->m_nextpkt = nq; - ipstat.ips_fragdropped++; + V_ipstat.ips_fragdropped++; fp->ipq_nfrags--; m_freem(q); } @@ -962,8 +1027,8 @@ next = 0; for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) { if (GETIP(q)->ip_off != next) { - if (fp->ipq_nfrags > maxfragsperpacket) { - ipstat.ips_fragdropped += fp->ipq_nfrags; + if (fp->ipq_nfrags > V_maxfragsperpacket) { + V_ipstat.ips_fragdropped += fp->ipq_nfrags; ip_freef(head, fp); } goto done; @@ -972,8 +1037,8 @@ } /* Make sure the last packet didn't have the IP_MF flag */ if (p->m_flags & M_FRAG) { - if (fp->ipq_nfrags > maxfragsperpacket) { - ipstat.ips_fragdropped += fp->ipq_nfrags; + if (fp->ipq_nfrags > V_maxfragsperpacket) { + V_ipstat.ips_fragdropped += fp->ipq_nfrags; ip_freef(head, fp); } goto done; @@ -985,8 +1050,8 @@ q = fp->ipq_frags; ip = GETIP(q); if (next + (ip->ip_hl << 2) > IP_MAXPACKET) { - ipstat.ips_toolong++; - ipstat.ips_fragdropped += fp->ipq_nfrags; + V_ipstat.ips_toolong++; + V_ipstat.ips_fragdropped += fp->ipq_nfrags; ip_freef(head, fp); goto done; } @@ -1028,19 +1093,19 @@ ip->ip_src = fp->ipq_src; ip->ip_dst = fp->ipq_dst; TAILQ_REMOVE(head, fp, ipq_list); - nipq--; - uma_zfree(ipq_zone, fp); + V_nipq--; + uma_zfree(V_ipq_zone, fp); m->m_len += (ip->ip_hl << 2); m->m_data -= (ip->ip_hl << 2); /* some debugging cruft by sklower, below, will go away soon */ if (m->m_flags & M_PKTHDR) /* XXX this should be done elsewhere */ m_fixhdr(m); - ipstat.ips_reassembled++; + V_ipstat.ips_reassembled++; IPQ_UNLOCK(); return (m); dropfrag: - 
ipstat.ips_fragdropped++; + V_ipstat.ips_fragdropped++; if (fp != NULL) fp->ipq_nfrags--; m_freem(m); @@ -1058,6 +1123,7 @@ static void ip_freef(struct ipqhead *fhp, struct ipq *fp) { + INIT_VNET_INET(curvnet); struct mbuf *q; IPQ_LOCK_ASSERT(); @@ -1068,8 +1134,8 @@ m_freem(q); } TAILQ_REMOVE(fhp, fp, ipq_list); - uma_zfree(ipq_zone, fp); - nipq--; + uma_zfree(V_ipq_zone, fp); + V_nipq--; } /* @@ -1084,15 +1150,17 @@ int i; IPQ_LOCK(); + VNET_ITERLOOP_BEGIN(); + INIT_VNET_INET(vnet_iter); for (i = 0; i < IPREASS_NHASH; i++) { - for(fp = TAILQ_FIRST(&ipq[i]); fp;) { + for(fp = TAILQ_FIRST(&V_ipq[i]); fp;) { struct ipq *fpp; fpp = fp; fp = TAILQ_NEXT(fp, ipq_list); if(--fpp->ipq_ttl == 0) { - ipstat.ips_fragtimeout += fpp->ipq_nfrags; - ip_freef(&ipq[i], fpp); + V_ipstat.ips_fragtimeout += fpp->ipq_nfrags; + ip_freef(&V_ipq[i], fpp); } } } @@ -1101,15 +1169,16 @@ * (due to the limit being lowered), drain off * enough to get down to the new limit. */ - if (maxnipq >= 0 && nipq > maxnipq) { + if (V_maxnipq >= 0 && V_nipq > V_maxnipq) { for (i = 0; i < IPREASS_NHASH; i++) { - while (nipq > maxnipq && !TAILQ_EMPTY(&ipq[i])) { - ipstat.ips_fragdropped += - TAILQ_FIRST(&ipq[i])->ipq_nfrags; - ip_freef(&ipq[i], TAILQ_FIRST(&ipq[i])); + while (V_nipq > V_maxnipq && !TAILQ_EMPTY(&V_ipq[i])) { + V_ipstat.ips_fragdropped += + TAILQ_FIRST(&V_ipq[i])->ipq_nfrags; + ip_freef(&V_ipq[i], TAILQ_FIRST(&V_ipq[i])); } } } + VNET_ITERLOOP_END(); IPQ_UNLOCK(); } @@ -1122,13 +1191,16 @@ int i; IPQ_LOCK(); + VNET_ITERLOOP_BEGIN(); + INIT_VNET_INET(vnet_iter); for (i = 0; i < IPREASS_NHASH; i++) { - while(!TAILQ_EMPTY(&ipq[i])) { - ipstat.ips_fragdropped += - TAILQ_FIRST(&ipq[i])->ipq_nfrags; - ip_freef(&ipq[i], TAILQ_FIRST(&ipq[i])); + while(!TAILQ_EMPTY(&V_ipq[i])) { + V_ipstat.ips_fragdropped += + TAILQ_FIRST(&V_ipq[i])->ipq_nfrags; + ip_freef(&V_ipq[i], TAILQ_FIRST(&V_ipq[i])); } } + VNET_ITERLOOP_END(); IPQ_UNLOCK(); in_rtqdrain(); } @@ -1245,6 +1317,7 @@ void ip_forward(struct mbuf 
*m, int srcrt) { + INIT_VNET_INET(curvnet); struct ip *ip = mtod(m, struct ip *); struct in_ifaddr *ia = NULL; struct mbuf *mcopy; @@ -1252,7 +1325,7 @@ int error, type = 0, code = 0, mtu = 0; if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) { - ipstat.ips_cantforward++; + V_ipstat.ips_cantforward++; m_freem(m); return; } @@ -1324,7 +1397,7 @@ * or a route modified by a redirect. */ dest.s_addr = 0; - if (!srcrt && ipsendredirects && ia->ia_ifp == m->m_pkthdr.rcvif) { + if (!srcrt && V_ipsendredirects && ia->ia_ifp == m->m_pkthdr.rcvif) { struct sockaddr_in *sin; struct route ro; struct rtentry *rt; @@ -1360,11 +1433,11 @@ error = ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL); if (error) - ipstat.ips_cantforward++; + V_ipstat.ips_cantforward++; else { - ipstat.ips_forward++; + V_ipstat.ips_forward++; if (type) - ipstat.ips_redirectsent++; + V_ipstat.ips_redirectsent++; else { if (mcopy) m_freem(mcopy); @@ -1407,7 +1480,7 @@ else mtu = ip_next_mtu(ip->ip_len, 0); } - ipstat.ips_cantfrag++; + V_ipstat.ips_cantfrag++; break; case ENOBUFS: @@ -1419,7 +1492,7 @@ * Those who need source quench packets may re-enable them * via the net.inet.ip.sendsourcequench sysctl. */ - if (ip_sendsourcequench == 0) { + if (V_ip_sendsourcequench == 0) { m_freem(mcopy); return; } else { @@ -1439,6 +1512,8 @@ ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip, struct mbuf *m) { + INIT_VNET_NET(inp->inp_vnet); + if (inp->inp_socket->so_options & (SO_BINTIME | SO_TIMESTAMP)) { struct bintime bt; @@ -1501,7 +1576,7 @@ struct sockaddr_dl *sdl2 = &sdlbuf.sdl; if (((ifp = m->m_pkthdr.rcvif)) - && ( ifp->if_index && (ifp->if_index <= if_index))) { + && ( ifp->if_index && (ifp->if_index <= V_if_index))) { sdp = (struct sockaddr_dl *)ifp->if_addr->ifa_addr; /* * Change our mind and don't try copy. @@ -1532,26 +1607,30 @@ * locking. This code remains in ip_input.c as ip_mroute.c is optionally * compiled. 
*/ +#ifndef VIMAGE static int ip_rsvp_on; struct socket *ip_rsvpd; +#endif int ip_rsvp_init(struct socket *so) { + INIT_VNET_INET(so->so_vnet); + if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP) return EOPNOTSUPP; - if (ip_rsvpd != NULL) + if (V_ip_rsvpd != NULL) return EADDRINUSE; - ip_rsvpd = so; + V_ip_rsvpd = so; /* * This may seem silly, but we need to be sure we don't over-increment * the RSVP counter, in case something slips up. */ - if (!ip_rsvp_on) { - ip_rsvp_on = 1; - rsvp_on++; + if (!V_ip_rsvp_on) { + V_ip_rsvp_on = 1; + V_rsvp_on++; } return 0; @@ -1560,14 +1639,16 @@ int ip_rsvp_done(void) { - ip_rsvpd = NULL; + INIT_VNET_INET(curvnet); + + V_ip_rsvpd = NULL; /* * This may seem silly, but we need to be sure we don't over-decrement * the RSVP counter, in case something slips up. */ - if (ip_rsvp_on) { - ip_rsvp_on = 0; - rsvp_on--; + if (V_ip_rsvp_on) { + V_ip_rsvp_on = 0; + V_rsvp_on--; } return 0; } @@ -1575,6 +1656,8 @@ void rsvp_input(struct mbuf *m, int off) /* XXX must fixup manually */ { + INIT_VNET_INET(curvnet); + if (rsvp_input_p) { /* call the real one if loaded */ rsvp_input_p(m, off); return; @@ -1585,12 +1668,12 @@ * case we want to throw the packet away. 
*/ - if (!rsvp_on) { + if (!V_rsvp_on) { m_freem(m); return; } - if (ip_rsvpd != NULL) { + if (V_ip_rsvpd != NULL) { rip_input(m, off); return; } --- /u/marko/p4/head/src/sys/netinet/ip_ipsec.c 2007-10-16 13:53:38.000000000 +0200 +++ src/sys/netinet/ip_ipsec.c 2007-10-22 18:06:42.000000000 +0200 @@ -31,6 +31,7 @@ __FBSDID("$FreeBSD: src/sys/netinet/ip_ipsec.c,v 1.8 2007/10/07 20:44:23 silby Exp $"); #include "opt_ipsec.h" +#include "opt_vimage.h" #include #include @@ -41,10 +42,12 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -60,6 +63,7 @@ #include #include #include +#include #endif /*IPSEC*/ extern struct protosw inetsw[]; @@ -92,6 +96,8 @@ ip_ipsec_fwd(struct mbuf *m) { #ifdef IPSEC + INIT_VNET_INET(curvnet); + INIT_VNET_IPSEC(curvnet); struct m_tag *mtag; struct tdb_ident *tdbi; struct secpolicy *sp; @@ -120,7 +126,7 @@ KEY_FREESP(&sp); splx(s); if (error) { - ipstat.ips_cantforward++; + V_ipstat.ips_cantforward++; return 1; } #endif /* IPSEC */ @@ -137,6 +143,7 @@ int ip_ipsec_input(struct mbuf *m) { + INIT_VNET_IPSEC(curvnet); struct ip *ip = mtod(m, struct ip *); #ifdef IPSEC struct m_tag *mtag; --- /u/marko/p4/head/src/sys/netinet/ip_mroute.c 2007-10-16 13:53:38.000000000 +0200 +++ src/sys/netinet/ip_mroute.c 2007-10-22 18:06:42.000000000 +0200 @@ -60,6 +60,7 @@ #include "opt_inet6.h" #include "opt_mac.h" #include "opt_mrouting.h" +#include "opt_vimage.h" #define _PIM_VT 1 @@ -80,9 +81,11 @@ #include #include #include +#include #include #include #include +#include #include #include #include @@ -421,6 +424,7 @@ static int X_ip_mrouter_set(struct socket *so, struct sockopt *sopt) { + INIT_VNET_INET(curvnet); int error, optval; vifi_t vifi; struct vifctl vifc; @@ -428,7 +432,7 @@ struct bw_upcall bw_upcall; uint32_t i; - if (so != ip_mrouter && sopt->sopt_name != MRT_INIT) + if (so != V_ip_mrouter && sopt->sopt_name != MRT_INIT) return EPERM; error = 0; @@ -645,6 +649,7 @@ static void if_detached_event(void 
*arg __unused, struct ifnet *ifp) { + INIT_VNET_INET(curvnet); vifi_t vifi; int i; struct mfc *mfc; @@ -654,7 +659,7 @@ struct rtdetq *npq; MROUTER_LOCK(); - if (ip_mrouter == NULL) { + if (V_ip_mrouter == NULL) { MROUTER_UNLOCK(); } @@ -708,6 +713,8 @@ static int ip_mrouter_init(struct socket *so, int version) { + INIT_VNET_INET(curvnet); + if (mrtdebug) log(LOG_DEBUG, "ip_mrouter_init: so_type = %d, pr_protocol = %d\n", so->so_type, so->so_proto->pr_protocol); @@ -720,7 +727,7 @@ MROUTER_LOCK(); - if (ip_mrouter != NULL) { + if (V_ip_mrouter != NULL) { MROUTER_UNLOCK(); return EADDRINUSE; } @@ -738,7 +745,7 @@ expire_bw_upcalls_send, NULL); callout_reset(&bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process, NULL); - ip_mrouter = so; + V_ip_mrouter = so; MROUTER_UNLOCK(); @@ -754,6 +761,7 @@ static int X_ip_mrouter_done(void) { + INIT_VNET_INET(curvnet); vifi_t vifi; int i; struct ifnet *ifp; @@ -763,7 +771,7 @@ MROUTER_LOCK(); - if (ip_mrouter == NULL) { + if (V_ip_mrouter == NULL) { MROUTER_UNLOCK(); return EINVAL; } @@ -771,7 +779,7 @@ /* * Detach/disable hooks to the reset of the system. 
*/ - ip_mrouter = NULL; + V_ip_mrouter = NULL; mrt_api_config = 0; VIF_LOCK(); @@ -1285,6 +1293,7 @@ X_ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m, struct ip_moptions *imo) { + INIT_VNET_INET(curvnet); struct mfc *rt; int error; vifi_t vifi; @@ -1449,7 +1458,7 @@ mrtstat.mrts_upcalls++; k_igmpsrc.sin_addr = ip->ip_src; - if (socket_send(ip_mrouter, mm, &k_igmpsrc) < 0) { + if (socket_send(V_ip_mrouter, mm, &k_igmpsrc) < 0) { log(LOG_WARNING, "ip_mforward: ip_mrouter socket queue full\n"); ++mrtstat.mrts_upq_sockfull; fail1: @@ -1589,6 +1598,7 @@ static int ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt, vifi_t xmt_vif) { + INIT_VNET_INET(curvnet); struct ip *ip = mtod(m, struct ip *); vifi_t vifi; int plen = ip->ip_len; @@ -1668,7 +1678,7 @@ mrtstat.mrts_upcalls++; k_igmpsrc.sin_addr = im->im_src; - if (socket_send(ip_mrouter, mm, &k_igmpsrc) < 0) { + if (socket_send(V_ip_mrouter, mm, &k_igmpsrc) < 0) { log(LOG_WARNING, "ip_mforward: ip_mrouter socket queue full\n"); ++mrtstat.mrts_upq_sockfull; @@ -1800,6 +1810,7 @@ static int X_ip_rsvp_vif(struct socket *so, struct sockopt *sopt) { + INIT_VNET_INET(curvnet); int error, vifi; if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP) @@ -1829,7 +1840,7 @@ */ if (!viftable[vifi].v_rsvp_on) { viftable[vifi].v_rsvp_on = 1; - rsvp_on++; + V_rsvp_on++; } } else { /* must be VIF_OFF */ /* @@ -1844,7 +1855,7 @@ */ if (viftable[vifi].v_rsvp_on) { viftable[vifi].v_rsvp_on = 0; - rsvp_on--; + V_rsvp_on--; } } VIF_UNLOCK(); @@ -1854,6 +1865,7 @@ static void X_ip_rsvp_force_done(struct socket *so) { + INIT_VNET_INET(curvnet); int vifi; /* Don't bother if it is not the right type of socket. 
*/ @@ -1873,7 +1885,7 @@ */ if (viftable[vifi].v_rsvp_on) { viftable[vifi].v_rsvp_on = 0; - rsvp_on--; + V_rsvp_on--; } } } @@ -1884,19 +1896,20 @@ static void X_rsvp_input(struct mbuf *m, int off) { + INIT_VNET_INET(curvnet); int vifi; struct ip *ip = mtod(m, struct ip *); struct sockaddr_in rsvp_src = { sizeof rsvp_src, AF_INET }; struct ifnet *ifp; if (rsvpdebug) - printf("rsvp_input: rsvp_on %d\n",rsvp_on); + printf("rsvp_input: rsvp_on %d\n", V_rsvp_on); /* Can still get packets with rsvp_on = 0 if there is a local member * of the group to which the RSVP packet is addressed. But in this * case we want to throw the packet away. */ - if (!rsvp_on) { + if (!V_rsvp_on) { m_freem(m); return; } @@ -1928,7 +1941,7 @@ * then use it. Otherwise, drop packet since there * is no specific socket for this vif. */ - if (ip_rsvpd != NULL) { + if (V_ip_rsvpd != NULL) { if (rsvpdebug) printf("rsvp_input: Sending packet up old-style socket\n"); rip_input(m, off); /* xxx */ @@ -2285,6 +2298,7 @@ static void bw_upcalls_send(void) { + INIT_VNET_INET(curvnet); struct mbuf *m; int len = bw_upcalls_n * sizeof(bw_upcalls[0]); struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; @@ -2323,7 +2337,7 @@ * XXX do we need to set the address in k_igmpsrc ? 
*/ mrtstat.mrts_upcalls++; - if (socket_send(ip_mrouter, m, &k_igmpsrc) < 0) { + if (socket_send(V_ip_mrouter, m, &k_igmpsrc) < 0) { log(LOG_WARNING, "bw_upcalls_send: ip_mrouter socket queue full\n"); ++mrtstat.mrts_upq_sockfull; } @@ -2645,6 +2659,7 @@ pim_register_send_upcall(struct ip *ip, struct vif *vifp, struct mbuf *mb_copy, struct mfc *rt) { + INIT_VNET_INET(curvnet); struct mbuf *mb_first; int len = ntohs(ip->ip_len); struct igmpmsg *im; @@ -2677,7 +2692,7 @@ mrtstat.mrts_upcalls++; - if (socket_send(ip_mrouter, mb_first, &k_igmpsrc) < 0) { + if (socket_send(V_ip_mrouter, mb_first, &k_igmpsrc) < 0) { if (mrtdebug & DEBUG_PIM) log(LOG_WARNING, "mcast: pim_register_send_upcall: ip_mrouter socket queue full"); @@ -2699,6 +2714,7 @@ pim_register_send_rp(struct ip *ip, struct vif *vifp, struct mbuf *mb_copy, struct mfc *rt) { + INIT_VNET_INET(curvnet); struct mbuf *mb_first; struct ip *ip_outer; struct pim_encap_pimhdr *pimhdr; @@ -3028,6 +3044,7 @@ static int ip_mroute_modevent(module_t mod, int type, void *unused) { + INIT_VNET_INET(curvnet); switch (type) { case MOD_LOAD: MROUTER_LOCK_INIT(); @@ -3094,7 +3111,7 @@ * just loaded and then unloaded w/o starting up a user * process we still need to cleanup. 
*/ - if (ip_mrouter + if (V_ip_mrouter #ifdef INET6 || ip6_mrouter #endif --- /u/marko/p4/head/src/sys/netinet/ip_options.c 2008-01-28 23:53:52.000000000 +0100 +++ src/sys/netinet/ip_options.c 2008-02-27 11:49:12.000000000 +0100 @@ -34,6 +34,7 @@ #include "opt_ipstealth.h" #include "opt_mac.h" +#include "opt_vimage.h" #include #include @@ -45,7 +46,9 @@ #include #include #include +#include +#include #include #include #include @@ -53,6 +56,7 @@ #include #include +#include #include #include #include @@ -97,6 +101,7 @@ int ip_dooptions(struct mbuf *m, int pass) { + INIT_VNET_INET(curvnet); struct ip *ip = mtod(m, struct ip *); u_char *cp; struct in_ifaddr *ia; @@ -193,7 +198,7 @@ goto dropit; #endif if (!ip_dosourceroute) { - if (ipforwarding) { + if (V_ipforwarding) { char buf[16]; /* aaa.bbb.ccc.ddd\0 */ /* * Acting as a router, so generate @@ -215,7 +220,7 @@ #ifdef IPSTEALTH dropit: #endif - ipstat.ips_cantforward++; + V_ipstat.ips_cantforward++; m_freem(m); return (1); } @@ -355,14 +360,14 @@ cp[IPOPT_OFFSET] += sizeof(n_time); } } - if (forward && ipforwarding) { + if (forward && V_ipforwarding) { ip_forward(m, 1); return (1); } return (0); bad: icmp_error(m, type, code, 0, 0); - ipstat.ips_badoptions++; + V_ipstat.ips_badoptions++; return (1); } --- /u/marko/p4/head/src/sys/netinet/ip_output.c 2008-02-03 08:16:01.000000000 +0100 +++ src/sys/netinet/ip_output.c 2008-02-27 18:00:07.000000000 +0100 @@ -36,6 +36,7 @@ #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_mbuf_stress_test.h" +#include "opt_vimage.h" #include #include @@ -49,12 +50,15 @@ #include #include #include +#include +#include #include #include #include #include +#include #include #include #include @@ -78,7 +82,9 @@ (ntohl(a.s_addr)>>8)&0xFF,\ (ntohl(a.s_addr))&0xFF, y); +#ifndef VIMAGE u_short ip_id; +#endif #ifdef MBUF_STRESS_TEST int mbuf_frag_size = 0; @@ -104,6 +110,8 @@ ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, struct ip_moptions *imo, struct inpcb *inp) 
{ + INIT_VNET_NET(curvnet); + INIT_VNET_INET(curvnet); struct ip *ip; struct ifnet *ifp = NULL; /* keep compiler happy */ struct mbuf *m0; @@ -151,7 +159,7 @@ ip->ip_v = IPVERSION; ip->ip_hl = hlen >> 2; ip->ip_id = ip_newid(); - ipstat.ips_localout++; + V_ipstat.ips_localout++; } else { hlen = ip->ip_hl << 2; } @@ -190,7 +198,7 @@ if (flags & IP_SENDONES) { if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst)))) == NULL && (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL) { - ipstat.ips_noroute++; + V_ipstat.ips_noroute++; error = ENETUNREACH; goto bad; } @@ -202,7 +210,7 @@ } else if (flags & IP_ROUTETOIF) { if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL && (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == NULL) { - ipstat.ips_noroute++; + V_ipstat.ips_noroute++; error = ENETUNREACH; goto bad; } @@ -227,7 +235,7 @@ if (ro->ro_rt == NULL) rtalloc_ign(ro, 0); if (ro->ro_rt == NULL) { - ipstat.ips_noroute++; + V_ipstat.ips_noroute++; error = EHOSTUNREACH; goto bad; } @@ -286,7 +294,7 @@ */ if ((imo == NULL) || (imo->imo_multicast_vif == -1)) { if ((ifp->if_flags & IFF_MULTICAST) == 0) { - ipstat.ips_noroute++; + V_ipstat.ips_noroute++; error = ENETUNREACH; goto bad; } @@ -327,14 +335,14 @@ * above, will be forwarded by the ip_input() routine, * if necessary. */ - if (ip_mrouter && (flags & IP_FORWARDING) == 0) { + if (V_ip_mrouter && (flags & IP_FORWARDING) == 0) { /* * If rsvp daemon is not running, do not * set ip_moptions. This ensures that the packet * is multicast and not just sent down one link * as prescribed by rsvpd. 
*/ - if (!rsvp_on) + if (!V_rsvp_on) imo = NULL; if (ip_mforward && ip_mforward(ip, ifp, m, imo) != 0) { @@ -386,7 +394,7 @@ #endif /* ALTQ */ { error = ENOBUFS; - ipstat.ips_odropped++; + V_ipstat.ips_odropped++; ifp->if_snd.ifq_drops += (ip->ip_len / ifp->if_mtu + 1); goto bad; } @@ -450,7 +458,7 @@ if (in_localip(ip->ip_dst)) { m->m_flags |= M_FASTFWD_OURS; if (m->m_pkthdr.rcvif == NULL) - m->m_pkthdr.rcvif = loif; + m->m_pkthdr.rcvif = V_loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; @@ -469,7 +477,7 @@ /* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */ if (m->m_flags & M_FASTFWD_OURS) { if (m->m_pkthdr.rcvif == NULL) - m->m_pkthdr.rcvif = loif; + m->m_pkthdr.rcvif = V_loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; @@ -497,7 +505,7 @@ if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { if ((ifp->if_flags & IFF_LOOPBACK) == 0) { - ipstat.ips_badaddr++; + V_ipstat.ips_badaddr++; error = EADDRNOTAVAIL; goto bad; } @@ -556,7 +564,7 @@ /* Balk when DF bit is set or the interface didn't support TSO. 
*/ if ((ip->ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) { error = EMSGSIZE; - ipstat.ips_cantfrag++; + V_ipstat.ips_cantfrag++; goto bad; } @@ -589,7 +597,7 @@ } if (error == 0) - ipstat.ips_fragmented++; + V_ipstat.ips_fragmented++; done: if (ro == &iproute && ro->ro_rt) { @@ -614,6 +622,7 @@ ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu, u_long if_hwassist_flags, int sw_csum) { + INIT_VNET_INET(curvnet); int error = 0; int hlen = ip->ip_hl << 2; int len = (mtu - hlen) & ~7; /* size of payload in each fragment */ @@ -624,7 +633,7 @@ int nfrags; if (ip->ip_off & IP_DF) { /* Fragmentation not allowed */ - ipstat.ips_cantfrag++; + V_ipstat.ips_cantfrag++; return EMSGSIZE; } @@ -699,7 +708,7 @@ MGETHDR(m, M_DONTWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; - ipstat.ips_odropped++; + V_ipstat.ips_odropped++; goto done; } m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG; @@ -729,7 +738,7 @@ if (m->m_next == NULL) { /* copy failed */ m_free(m); error = ENOBUFS; /* ??? 
*/ - ipstat.ips_odropped++; + V_ipstat.ips_odropped++; goto done; } m->m_pkthdr.len = mhlen + len; @@ -745,7 +754,7 @@ *mnext = m; mnext = &m->m_nextpkt; } - ipstat.ips_ofragments += nfrags; + V_ipstat.ips_ofragments += nfrags; /* set first marker for fragment chain */ m0->m_flags |= M_FIRSTFRAG | M_FRAG; --- /u/marko/p4/head/src/sys/netinet/ip_var.h 2007-08-31 03:48:02.000000000 +0200 +++ src/sys/netinet/ip_var.h 2007-10-19 01:49:35.000000000 +0200 @@ -172,19 +172,22 @@ struct route; struct sockopt; +#ifndef VIMAGE extern struct ipstat ipstat; extern u_short ip_id; /* ip packet ctr, for ids */ extern int ip_defttl; /* default IP ttl */ extern int ipforwarding; /* ip forwarding */ +extern int ip_do_randomid; #ifdef IPSTEALTH extern int ipstealth; /* stealth forwarding */ #endif -extern u_char ip_protox[]; +extern int rsvp_on; extern struct socket *ip_rsvpd; /* reservation protocol daemon */ extern struct socket *ip_mrouter; /* multicast routing daemon */ +#endif +extern u_char ip_protox[]; extern int (*legal_vif_num)(int); extern u_long (*ip_mcast_src)(int); -extern int rsvp_on; extern struct pr_usrreqs rip_usrreqs; void inp_freemoptions(struct ip_moptions *); @@ -217,6 +220,9 @@ int rip_ctloutput(struct socket *, struct sockopt *); void rip_ctlinput(int, struct sockaddr *, void *); void rip_init(void); +#ifdef VIMAGE +void rip_destroy(void); +#endif void rip_input(struct mbuf *, int); int rip_output(struct mbuf *, struct socket *, u_long); void ipip_input(struct mbuf *, int); @@ -231,9 +237,7 @@ void in_delayed_cksum(struct mbuf *m); -static __inline uint16_t ip_newid(void); -extern int ip_do_randomid; - +#if 0 static __inline uint16_t ip_newid(void) { @@ -242,6 +246,9 @@ return htons(ip_id++); } +#else +#define ip_newid() (V_ip_do_randomid ? 
ip_randomid() : htons(V_ip_id++)) /* sequential ids stay in network byte order, matching the disabled inline version */ +#endif #endif /* _KERNEL */ --- /u/marko/p4/head/src/sys/netinet/ipprotosw.h 2007-08-31 03:48:02.000000000 +0200 +++ src/sys/netinet/ipprotosw.h 2007-10-05 12:27:05.000000000 +0200 @@ -87,6 +87,7 @@ void *pr_ousrreq; /* utility hooks */ pr_init_t *pr_init; + pr_destroy_t *pr_destroy; pr_fasttimo_t *pr_fasttimo; /* fast timeout (200ms) */ pr_slowtimo_t *pr_slowtimo; /* slow timeout (500ms) */ pr_drain_t *pr_drain; /* flush any excess space possible */ --- /u/marko/p4/head/src/sys/netinet/raw_ip.c 2007-10-29 17:17:43.000000000 +0100 +++ src/sys/netinet/raw_ip.c 2007-12-10 11:26:11.000000000 +0100 @@ -35,6 +35,7 @@ #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" +#include "opt_vimage.h" #include #include @@ -51,12 +52,15 @@ #include #include #include +#include #include +#include #include #include +#include #include #include #include @@ -74,8 +78,11 @@ #include +#ifndef VIMAGE struct inpcbhead ripcb; struct inpcbinfo ripcbinfo; +#endif +static struct uma_zone *ripcb_zone; /* control hooks for ipfw and dummynet */ ip_fw_ctl_t *ip_fw_ctl_ptr = NULL; @@ -87,7 +94,9 @@ */ /* The socket used to communicate with the multicast routing daemon.
*/ +#ifndef VIMAGE struct socket *ip_mrouter; +#endif /* The various mrouter and rsvp functions */ int (*ip_mrouter_set)(struct socket *, struct sockopt *); @@ -113,8 +122,9 @@ static void rip_zone_change(void *tag) { + INIT_VNET_INET(curvnet); - uma_zone_set_max(ripcbinfo.ipi_zone, maxsockets); + uma_zone_set_max(V_ripcbinfo.ipi_zone, maxsockets); } static int @@ -129,25 +139,49 @@ void rip_init(void) { + INIT_VNET_INET(curvnet); - INP_INFO_LOCK_INIT(&ripcbinfo, "rip"); - LIST_INIT(&ripcb); - ripcbinfo.ipi_listhead = &ripcb; +#ifdef VIMAGE + if (IS_DEFAULT_VNET(curvnet)) { +#endif + ripcb_zone = uma_zcreate("ripcb", sizeof(struct inpcb), + NULL, NULL, rip_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); +#ifdef VIMAGE + } + V_ripcbinfo.ipi_vnet = curvnet; +#endif + + INP_INFO_LOCK_INIT(&V_ripcbinfo, "rip"); + LIST_INIT(&V_ripcb); + V_ripcbinfo.ipi_listhead = &V_ripcb; /* * XXX We don't use the hash list for raw IP, but it's easier * to allocate a one entry hash list than it is to check all * over the place for hashbase == NULL. 
*/ - ripcbinfo.ipi_hashbase = hashinit(1, M_PCB, &ripcbinfo.ipi_hashmask); - ripcbinfo.ipi_porthashbase = hashinit(1, M_PCB, - &ripcbinfo.ipi_porthashmask); - ripcbinfo.ipi_zone = uma_zcreate("ripcb", sizeof(struct inpcb), - NULL, NULL, rip_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); - uma_zone_set_max(ripcbinfo.ipi_zone, maxsockets); + V_ripcbinfo.ipi_hashbase = + hashinit(1, M_PCB, &V_ripcbinfo.ipi_hashmask); + V_ripcbinfo.ipi_porthashbase = + hashinit(1, M_PCB, &V_ripcbinfo.ipi_porthashmask); + V_ripcbinfo.ipi_zone = ripcb_zone; + uma_zone_set_max(V_ripcbinfo.ipi_zone, maxsockets); EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL, EVENTHANDLER_PRI_ANY); } +#ifdef VIMAGE +void +rip_destroy(void) +{ + INIT_VNET_INET(curvnet); + + hashdestroy(V_ripcbinfo.ipi_hashbase, M_PCB, + V_ripcbinfo.ipi_hashmask); + hashdestroy(V_ripcbinfo.ipi_porthashbase, M_PCB, + V_ripcbinfo.ipi_porthashmask); +} +#endif + static struct sockaddr_in ripsrc = { sizeof(ripsrc), AF_INET }; static int @@ -201,14 +235,15 @@ void rip_input(struct mbuf *m, int off) { + INIT_VNET_INET(curvnet); struct ip *ip = mtod(m, struct ip *); int proto = ip->ip_p; struct inpcb *inp, *last; - INP_INFO_RLOCK(&ripcbinfo); + INP_INFO_RLOCK(&V_ripcbinfo); ripsrc.sin_addr = ip->ip_src; last = NULL; - LIST_FOREACH(inp, &ripcb, inp_list) { + LIST_FOREACH(inp, &V_ripcb, inp_list) { INP_LOCK(inp); if (inp->inp_ip_p && inp->inp_ip_p != proto) { docontinue: @@ -242,14 +277,14 @@ } if (last != NULL) { if (raw_append(last, ip, m) != 0) - ipstat.ips_delivered--; + V_ipstat.ips_delivered--; INP_UNLOCK(last); } else { m_freem(m); - ipstat.ips_noproto++; - ipstat.ips_delivered--; + V_ipstat.ips_noproto++; + V_ipstat.ips_delivered--; } - INP_INFO_RUNLOCK(&ripcbinfo); + INP_INFO_RUNLOCK(&V_ripcbinfo); } /* @@ -259,6 +294,7 @@ int rip_output(struct mbuf *m, struct socket *so, u_long dst) { + INIT_VNET_INET(so->so_vnet); struct ip *ip; int error; struct inpcb *inp = sotoinpcb(so); @@ -323,7 +359,7 @@ ip->ip_id 
= ip_newid(); /* XXX prevent ip_output from overwriting header fields */ flags |= IP_RAWOUTPUT; - ipstat.ips_rawout++; + V_ipstat.ips_rawout++; } if (inp->inp_flags & INP_ONESBCAST) @@ -538,6 +574,7 @@ void rip_ctlinput(int cmd, struct sockaddr *sa, void *vip) { + INIT_VNET_INET(curvnet); struct in_ifaddr *ia; struct ifnet *ifp; int err; @@ -545,7 +582,7 @@ switch (cmd) { case PRC_IFDOWN: - TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) { + TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (ia->ia_ifa.ifa_addr == sa && (ia->ia_flags & IFA_ROUTE)) { /* @@ -565,7 +602,7 @@ break; case PRC_IFUP: - TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) { + TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (ia->ia_ifa.ifa_addr == sa) break; } @@ -596,6 +633,7 @@ static int rip_attach(struct socket *so, int proto, struct thread *td) { + INIT_VNET_INET(so->so_vnet); struct inpcb *inp; int error; @@ -610,17 +648,17 @@ error = soreserve(so, rip_sendspace, rip_recvspace); if (error) return error; - INP_INFO_WLOCK(&ripcbinfo); - error = in_pcballoc(so, &ripcbinfo); + INP_INFO_WLOCK(&V_ripcbinfo); + error = in_pcballoc(so, &V_ripcbinfo); if (error) { - INP_INFO_WUNLOCK(&ripcbinfo); + INP_INFO_WUNLOCK(&V_ripcbinfo); return error; } inp = (struct inpcb *)so->so_pcb; - INP_INFO_WUNLOCK(&ripcbinfo); + INP_INFO_WUNLOCK(&V_ripcbinfo); inp->inp_vflag |= INP_IPV4; inp->inp_ip_p = proto; - inp->inp_ip_ttl = ip_defttl; + inp->inp_ip_ttl = V_ip_defttl; INP_UNLOCK(inp); return 0; } @@ -628,6 +666,7 @@ static void rip_detach(struct socket *so) { + INIT_VNET_INET(so->so_vnet); struct inpcb *inp; inp = sotoinpcb(so); @@ -635,17 +674,17 @@ KASSERT(inp->inp_faddr.s_addr == INADDR_ANY, ("rip_detach: not closed")); - INP_INFO_WLOCK(&ripcbinfo); + INP_INFO_WLOCK(&V_ripcbinfo); INP_LOCK(inp); - if (so == ip_mrouter && ip_mrouter_done) + if (so == V_ip_mrouter && ip_mrouter_done) ip_mrouter_done(); if (ip_rsvp_force_done) ip_rsvp_force_done(so); - if (so == ip_rsvpd) + if (so == V_ip_rsvpd) ip_rsvp_done(); 
in_pcbdetach(inp); in_pcbfree(inp); - INP_INFO_WUNLOCK(&ripcbinfo); + INP_INFO_WUNLOCK(&V_ripcbinfo); } static void @@ -663,36 +702,39 @@ static void rip_abort(struct socket *so) { + INIT_VNET_INET(so->so_vnet); struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_abort: inp == NULL")); - INP_INFO_WLOCK(&ripcbinfo); + INP_INFO_WLOCK(&V_ripcbinfo); INP_LOCK(inp); rip_dodisconnect(so, inp); INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&ripcbinfo); + INP_INFO_WUNLOCK(&V_ripcbinfo); } static void rip_close(struct socket *so) { + INIT_VNET_INET(so->so_vnet); struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_close: inp == NULL")); - INP_INFO_WLOCK(&ripcbinfo); + INP_INFO_WLOCK(&V_ripcbinfo); INP_LOCK(inp); rip_dodisconnect(so, inp); INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&ripcbinfo); + INP_INFO_WUNLOCK(&V_ripcbinfo); } static int rip_disconnect(struct socket *so) { + INIT_VNET_INET(so->so_vnet); struct inpcb *inp; if ((so->so_state & SS_ISCONNECTED) == 0) @@ -700,17 +742,19 @@ inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_disconnect: inp == NULL")); - INP_INFO_WLOCK(&ripcbinfo); + INP_INFO_WLOCK(&V_ripcbinfo); INP_LOCK(inp); rip_dodisconnect(so, inp); INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&ripcbinfo); + INP_INFO_WUNLOCK(&V_ripcbinfo); return (0); } static int rip_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { + INIT_VNET_NET(so->so_vnet); + INIT_VNET_INET(so->so_vnet); struct sockaddr_in *addr = (struct sockaddr_in *)nam; struct inpcb *inp; @@ -725,7 +769,7 @@ return (EADDRNOTAVAIL); } - if (TAILQ_EMPTY(&ifnet) || + if (TAILQ_EMPTY(&V_ifnet) || (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) || (addr->sin_addr.s_addr && ifa_ifwithaddr((struct sockaddr *)addr) == 0)) @@ -733,35 +777,37 @@ inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_bind: inp == NULL")); - INP_INFO_WLOCK(&ripcbinfo); + INP_INFO_WLOCK(&V_ripcbinfo); INP_LOCK(inp); inp->inp_laddr = addr->sin_addr; INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&ripcbinfo); + 
INP_INFO_WUNLOCK(&V_ripcbinfo); return 0; } static int rip_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { + INIT_VNET_NET(so->so_vnet); + INIT_VNET_INET(so->so_vnet); struct sockaddr_in *addr = (struct sockaddr_in *)nam; struct inpcb *inp; if (nam->sa_len != sizeof(*addr)) return EINVAL; - if (TAILQ_EMPTY(&ifnet)) + if (TAILQ_EMPTY(&V_ifnet)) return EADDRNOTAVAIL; if (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) return EAFNOSUPPORT; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_connect: inp == NULL")); - INP_INFO_WLOCK(&ripcbinfo); + INP_INFO_WLOCK(&V_ripcbinfo); INP_LOCK(inp); inp->inp_faddr = addr->sin_addr; soisconnected(so); INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&ripcbinfo); + INP_INFO_WUNLOCK(&V_ripcbinfo); return 0; } @@ -809,6 +855,7 @@ static int rip_pcblist(SYSCTL_HANDLER_ARGS) { + INIT_VNET_INET(curvnet); int error, i, n; struct inpcb *inp, **inp_list; inp_gen_t gencnt; @@ -819,7 +866,7 @@ * resource-intensive to repeat twice on every request. */ if (req->oldptr == 0) { - n = ripcbinfo.ipi_count; + n = V_ripcbinfo.ipi_count; req->oldidx = 2 * (sizeof xig) + (n + n/8) * sizeof(struct xinpcb); return 0; @@ -831,10 +878,10 @@ /* * OK, now we're committed to doing something. 
*/ - INP_INFO_RLOCK(&ripcbinfo); - gencnt = ripcbinfo.ipi_gencnt; - n = ripcbinfo.ipi_count; - INP_INFO_RUNLOCK(&ripcbinfo); + INP_INFO_RLOCK(&V_ripcbinfo); + gencnt = V_ripcbinfo.ipi_gencnt; + n = V_ripcbinfo.ipi_count; + INP_INFO_RUNLOCK(&V_ripcbinfo); xig.xig_len = sizeof xig; xig.xig_count = n; @@ -848,8 +895,8 @@ if (inp_list == 0) return ENOMEM; - INP_INFO_RLOCK(&ripcbinfo); - for (inp = LIST_FIRST(ripcbinfo.ipi_listhead), i = 0; inp && i < n; + INP_INFO_RLOCK(&V_ripcbinfo); + for (inp = LIST_FIRST(V_ripcbinfo.ipi_listhead), i = 0; inp && i < n; inp = LIST_NEXT(inp, inp_list)) { INP_LOCK(inp); if (inp->inp_gencnt <= gencnt && @@ -859,7 +906,7 @@ } INP_UNLOCK(inp); } - INP_INFO_RUNLOCK(&ripcbinfo); + INP_INFO_RUNLOCK(&V_ripcbinfo); n = i; error = 0; @@ -887,11 +934,11 @@ * while we were processing this request, and it * might be necessary to retry. */ - INP_INFO_RLOCK(&ripcbinfo); - xig.xig_gen = ripcbinfo.ipi_gencnt; + INP_INFO_RLOCK(&V_ripcbinfo); + xig.xig_gen = V_ripcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; - xig.xig_count = ripcbinfo.ipi_count; - INP_INFO_RUNLOCK(&ripcbinfo); + xig.xig_count = V_ripcbinfo.ipi_count; + INP_INFO_RUNLOCK(&V_ripcbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); --- /u/marko/p4/head/src/sys/netinet/sctp_output.c 2008-02-27 18:29:12.000000000 +0100 +++ src/sys/netinet/sctp_output.c 2008-02-27 11:49:22.000000000 +0100 @@ -33,8 +33,12 @@ #include __FBSDID("$FreeBSD: src/sys/netinet/sctp_output.c,v 1.67 2008/02/22 15:06:25 rrs Exp $"); +#include "opt_vimage.h" + #include #include +#include +#include #include #include #include --- /u/marko/p4/head/src/sys/netinet/tcp_hostcache.c 2007-10-16 13:53:39.000000000 +0200 +++ src/sys/netinet/tcp_hostcache.c 2007-10-22 18:06:43.000000000 +0200 @@ -57,15 +57,11 @@ * of bucket limit memory constrains. */ -/* - * Many thanks to jlemon for basic structure of tcp_syncache which is being - * followed here. 
- */ - #include __FBSDID("$FreeBSD: src/sys/netinet/tcp_hostcache.c,v 1.17 2007/10/07 20:44:23 silby Exp $"); #include "opt_inet6.h" +#include "opt_vimage.h" #include #include @@ -76,9 +72,12 @@ #include #include #include +#include +#include #include +#include #include #include #include @@ -94,88 +93,56 @@ #ifdef INET6 #include #endif +#include #include -TAILQ_HEAD(hc_qhead, hc_metrics); - -struct hc_head { - struct hc_qhead hch_bucket; - u_int hch_length; - struct mtx hch_mtx; -}; - -struct hc_metrics { - /* housekeeping */ - TAILQ_ENTRY(hc_metrics) rmx_q; - struct hc_head *rmx_head; /* head of bucket tail queue */ - struct in_addr ip4; /* IP address */ - struct in6_addr ip6; /* IP6 address */ - /* endpoint specific values for TCP */ - u_long rmx_mtu; /* MTU for this path */ - u_long rmx_ssthresh; /* outbound gateway buffer limit */ - u_long rmx_rtt; /* estimated round trip time */ - u_long rmx_rttvar; /* estimated rtt variance */ - u_long rmx_bandwidth; /* estimated bandwidth */ - u_long rmx_cwnd; /* congestion window */ - u_long rmx_sendpipe; /* outbound delay-bandwidth product */ - u_long rmx_recvpipe; /* inbound delay-bandwidth product */ - /* TCP hostcache internal data */ - int rmx_expire; /* lifetime for object */ - u_long rmx_hits; /* number of hits */ - u_long rmx_updates; /* number of updates */ -}; - /* Arbitrary values */ #define TCP_HOSTCACHE_HASHSIZE 512 #define TCP_HOSTCACHE_BUCKETLIMIT 30 #define TCP_HOSTCACHE_EXPIRE 60*60 /* one hour */ #define TCP_HOSTCACHE_PRUNE 5*60 /* every 5 minutes */ -struct tcp_hostcache { - struct hc_head *hashbase; - uma_zone_t zone; - u_int hashsize; - u_int hashmask; - u_int bucket_limit; - u_int cache_count; - u_int cache_limit; - int expire; - int prune; - int purgeall; -}; +#ifndef VIMAGE static struct tcp_hostcache tcp_hostcache; - static struct callout tcp_hc_callout; +#endif static struct hc_metrics *tcp_hc_lookup(struct in_conninfo *); static struct hc_metrics *tcp_hc_insert(struct in_conninfo *); static int 
sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS); static void tcp_hc_purge(void *); -SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hostcache, CTLFLAG_RW, 0, "TCP Host cache"); +SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hostcache, CTLFLAG_RW, 0, + "TCP Host cache"); -SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, cachelimit, CTLFLAG_RDTUN, - &tcp_hostcache.cache_limit, 0, "Overall entry limit for hostcache"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_hostcache, OID_AUTO, cachelimit, + CTLFLAG_RDTUN, tcp_hostcache.cache_limit, 0, + "Overall entry limit for hostcache"); -SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, hashsize, CTLFLAG_RDTUN, - &tcp_hostcache.hashsize, 0, "Size of TCP hostcache hashtable"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_hostcache, OID_AUTO, hashsize, + CTLFLAG_RDTUN, tcp_hostcache.hashsize, 0, + "Size of TCP hostcache hashtable"); -SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, bucketlimit, CTLFLAG_RDTUN, - &tcp_hostcache.bucket_limit, 0, "Per-bucket hash limit for hostcache"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_hostcache, OID_AUTO, bucketlimit, + CTLFLAG_RDTUN, tcp_hostcache.bucket_limit, 0, + "Per-bucket hash limit for hostcache"); -SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, count, CTLFLAG_RD, - &tcp_hostcache.cache_count, 0, "Current number of entries in hostcache"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_hostcache, OID_AUTO, count, + CTLFLAG_RD, tcp_hostcache.cache_count, 0, + "Current number of entries in hostcache"); -SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, expire, CTLFLAG_RW, - &tcp_hostcache.expire, 0, "Expire time of TCP hostcache entries"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_hostcache, OID_AUTO, expire, + CTLFLAG_RW, tcp_hostcache.expire, 0, + "Expire time of TCP hostcache entries"); -SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, prune, CTLFLAG_RW, - &tcp_hostcache.prune, 0, "Time between purge runs"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_hostcache, OID_AUTO, prune, + CTLFLAG_RW, tcp_hostcache.prune, 
0, "Time between purge runs"); -SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, purge, CTLFLAG_RW, - &tcp_hostcache.purgeall, 0, "Expire all entires on next purge run"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_hostcache, OID_AUTO, purge, + CTLFLAG_RW, tcp_hostcache.purgeall, 0, + "Expire all entires on next purge run"); SYSCTL_PROC(_net_inet_tcp_hostcache, OID_AUTO, list, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_SKIP, 0, 0, @@ -186,7 +153,7 @@ #define HOSTCACHE_HASH(ip) \ (((ip)->s_addr ^ ((ip)->s_addr >> 7) ^ ((ip)->s_addr >> 17)) & \ - tcp_hostcache.hashmask) + V_tcp_hostcache.hashmask) /* XXX: What is the recommended hash to get good entropy for IPv6 addresses? */ #define HOSTCACHE_HASH6(ip6) \ @@ -194,7 +161,7 @@ (ip6)->s6_addr32[1] ^ \ (ip6)->s6_addr32[2] ^ \ (ip6)->s6_addr32[3]) & \ - tcp_hostcache.hashmask) + V_tcp_hostcache.hashmask) #define THC_LOCK(lp) mtx_lock(lp) #define THC_UNLOCK(lp) mtx_unlock(lp) @@ -202,60 +169,75 @@ void tcp_hc_init(void) { + INIT_VNET_INET(curvnet); int i; /* * Initialize hostcache structures. 
*/ - tcp_hostcache.cache_count = 0; - tcp_hostcache.hashsize = TCP_HOSTCACHE_HASHSIZE; - tcp_hostcache.bucket_limit = TCP_HOSTCACHE_BUCKETLIMIT; - tcp_hostcache.cache_limit = - tcp_hostcache.hashsize * tcp_hostcache.bucket_limit; - tcp_hostcache.expire = TCP_HOSTCACHE_EXPIRE; - tcp_hostcache.prune = TCP_HOSTCACHE_PRUNE; + V_tcp_hostcache.cache_count = 0; + V_tcp_hostcache.hashsize = TCP_HOSTCACHE_HASHSIZE; + V_tcp_hostcache.bucket_limit = TCP_HOSTCACHE_BUCKETLIMIT; + V_tcp_hostcache.cache_limit = + V_tcp_hostcache.hashsize * V_tcp_hostcache.bucket_limit; + V_tcp_hostcache.expire = TCP_HOSTCACHE_EXPIRE; + V_tcp_hostcache.prune = TCP_HOSTCACHE_PRUNE; TUNABLE_INT_FETCH("net.inet.tcp.hostcache.hashsize", - &tcp_hostcache.hashsize); + &V_tcp_hostcache.hashsize); TUNABLE_INT_FETCH("net.inet.tcp.hostcache.cachelimit", - &tcp_hostcache.cache_limit); + &V_tcp_hostcache.cache_limit); TUNABLE_INT_FETCH("net.inet.tcp.hostcache.bucketlimit", - &tcp_hostcache.bucket_limit); - if (!powerof2(tcp_hostcache.hashsize)) { + &V_tcp_hostcache.bucket_limit); + if (!powerof2(V_tcp_hostcache.hashsize)) { printf("WARNING: hostcache hash size is not a power of 2.\n"); - tcp_hostcache.hashsize = TCP_HOSTCACHE_HASHSIZE; /* default */ + V_tcp_hostcache.hashsize = TCP_HOSTCACHE_HASHSIZE; /* default */ } - tcp_hostcache.hashmask = tcp_hostcache.hashsize - 1; + V_tcp_hostcache.hashmask = V_tcp_hostcache.hashsize - 1; /* * Allocate the hash table. */ - tcp_hostcache.hashbase = (struct hc_head *) - malloc(tcp_hostcache.hashsize * sizeof(struct hc_head), + V_tcp_hostcache.hashbase = (struct hc_head *) + malloc(V_tcp_hostcache.hashsize * sizeof(struct hc_head), M_HOSTCACHE, M_WAITOK | M_ZERO); /* * Initialize the hash buckets. 
*/ - for (i = 0; i < tcp_hostcache.hashsize; i++) { - TAILQ_INIT(&tcp_hostcache.hashbase[i].hch_bucket); - tcp_hostcache.hashbase[i].hch_length = 0; - mtx_init(&tcp_hostcache.hashbase[i].hch_mtx, "tcp_hc_entry", + for (i = 0; i < V_tcp_hostcache.hashsize; i++) { + TAILQ_INIT(&V_tcp_hostcache.hashbase[i].hch_bucket); + V_tcp_hostcache.hashbase[i].hch_length = 0; + mtx_init(&V_tcp_hostcache.hashbase[i].hch_mtx, "tcp_hc_entry", NULL, MTX_DEF); } /* * Allocate the hostcache entries. + * + * XXX don't need a separate zone for each hc instance - revisit!!! */ - tcp_hostcache.zone = uma_zcreate("hostcache", sizeof(struct hc_metrics), - NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); - uma_zone_set_max(tcp_hostcache.zone, tcp_hostcache.cache_limit); + V_tcp_hostcache.zone = + uma_zcreate("hostcache", sizeof(struct hc_metrics), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); + uma_zone_set_max(V_tcp_hostcache.zone, V_tcp_hostcache.cache_limit); /* * Set up periodic cache cleanup. */ - callout_init(&tcp_hc_callout, CALLOUT_MPSAFE); - callout_reset(&tcp_hc_callout, tcp_hostcache.prune * hz, tcp_hc_purge, 0); + callout_init(&V_tcp_hc_callout, CALLOUT_MPSAFE); + callout_reset(&V_tcp_hc_callout, V_tcp_hostcache.prune * hz, + tcp_hc_purge, curvnet); +} + +void +tcp_hc_destroy(void) +{ + INIT_VNET_INET(curvnet); + + /* XXX TODO walk the hashtable and free all entries */ + + callout_drain(&V_tcp_hc_callout); } /* @@ -267,6 +249,7 @@ static struct hc_metrics * tcp_hc_lookup(struct in_conninfo *inc) { + INIT_VNET_INET(curvnet); int hash; struct hc_head *hc_head; struct hc_metrics *hc_entry; @@ -281,7 +264,7 @@ else hash = HOSTCACHE_HASH(&inc->inc_faddr); - hc_head = &tcp_hostcache.hashbase[hash]; + hc_head = &V_tcp_hostcache.hashbase[hash]; /* * Acquire lock for this bucket row; we release the lock if we don't @@ -322,6 +305,7 @@ static struct hc_metrics * tcp_hc_insert(struct in_conninfo *inc) { + INIT_VNET_INET(curvnet); int hash; struct hc_head *hc_head; struct hc_metrics *hc_entry; @@ 
-336,7 +320,7 @@ else hash = HOSTCACHE_HASH(&inc->inc_faddr); - hc_head = &tcp_hostcache.hashbase[hash]; + hc_head = &V_tcp_hostcache.hashbase[hash]; /* * Acquire lock for this bucket row; we release the lock if we don't @@ -348,8 +332,8 @@ /* * If the bucket limit is reached, reuse the least-used element. */ - if (hc_head->hch_length >= tcp_hostcache.bucket_limit || - tcp_hostcache.cache_count >= tcp_hostcache.cache_limit) { + if (hc_head->hch_length >= V_tcp_hostcache.bucket_limit || + V_tcp_hostcache.cache_count >= V_tcp_hostcache.cache_limit) { hc_entry = TAILQ_LAST(&hc_head->hch_bucket, hc_qhead); /* * At first we were dropping the last element, just to @@ -365,17 +349,17 @@ return NULL; } TAILQ_REMOVE(&hc_head->hch_bucket, hc_entry, rmx_q); - tcp_hostcache.hashbase[hash].hch_length--; - tcp_hostcache.cache_count--; - tcpstat.tcps_hc_bucketoverflow++; + V_tcp_hostcache.hashbase[hash].hch_length--; + V_tcp_hostcache.cache_count--; + V_tcpstat.tcps_hc_bucketoverflow++; #if 0 - uma_zfree(tcp_hostcache.zone, hc_entry); + uma_zfree(V_tcp_hostcache.zone, hc_entry); #endif } else { /* * Allocate a new entry, or balk if not possible. */ - hc_entry = uma_zalloc(tcp_hostcache.zone, M_NOWAIT); + hc_entry = uma_zalloc(V_tcp_hostcache.zone, M_NOWAIT); if (hc_entry == NULL) { THC_UNLOCK(&hc_head->hch_mtx); return NULL; @@ -391,15 +375,15 @@ else hc_entry->ip4 = inc->inc_faddr; hc_entry->rmx_head = hc_head; - hc_entry->rmx_expire = tcp_hostcache.expire; + hc_entry->rmx_expire = V_tcp_hostcache.expire; /* * Put it upfront. 
*/ TAILQ_INSERT_HEAD(&hc_head->hch_bucket, hc_entry, rmx_q); - tcp_hostcache.hashbase[hash].hch_length++; - tcp_hostcache.cache_count++; - tcpstat.tcps_hc_added++; + V_tcp_hostcache.hashbase[hash].hch_length++; + V_tcp_hostcache.cache_count++; + V_tcpstat.tcps_hc_added++; return hc_entry; } @@ -412,6 +396,7 @@ void tcp_hc_get(struct in_conninfo *inc, struct hc_metrics_lite *hc_metrics_lite) { + INIT_VNET_INET(curvnet); struct hc_metrics *hc_entry; /* @@ -427,7 +412,7 @@ return; } hc_entry->rmx_hits++; - hc_entry->rmx_expire = tcp_hostcache.expire; /* start over again */ + hc_entry->rmx_expire = V_tcp_hostcache.expire; /* start over again */ hc_metrics_lite->rmx_mtu = hc_entry->rmx_mtu; hc_metrics_lite->rmx_ssthresh = hc_entry->rmx_ssthresh; @@ -452,6 +437,7 @@ u_long tcp_hc_getmtu(struct in_conninfo *inc) { + INIT_VNET_INET(curvnet); struct hc_metrics *hc_entry; u_long mtu; @@ -460,7 +446,7 @@ return 0; } hc_entry->rmx_hits++; - hc_entry->rmx_expire = tcp_hostcache.expire; /* start over again */ + hc_entry->rmx_expire = V_tcp_hostcache.expire; /* start over again */ mtu = hc_entry->rmx_mtu; THC_UNLOCK(&hc_entry->rmx_head->hch_mtx); @@ -474,6 +460,7 @@ void tcp_hc_updatemtu(struct in_conninfo *inc, u_long mtu) { + INIT_VNET_INET(curvnet); struct hc_metrics *hc_entry; /* @@ -490,7 +477,7 @@ return; } hc_entry->rmx_updates++; - hc_entry->rmx_expire = tcp_hostcache.expire; /* start over again */ + hc_entry->rmx_expire = V_tcp_hostcache.expire; /* start over again */ hc_entry->rmx_mtu = mtu; @@ -513,6 +500,7 @@ void tcp_hc_update(struct in_conninfo *inc, struct hc_metrics_lite *hcml) { + INIT_VNET_INET(curvnet); struct hc_metrics *hc_entry; hc_entry = tcp_hc_lookup(inc); @@ -522,7 +510,7 @@ return; } hc_entry->rmx_updates++; - hc_entry->rmx_expire = tcp_hostcache.expire; /* start over again */ + hc_entry->rmx_expire = V_tcp_hostcache.expire; /* start over again */ if (hcml->rmx_rtt != 0) { if (hc_entry->rmx_rtt == 0) @@ -530,7 +518,7 @@ else hc_entry->rmx_rtt = 
(hc_entry->rmx_rtt + hcml->rmx_rtt) / 2; - tcpstat.tcps_cachedrtt++; + V_tcpstat.tcps_cachedrtt++; } if (hcml->rmx_rttvar != 0) { if (hc_entry->rmx_rttvar == 0) @@ -538,7 +526,7 @@ else hc_entry->rmx_rttvar = (hc_entry->rmx_rttvar + hcml->rmx_rttvar) / 2; - tcpstat.tcps_cachedrttvar++; + V_tcpstat.tcps_cachedrttvar++; } if (hcml->rmx_ssthresh != 0) { if (hc_entry->rmx_ssthresh == 0) @@ -546,7 +534,7 @@ else hc_entry->rmx_ssthresh = (hc_entry->rmx_ssthresh + hcml->rmx_ssthresh) / 2; - tcpstat.tcps_cachedssthresh++; + V_tcpstat.tcps_cachedssthresh++; } if (hcml->rmx_bandwidth != 0) { if (hc_entry->rmx_bandwidth == 0) @@ -554,7 +542,7 @@ else hc_entry->rmx_bandwidth = (hc_entry->rmx_bandwidth + hcml->rmx_bandwidth) / 2; - /* tcpstat.tcps_cachedbandwidth++; */ + /* V_tcpstat.tcps_cachedbandwidth++; */ } if (hcml->rmx_cwnd != 0) { if (hc_entry->rmx_cwnd == 0) @@ -562,7 +550,7 @@ else hc_entry->rmx_cwnd = (hc_entry->rmx_cwnd + hcml->rmx_cwnd) / 2; - /* tcpstat.tcps_cachedcwnd++; */ + /* V_tcpstat.tcps_cachedcwnd++; */ } if (hcml->rmx_sendpipe != 0) { if (hc_entry->rmx_sendpipe == 0) @@ -570,7 +558,7 @@ else hc_entry->rmx_sendpipe = (hc_entry->rmx_sendpipe + hcml->rmx_sendpipe) /2; - /* tcpstat.tcps_cachedsendpipe++; */ + /* V_tcpstat.tcps_cachedsendpipe++; */ } if (hcml->rmx_recvpipe != 0) { if (hc_entry->rmx_recvpipe == 0) @@ -578,7 +566,7 @@ else hc_entry->rmx_recvpipe = (hc_entry->rmx_recvpipe + hcml->rmx_recvpipe) /2; - /* tcpstat.tcps_cachedrecvpipe++; */ + /* V_tcpstat.tcps_cachedrecvpipe++; */ } TAILQ_REMOVE(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q); @@ -593,6 +581,7 @@ static int sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS) { + INIT_VNET_INET(curvnet); int bufsize; int linesize = 128; char *p, *buf; @@ -602,7 +591,7 @@ char ip6buf[INET6_ADDRSTRLEN]; #endif - bufsize = linesize * (tcp_hostcache.cache_count + 1); + bufsize = linesize * (V_tcp_hostcache.cache_count + 1); p = buf = (char *)malloc(bufsize, M_TEMP, M_WAITOK|M_ZERO); @@ -612,9 +601,9 @@ p += len; 
#define msec(u) (((u) + 500) / 1000) - for (i = 0; i < tcp_hostcache.hashsize; i++) { - THC_LOCK(&tcp_hostcache.hashbase[i].hch_mtx); - TAILQ_FOREACH(hc_entry, &tcp_hostcache.hashbase[i].hch_bucket, + for (i = 0; i < V_tcp_hostcache.hashsize; i++) { + THC_LOCK(&V_tcp_hostcache.hashbase[i].hch_mtx); + TAILQ_FOREACH(hc_entry, &V_tcp_hostcache.hashbase[i].hch_bucket, rmx_q) { len = snprintf(p, linesize, "%-15s %5lu %8lu %6lums %6lums %9lu %8lu %8lu %8lu " @@ -640,7 +629,7 @@ hc_entry->rmx_expire); p += len; } - THC_UNLOCK(&tcp_hostcache.hashbase[i].hch_mtx); + THC_UNLOCK(&V_tcp_hostcache.hashbase[i].hch_mtx); } #undef msec error = SYSCTL_OUT(req, buf, p - buf); @@ -655,29 +644,36 @@ static void tcp_hc_purge(void *arg) { + CURVNET_SET((struct vnet *) arg); + INIT_VNET_INET(curvnet); struct hc_metrics *hc_entry, *hc_next; - int all = (intptr_t)arg; + int all = 0; int i; - if (tcp_hostcache.purgeall) { + if (V_tcp_hostcache.purgeall) { all = 1; - tcp_hostcache.purgeall = 0; + V_tcp_hostcache.purgeall = 0; } - for (i = 0; i < tcp_hostcache.hashsize; i++) { - THC_LOCK(&tcp_hostcache.hashbase[i].hch_mtx); - TAILQ_FOREACH_SAFE(hc_entry, &tcp_hostcache.hashbase[i].hch_bucket, - rmx_q, hc_next) { + for (i = 0; i < V_tcp_hostcache.hashsize; i++) { + THC_LOCK(&V_tcp_hostcache.hashbase[i].hch_mtx); + TAILQ_FOREACH_SAFE(hc_entry, + &V_tcp_hostcache.hashbase[i].hch_bucket, + rmx_q, hc_next) { if (all || hc_entry->rmx_expire <= 0) { - TAILQ_REMOVE(&tcp_hostcache.hashbase[i].hch_bucket, + TAILQ_REMOVE(&V_tcp_hostcache.hashbase[i].hch_bucket, hc_entry, rmx_q); - uma_zfree(tcp_hostcache.zone, hc_entry); - tcp_hostcache.hashbase[i].hch_length--; - tcp_hostcache.cache_count--; + uma_zfree(V_tcp_hostcache.zone, hc_entry); + V_tcp_hostcache.hashbase[i].hch_length--; + V_tcp_hostcache.cache_count--; } else - hc_entry->rmx_expire -= tcp_hostcache.prune; + hc_entry->rmx_expire -= V_tcp_hostcache.prune; } - THC_UNLOCK(&tcp_hostcache.hashbase[i].hch_mtx); + 
THC_UNLOCK(&V_tcp_hostcache.hashbase[i].hch_mtx); } - callout_reset(&tcp_hc_callout, tcp_hostcache.prune * hz, tcp_hc_purge, 0); + + callout_reset(&V_tcp_hc_callout, V_tcp_hostcache.prune * hz, + tcp_hc_purge, arg); + + CURVNET_RESTORE(); } --- /dev/null 2008-02-27 21:11:00.000000000 +0100 +++ src/sys/netinet/tcp_hostcache.h 2007-10-05 12:27:16.000000000 +0200 @@ -0,0 +1,82 @@ +/*- + * Copyright (c) 2002 Andre Oppermann, Internet Business Solutions AG + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * XXX RCS ID + */ + +/* + * Many thanks to jlemon for basic structure of tcp_syncache which is being + * followed here. + */ + +#ifndef _NETINET_TCP_HOSTCACHE_H_ +#define _NETINET_TCP_HOSTCACHE_H_ + +TAILQ_HEAD(hc_qhead, hc_metrics); + +struct hc_head { + struct hc_qhead hch_bucket; + u_int hch_length; + struct mtx hch_mtx; +}; + +struct hc_metrics { + /* housekeeping */ + TAILQ_ENTRY(hc_metrics) rmx_q; + struct hc_head *rmx_head; /* head of bucket tail queue */ + struct in_addr ip4; /* IP address */ + struct in6_addr ip6; /* IP6 address */ + /* endpoint specific values for tcp */ + u_long rmx_mtu; /* MTU for this path */ + u_long rmx_ssthresh; /* outbound gateway buffer limit */ + u_long rmx_rtt; /* estimated round trip time */ + u_long rmx_rttvar; /* estimated rtt variance */ + u_long rmx_bandwidth; /* estimated bandwidth */ + u_long rmx_cwnd; /* congestion window */ + u_long rmx_sendpipe; /* outbound delay-bandwidth product */ + u_long rmx_recvpipe; /* inbound delay-bandwidth product */ + /* TCP hostcache internal data */ + int rmx_expire; /* lifetime for object */ + u_long rmx_hits; /* number of hits */ + u_long rmx_updates; /* number of updates */ +}; + +struct tcp_hostcache { + struct hc_head *hashbase; + uma_zone_t zone; + u_int hashsize; + u_int hashmask; + u_int bucket_limit; + u_int cache_count; + u_int cache_limit; + int expire; + int prune; + int purgeall; +}; + +#endif /* !_NETINET_TCP_HOSTCACHE_H_*/ --- /u/marko/p4/head/src/sys/netinet/tcp_input.c 2007-10-29 17:17:43.000000000 +0100 +++ src/sys/netinet/tcp_input.c 2007-12-10 11:26:13.000000000 +0100 @@ -38,6 +38,7 @@ #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_tcpdebug.h" +#include "opt_vimage.h" #include #include @@ -51,16 +52,19 @@ #include #include #include +#include #include /* before tcp_seq.h, for tcp_random18() */ #include +#include #include #include #define TCPSTATES /* for logging */ +#include #include #include #include @@ -72,6 +76,7 @@ #include #include #include 
+#include #include #include #include @@ -90,6 +95,7 @@ #ifdef IPSEC #include #include +#include #endif /*IPSEC*/ #include @@ -98,57 +104,62 @@ static const int tcprexmtthresh = 3; -struct tcpstat tcpstat; -SYSCTL_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RW, - &tcpstat , tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)"); +#ifndef VIMAGE +struct inpcbhead tcb; +struct inpcbinfo tcbinfo; +struct tcpstat tcpstat; +int blackhole; +int tcp_delack_enabled; +int drop_synfin; +int tcp_do_rfc3042; +int tcp_do_rfc3390; +int tcp_insecure_rst; +int tcp_do_autorcvbuf; +int tcp_autorcvbuf_inc; +int tcp_autorcvbuf_max; +#endif + +SYSCTL_V_STRUCT(V_NET, vnet_inet, _net_inet_tcp, TCPCTL_STATS, stats, + CTLFLAG_RW, tcpstat , tcpstat, + "TCP statistics (struct tcpstat, netinet/tcp_var.h)"); int tcp_log_in_vain = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW, &tcp_log_in_vain, 0, "Log all incoming TCP segments to closed ports"); -static int blackhole = 0; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW, - &blackhole, 0, "Do not send RST on segments to closed ports"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW, + blackhole, 0, "Do not send RST on segments to closed ports"); -int tcp_delack_enabled = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW, - &tcp_delack_enabled, 0, +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, delayed_ack, + CTLFLAG_RW, tcp_delack_enabled, 0, "Delay ACK to try and piggyback it onto a data packet"); -static int drop_synfin = 0; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW, - &drop_synfin, 0, "Drop TCP packets with SYN+FIN set"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, drop_synfin, + CTLFLAG_RW, drop_synfin, 0, "Drop TCP packets with SYN+FIN set"); -static int tcp_do_rfc3042 = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_RW, - &tcp_do_rfc3042, 0, "Enable RFC 3042 (Limited Transmit)"); +SYSCTL_V_INT(V_NET, 
vnet_inet, _net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_RW, + tcp_do_rfc3042, 0, "Enable RFC 3042 (Limited Transmit)"); -static int tcp_do_rfc3390 = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW, - &tcp_do_rfc3390, 0, +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW, + tcp_do_rfc3390, 0, "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)"); -static int tcp_insecure_rst = 0; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_RW, - &tcp_insecure_rst, 0, +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, insecure_rst, + CTLFLAG_RW, tcp_insecure_rst, 0, "Follow the old (insecure) criteria for accepting RST packets"); -int tcp_do_autorcvbuf = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_RW, - &tcp_do_autorcvbuf, 0, "Enable automatic receive buffer sizing"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, recvbuf_auto, + CTLFLAG_RW, tcp_do_autorcvbuf, 0, + "Enable automatic receive buffer sizing"); -int tcp_autorcvbuf_inc = 16*1024; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_RW, - &tcp_autorcvbuf_inc, 0, +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, recvbuf_inc, + CTLFLAG_RW, tcp_autorcvbuf_inc, 0, "Incrementor step size of automatic receive buffer"); -int tcp_autorcvbuf_max = 256*1024; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW, - &tcp_autorcvbuf_max, 0, "Max size of automatic receive buffer"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, recvbuf_max, + CTLFLAG_RW, tcp_autorcvbuf_max, 0, + "Max size of automatic receive buffer"); -struct inpcbhead tcb; -#define tcb6 tcb /* for KAME src sync over BSD*'s */ -struct inpcbinfo tcbinfo; static void tcp_dooptions(struct tcpopt *, u_char *, int, int); static void tcp_do_segment(struct mbuf *, struct tcphdr *, @@ -183,8 +194,7 @@ #define DELAY_ACK(tp) \ ((!tcp_timer_active(tp, TT_DELACK) && \ (tp->t_flags & TF_RXWIN0SENT) == 0) && \ - (tcp_delack_enabled || (tp->t_flags & 
TF_NEEDSYN))) - + (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN))) /* * TCP input handling is split into multiple parts: @@ -199,6 +209,7 @@ int tcp6_input(struct mbuf **mp, int *offp, int proto) { + INIT_VNET_INET6(curvnet); struct mbuf *m = *mp; struct in6_ifaddr *ia6; @@ -226,6 +237,13 @@ void tcp_input(struct mbuf *m, int off0) { + INIT_VNET_INET(curvnet); +#ifdef INET6 + INIT_VNET_INET6(curvnet); +#endif +#ifdef IPSEC + INIT_VNET_IPSEC(curvnet); +#endif struct tcphdr *th; struct ip *ip = NULL; struct ipovly *ipov; @@ -266,7 +284,7 @@ #endif to.to_flags = 0; - tcpstat.tcps_rcvtotal++; + V_tcpstat.tcps_rcvtotal++; if (isipv6) { #ifdef INET6 @@ -274,7 +292,7 @@ ip6 = mtod(m, struct ip6_hdr *); tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0; if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) { - tcpstat.tcps_rcvbadsum++; + V_tcpstat.tcps_rcvbadsum++; goto drop; } th = (struct tcphdr *)((caddr_t)ip6 + off0); @@ -306,7 +324,7 @@ if (m->m_len < sizeof (struct tcpiphdr)) { if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == NULL) { - tcpstat.tcps_rcvshort++; + V_tcpstat.tcps_rcvshort++; return; } } @@ -340,7 +358,7 @@ th->th_sum = in_cksum(m, len); } if (th->th_sum) { - tcpstat.tcps_rcvbadsum++; + V_tcpstat.tcps_rcvbadsum++; goto drop; } /* Re-initialization for later version check */ @@ -353,7 +371,7 @@ */ off = th->th_off << 2; if (off < sizeof (struct tcphdr) || off > tlen) { - tcpstat.tcps_rcvbadoff++; + V_tcpstat.tcps_rcvbadoff++; goto drop; } tlen -= off; /* tlen is used instead of ti->ti_len */ @@ -368,7 +386,7 @@ if (m->m_len < sizeof(struct ip) + off) { if ((m = m_pullup(m, sizeof (struct ip) + off)) == NULL) { - tcpstat.tcps_rcvshort++; + V_tcpstat.tcps_rcvshort++; return; } ip = mtod(m, struct ip *); @@ -397,9 +415,9 @@ /* * Locate pcb for segment. 
*/ - INP_INFO_WLOCK(&tcbinfo); + INP_INFO_WLOCK(&V_tcbinfo); findpcb: - INP_INFO_WLOCK_ASSERT(&tcbinfo); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); #ifdef IPFIREWALL_FORWARD /* * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. @@ -414,13 +432,13 @@ * Transparently forwarded. Pretend to be the destination. * already got one like this? */ - inp = in_pcblookup_hash(&tcbinfo, + inp = in_pcblookup_hash(&V_tcbinfo, ip->ip_src, th->th_sport, ip->ip_dst, th->th_dport, 0, m->m_pkthdr.rcvif); if (!inp) { /* It's new. Try to find the ambushing socket. */ - inp = in_pcblookup_hash(&tcbinfo, + inp = in_pcblookup_hash(&V_tcbinfo, ip->ip_src, th->th_sport, next_hop->sin_addr, next_hop->sin_port ? @@ -436,14 +454,14 @@ { if (isipv6) { #ifdef INET6 - inp = in6_pcblookup_hash(&tcbinfo, + inp = in6_pcblookup_hash(&V_tcbinfo, &ip6->ip6_src, th->th_sport, &ip6->ip6_dst, th->th_dport, INPLOOKUP_WILDCARD, m->m_pkthdr.rcvif); #endif } else - inp = in_pcblookup_hash(&tcbinfo, + inp = in_pcblookup_hash(&V_tcbinfo, ip->ip_src, th->th_sport, ip->ip_dst, th->th_dport, INPLOOKUP_WILDCARD, @@ -469,8 +487,8 @@ * When blackholing do not respond with a RST but * completely ignore the segment and drop it. 
*/ - if ((blackhole == 1 && (thflags & TH_SYN)) || - blackhole == 2) + if ((V_blackhole == 1 && (thflags & TH_SYN)) || + V_blackhole == 2) goto dropunlock; rstreason = BANDLIM_RST_CLOSEDPORT; @@ -481,12 +499,12 @@ #ifdef IPSEC #ifdef INET6 if (isipv6 && ipsec6_in_reject(m, inp)) { - ipsec6stat.in_polvio++; + V_ipsec6stat.in_polvio++; goto dropunlock; } else #endif /* INET6 */ if (ipsec4_in_reject(m, inp) != 0) { - ipsec4stat.in_polvio++; + V_ipsec4stat.in_polvio++; goto dropunlock; } #endif /* IPSEC */ @@ -518,7 +536,7 @@ */ if (tcp_twcheck(inp, &to, th, m, tlen)) goto findpcb; - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); return; } /* @@ -618,9 +636,10 @@ log(LOG_DEBUG, "%s; %s: Listen socket: " "Socket allocation failed due to " "limits or memory shortage, %s\n", - s, __func__, (tcp_sc_rst_sock_fail ? - "sending RST" : "try again")); - if (tcp_sc_rst_sock_fail) { + s, __func__, + V_tcp_sc_rst_sock_fail ? + "sending RST" : "try again"); + if (V_tcp_sc_rst_sock_fail) { rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } else @@ -643,7 +662,7 @@ * the mbuf chain and unlocks the inpcb. */ tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen); - INP_INFO_UNLOCK_ASSERT(&tcbinfo); + INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); return; } /* @@ -668,7 +687,7 @@ log(LOG_DEBUG, "%s; %s: Listen socket: " "SYN is missing, segment ignored\n", s, __func__); - tcpstat.tcps_badsyn++; + V_tcpstat.tcps_badsyn++; goto dropunlock; } /* @@ -680,7 +699,7 @@ "SYN|ACK invalid, segment rejected\n", s, __func__); syncache_badack(&inc); /* XXX: Not needed! */ - tcpstat.tcps_badsyn++; + V_tcpstat.tcps_badsyn++; rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } @@ -695,12 +714,12 @@ * XXX: This is a violation of the TCP specification * and was used by RFC1644. 
*/ - if ((thflags & TH_FIN) && drop_synfin) { + if ((thflags & TH_FIN) && V_drop_synfin) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "SYN|FIN segment ignored (based on " "sysctl setting)\n", s, __func__); - tcpstat.tcps_badsyn++; + V_tcpstat.tcps_badsyn++; goto dropunlock; } /* @@ -745,7 +764,7 @@ * handling - worse, they are not exactly the same. * I believe 5.5.4 is the best one, so we follow 5.5.4. */ - if (isipv6 && !ip6_use_deprecated) { + if (isipv6 && !V_ip6_use_deprecated) { struct in6_ifaddr *ia6; if ((ia6 = ip6_getdstifaddr(m)) && @@ -833,7 +852,7 @@ * Entry added to syncache and mbuf consumed. * Everything already unlocked by syncache_add(). */ - INP_INFO_UNLOCK_ASSERT(&tcbinfo); + INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); return; } @@ -843,20 +862,20 @@ * the inpcb, and unlocks pcbinfo. */ tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen); - INP_INFO_UNLOCK_ASSERT(&tcbinfo); + INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); return; dropwithreset: - INP_INFO_WLOCK_ASSERT(&tcbinfo); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); tcp_dropwithreset(m, th, tp, tlen, rstreason); m = NULL; /* mbuf chain got consumed. 
*/ dropunlock: - INP_INFO_WLOCK_ASSERT(&tcbinfo); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); if (inp != NULL) INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); drop: - INP_INFO_UNLOCK_ASSERT(&tcbinfo); + INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); if (s != NULL) free(s, M_TCPLOG); if (m != NULL) @@ -868,6 +887,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int drop_hdrlen, int tlen) { + INIT_VNET_INET(tp->t_vnet); int thflags, acked, ourfinisacked, needoutput = 0; int headlocked = 1; int rstreason, todrop, win; @@ -885,7 +905,7 @@ #endif thflags = th->th_flags; - INP_INFO_WLOCK_ASSERT(&tcbinfo); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_LOCK_ASSERT(tp->t_inpcb); KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", __func__)); @@ -1000,28 +1020,28 @@ if (SEQ_GT(th->th_ack, tp->snd_una) && SEQ_LEQ(th->th_ack, tp->snd_max) && tp->snd_cwnd >= tp->snd_wnd && - ((!tcp_do_newreno && + ((!V_tcp_do_newreno && !(tp->t_flags & TF_SACK_PERMIT) && tp->t_dupacks < tcprexmtthresh) || - ((tcp_do_newreno || + ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) && !IN_FASTRECOVERY(tp) && (to.to_flags & TOF_SACK) == 0 && TAILQ_EMPTY(&tp->snd_holes)))) { KASSERT(headlocked, ("%s: headlocked", __func__)); - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); headlocked = 0; /* * This is a pure ack for outstanding data. */ - ++tcpstat.tcps_predack; + ++V_tcpstat.tcps_predack; /* * "bad retransmit" recovery. 
*/ if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) { - ++tcpstat.tcps_sndrexmitbad; + ++V_tcpstat.tcps_sndrexmitbad; tp->snd_cwnd = tp->snd_cwnd_prev; tp->snd_ssthresh = tp->snd_ssthresh_prev; @@ -1057,8 +1077,8 @@ } tcp_xmit_bandwidth_limit(tp, th->th_ack); acked = th->th_ack - tp->snd_una; - tcpstat.tcps_rcvackpack++; - tcpstat.tcps_rcvackbyte += acked; + V_tcpstat.tcps_rcvackpack++; + V_tcpstat.tcps_rcvackbyte += acked; sbdrop(&so->so_snd, acked); if (SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) @@ -1103,7 +1123,7 @@ int newsize = 0; /* automatic sockbuf scaling */ KASSERT(headlocked, ("%s: headlocked", __func__)); - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); headlocked = 0; /* * This is a pure, in-sequence data packet @@ -1113,7 +1133,7 @@ /* Clean receiver SACK report if present */ if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks) tcp_clean_sackreport(tp); - ++tcpstat.tcps_preddat; + ++V_tcpstat.tcps_preddat; tp->rcv_nxt += tlen; /* * Pull snd_wl1 up to prevent seq wrap relative to @@ -1125,8 +1145,8 @@ * rcv_nxt. */ tp->rcv_up = tp->rcv_nxt; - tcpstat.tcps_rcvpack++; - tcpstat.tcps_rcvbyte += tlen; + V_tcpstat.tcps_rcvpack++; + V_tcpstat.tcps_rcvbyte += tlen; ND6_HINT(tp); /* Some progress has been made */ #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) @@ -1166,7 +1186,7 @@ * TODO: Only step up if the application is actually serving * the buffer to better manage the socket buffer resources. */ - if (tcp_do_autorcvbuf && + if (V_tcp_do_autorcvbuf && to.to_tsecr && (so->so_rcv.sb_flags & SB_AUTOSIZE)) { if (to.to_tsecr > tp->rfbuf_ts && @@ -1174,11 +1194,11 @@ if (tp->rfbuf_cnt > (so->so_rcv.sb_hiwat / 8 * 7) && so->so_rcv.sb_hiwat < - tcp_autorcvbuf_max) { + V_tcp_autorcvbuf_max) { newsize = min(so->so_rcv.sb_hiwat + - tcp_autorcvbuf_inc, - tcp_autorcvbuf_max); + V_tcp_autorcvbuf_inc, + V_tcp_autorcvbuf_max); } /* Start over with next RTT. 
*/ tp->rfbuf_ts = 0; @@ -1274,7 +1294,7 @@ tp->irs = th->th_seq; tcp_rcvseqinit(tp); if (thflags & TH_ACK) { - tcpstat.tcps_connects++; + V_tcpstat.tcps_connects++; soisconnected(so); #ifdef MAC SOCK_LOCK(so); @@ -1343,8 +1363,8 @@ m_adj(m, -todrop); tlen = tp->rcv_wnd; thflags &= ~TH_FIN; - tcpstat.tcps_rcvpackafterwin++; - tcpstat.tcps_rcvbyteafterwin += todrop; + V_tcpstat.tcps_rcvpackafterwin++; + V_tcpstat.tcps_rcvbyteafterwin += todrop; } tp->snd_wl1 = th->th_seq - 1; tp->rcv_up = th->th_seq; @@ -1443,12 +1463,12 @@ goto close; case TCPS_ESTABLISHED: - if (tcp_insecure_rst == 0 && + if (V_tcp_insecure_rst == 0 && !(SEQ_GEQ(th->th_seq, tp->rcv_nxt - 1) && SEQ_LEQ(th->th_seq, tp->rcv_nxt + 1)) && !(SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) && SEQ_LEQ(th->th_seq, tp->last_ack_sent + 1))) { - tcpstat.tcps_badrst++; + V_tcpstat.tcps_badrst++; goto drop; } /* FALLTHROUGH */ @@ -1458,7 +1478,7 @@ so->so_error = ECONNRESET; close: tp->t_state = TCPS_CLOSED; - tcpstat.tcps_drops++; + V_tcpstat.tcps_drops++; KASSERT(headlocked, ("%s: trimthenstep6: " "tcp_close: head not locked", __func__)); tp = tcp_close(tp); @@ -1497,9 +1517,9 @@ */ tp->ts_recent = 0; } else { - tcpstat.tcps_rcvduppack++; - tcpstat.tcps_rcvdupbyte += tlen; - tcpstat.tcps_pawsdrop++; + V_tcpstat.tcps_rcvduppack++; + V_tcpstat.tcps_rcvdupbyte += tlen; + V_tcpstat.tcps_pawsdrop++; if (tlen) goto dropafterack; goto drop; @@ -1547,11 +1567,11 @@ */ tp->t_flags |= TF_ACKNOW; todrop = tlen; - tcpstat.tcps_rcvduppack++; - tcpstat.tcps_rcvdupbyte += todrop; + V_tcpstat.tcps_rcvduppack++; + V_tcpstat.tcps_rcvdupbyte += todrop; } else { - tcpstat.tcps_rcvpartduppack++; - tcpstat.tcps_rcvpartdupbyte += todrop; + V_tcpstat.tcps_rcvpartduppack++; + V_tcpstat.tcps_rcvpartdupbyte += todrop; } drop_hdrlen += todrop; /* drop from the top afterwards */ th->th_seq += todrop; @@ -1581,7 +1601,7 @@ free(s, M_TCPLOG); } tp = tcp_close(tp); - tcpstat.tcps_rcvafterclose++; + V_tcpstat.tcps_rcvafterclose++; rstreason = 
BANDLIM_UNLIMITED; goto dropwithreset; } @@ -1592,9 +1612,9 @@ */ todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); if (todrop > 0) { - tcpstat.tcps_rcvpackafterwin++; + V_tcpstat.tcps_rcvpackafterwin++; if (todrop >= tlen) { - tcpstat.tcps_rcvbyteafterwin += tlen; + V_tcpstat.tcps_rcvbyteafterwin += tlen; /* * If window is closed can only take segments at * window edge, and have to drop data and PUSH from @@ -1604,11 +1624,11 @@ */ if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { tp->t_flags |= TF_ACKNOW; - tcpstat.tcps_rcvwinprobe++; + V_tcpstat.tcps_rcvwinprobe++; } else goto dropafterack; } else - tcpstat.tcps_rcvbyteafterwin += todrop; + V_tcpstat.tcps_rcvbyteafterwin += todrop; m_adj(m, -todrop); tlen -= todrop; thflags &= ~(TH_PUSH|TH_FIN); @@ -1679,7 +1699,7 @@ */ case TCPS_SYN_RECEIVED: - tcpstat.tcps_connects++; + V_tcpstat.tcps_connects++; soisconnected(so); /* Do window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == @@ -1725,7 +1745,7 @@ case TCPS_CLOSING: case TCPS_LAST_ACK: if (SEQ_GT(th->th_ack, tp->snd_max)) { - tcpstat.tcps_rcvacktoomuch++; + V_tcpstat.tcps_rcvacktoomuch++; goto dropafterack; } if ((tp->t_flags & TF_SACK_PERMIT) && @@ -1734,7 +1754,7 @@ tcp_sack_doack(tp, &to, th->th_ack); if (SEQ_LEQ(th->th_ack, tp->snd_una)) { if (tlen == 0 && tiwin == tp->snd_wnd) { - tcpstat.tcps_rcvdupack++; + V_tcpstat.tcps_rcvdupack++; /* * If we have outstanding data (other than * a window probe), this is a completely @@ -1763,7 +1783,7 @@ th->th_ack != tp->snd_una) tp->t_dupacks = 0; else if (++tp->t_dupacks > tcprexmtthresh || - ((tcp_do_newreno || + ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) && IN_FASTRECOVERY(tp))) { if ((tp->t_flags & TF_SACK_PERMIT) && @@ -1803,7 +1823,7 @@ tp->t_dupacks = 0; break; } - } else if (tcp_do_newreno) { + } else if (V_tcp_do_newreno) { if (SEQ_LEQ(th->th_ack, tp->snd_recover)) { tp->t_dupacks = 0; @@ -1820,7 +1840,7 @@ tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; if 
(tp->t_flags & TF_SACK_PERMIT) { - tcpstat.tcps_sack_recovery_episode++; + V_tcpstat.tcps_sack_recovery_episode++; tp->sack_newdata = tp->snd_nxt; tp->snd_cwnd = tp->t_maxseg; (void) tcp_output(tp); @@ -1838,7 +1858,7 @@ if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; goto drop; - } else if (tcp_do_rfc3042) { + } else if (V_tcp_do_rfc3042) { u_long oldcwnd = tp->snd_cwnd; tcp_seq oldsndmax = tp->snd_max; u_int sent; @@ -1880,7 +1900,7 @@ * If the congestion window was inflated to account * for the other side's cached packets, retract it. */ - if (tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) { + if (V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) { if (IN_FASTRECOVERY(tp)) { if (SEQ_LT(th->th_ack, tp->snd_recover)) { if (tp->t_flags & TF_SACK_PERMIT) @@ -1941,8 +1961,8 @@ INP_LOCK_ASSERT(tp->t_inpcb); acked = th->th_ack - tp->snd_una; - tcpstat.tcps_rcvackpack++; - tcpstat.tcps_rcvackbyte += acked; + V_tcpstat.tcps_rcvackpack++; + V_tcpstat.tcps_rcvackbyte += acked; /* * If we just performed our first retransmit, and the ACK @@ -1952,7 +1972,7 @@ * we left off. */ if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) { - ++tcpstat.tcps_sndrexmitbad; + ++V_tcpstat.tcps_sndrexmitbad; tp->snd_cwnd = tp->snd_cwnd_prev; tp->snd_ssthresh = tp->snd_ssthresh_prev; tp->snd_recover = tp->snd_recover_prev; @@ -2014,7 +2034,7 @@ * Otherwise open linearly: maxseg per window * (maxseg^2 / cwnd per packet). */ - if ((!tcp_do_newreno && !(tp->t_flags & TF_SACK_PERMIT)) || + if ((!V_tcp_do_newreno && !(tp->t_flags & TF_SACK_PERMIT)) || !IN_FASTRECOVERY(tp)) { u_int cw = tp->snd_cwnd; u_int incr = tp->t_maxseg; @@ -2035,12 +2055,12 @@ /* NB: sowwakeup_locked() does an implicit unlock. */ sowwakeup_locked(so); /* Detect una wraparound. 
*/ - if ((tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) && + if ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) && !IN_FASTRECOVERY(tp) && SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; - if ((tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) && + if ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) && IN_FASTRECOVERY(tp) && SEQ_GEQ(th->th_ack, tp->snd_recover)) EXIT_FASTRECOVERY(tp); @@ -2095,7 +2115,7 @@ KASSERT(headlocked, ("%s: process_ACK: " "head not locked", __func__)); tcp_twstart(tp); - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); headlocked = 0; m_freem(m); return; @@ -2134,7 +2154,7 @@ /* keep track of pure window updates */ if (tlen == 0 && tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) - tcpstat.tcps_rcvwinupd++; + V_tcpstat.tcps_rcvwinupd++; tp->snd_wnd = tiwin; tp->snd_wl1 = th->th_seq; tp->snd_wl2 = th->th_ack; @@ -2242,8 +2262,8 @@ tp->t_flags |= TF_ACKNOW; tp->rcv_nxt += tlen; thflags = th->th_flags & TH_FIN; - tcpstat.tcps_rcvpack++; - tcpstat.tcps_rcvbyte += tlen; + V_tcpstat.tcps_rcvpack++; + V_tcpstat.tcps_rcvbyte += tlen; ND6_HINT(tp); SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) @@ -2328,11 +2348,11 @@ KASSERT(headlocked == 1, ("%s: dodata: " "TCP_FIN_WAIT_2: head not locked", __func__)); tcp_twstart(tp); - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); return; } } - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); headlocked = 0; #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) @@ -2349,7 +2369,7 @@ check_delack: KASSERT(headlocked == 0, ("%s: check_delack: head locked", __func__)); - INP_INFO_UNLOCK_ASSERT(&tcbinfo); + INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); INP_LOCK_ASSERT(tp->t_inpcb); if (tp->t_flags & TF_DELACK) { tp->t_flags &= ~TF_DELACK; @@ -2387,7 +2407,7 @@ &tcp_savetcp, 0); #endif KASSERT(headlocked, ("%s: headlocked should be 1", __func__)); - INP_INFO_WUNLOCK(&tcbinfo); + 
INP_INFO_WUNLOCK(&V_tcbinfo); tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); INP_UNLOCK(tp->t_inpcb); @@ -2402,7 +2422,7 @@ if (tp != NULL) INP_UNLOCK(tp->t_inpcb); if (headlocked) - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); return; drop: @@ -2417,7 +2437,7 @@ if (tp != NULL) INP_UNLOCK(tp->t_inpcb); if (headlocked) - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); m_freem(m); return; } @@ -2482,6 +2502,7 @@ static void tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags) { + INIT_VNET_INET(curvnet); int opt, optlen; to->to_flags = 0; @@ -2547,7 +2568,7 @@ continue; if (!(flags & TO_SYN)) continue; - if (!tcp_do_sack) + if (!V_tcp_do_sack) continue; to->to_flags |= TOF_SACKPERM; break; @@ -2559,7 +2580,7 @@ to->to_flags |= TOF_SACK; to->to_nsacks = (optlen - 2) / TCPOLEN_SACK; to->to_sacks = cp + 2; - tcpstat.tcps_sack_rcv_blocks++; + V_tcpstat.tcps_sack_rcv_blocks++; break; default: continue; @@ -2607,11 +2628,12 @@ static void tcp_xmit_timer(struct tcpcb *tp, int rtt) { + INIT_VNET_INET(tp->t_inpcb->inp_vnet); int delta; INP_LOCK_ASSERT(tp->t_inpcb); - tcpstat.tcps_rttupdated++; + V_tcpstat.tcps_rttupdated++; tp->t_rttupdated++; if (tp->t_srtt != 0) { /* @@ -2712,6 +2734,7 @@ void tcp_mss(struct tcpcb *tp, int offer) { + INIT_VNET_INET(tp->t_inpcb->inp_vnet); int rtt, mss; u_long bufsize; u_long maxmtu; @@ -2733,12 +2756,12 @@ #ifdef INET6 if (isipv6) { maxmtu = tcp_maxmtu6(&inp->inp_inc, &mtuflags); - tp->t_maxopd = tp->t_maxseg = tcp_v6mssdflt; + tp->t_maxopd = tp->t_maxseg = V_tcp_v6mssdflt; } else #endif { maxmtu = tcp_maxmtu(&inp->inp_inc, &mtuflags); - tp->t_maxopd = tp->t_maxseg = tcp_mssdflt; + tp->t_maxopd = tp->t_maxseg = V_tcp_mssdflt; } so = inp->inp_socket; @@ -2757,9 +2780,9 @@ */ offer = #ifdef INET6 - isipv6 ? tcp_v6mssdflt : + isipv6 ? V_tcp_v6mssdflt : #endif - tcp_mssdflt; + V_tcp_mssdflt; break; case -1: @@ -2773,7 +2796,7 @@ * Prevent DoS attack with too small MSS. Round up * to at least minmss. 
*/ - offer = max(offer, tcp_minmss); + offer = max(offer, V_tcp_minmss); /* * Sanity check: make sure that maxopd will be large * enough to allow some data on segments even if the @@ -2798,16 +2821,16 @@ #ifdef INET6 if (isipv6) { mss = maxmtu - min_protoh; - if (!path_mtu_discovery && + if (!V_path_mtu_discovery && !in6_localaddr(&inp->in6p_faddr)) - mss = min(mss, tcp_v6mssdflt); + mss = min(mss, V_tcp_v6mssdflt); } else #endif { mss = maxmtu - min_protoh; - if (!path_mtu_discovery && + if (!V_path_mtu_discovery && !in_localaddr(inp->inp_faddr)) - mss = min(mss, tcp_mssdflt); + mss = min(mss, V_tcp_mssdflt); } } mss = min(mss, offer); @@ -2883,10 +2906,10 @@ if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) { tp->t_srtt = rtt; tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE; - tcpstat.tcps_usedrtt++; + V_tcpstat.tcps_usedrtt++; if (metrics.rmx_rttvar) { tp->t_rttvar = metrics.rmx_rttvar; - tcpstat.tcps_usedrttvar++; + V_tcpstat.tcps_usedrttvar++; } else { /* default variation is +- 1 rtt */ tp->t_rttvar = @@ -2904,7 +2927,7 @@ * threshold to no less than 2*mss. */ tp->snd_ssthresh = max(2 * mss, metrics.rmx_ssthresh); - tcpstat.tcps_usedssthresh++; + V_tcpstat.tcps_usedssthresh++; } if (metrics.rmx_bandwidth) tp->snd_bandwidth = metrics.rmx_bandwidth; @@ -2933,7 +2956,7 @@ min(tp->snd_wnd, so->so_snd.sb_hiwat))); else #endif - if (tcp_do_rfc3390) + if (V_tcp_do_rfc3390) tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380)); #ifdef INET6 else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) || @@ -2941,9 +2964,9 @@ #else else if (in_localaddr(inp->inp_faddr)) #endif - tp->snd_cwnd = mss * ss_fltsz_local; + tp->snd_cwnd = mss * V_ss_fltsz_local; else - tp->snd_cwnd = mss * ss_fltsz; + tp->snd_cwnd = mss * V_ss_fltsz; /* Check the interface for TSO capabilities. 
*/ if (mtuflags & CSUM_TSO) @@ -2956,6 +2979,7 @@ int tcp_mssopt(struct in_conninfo *inc) { + INIT_VNET_INET(curvnet); int mss = 0; u_long maxmtu = 0; u_long thcmtu = 0; @@ -2968,14 +2992,14 @@ #ifdef INET6 if (isipv6) { - mss = tcp_v6mssdflt; + mss = V_tcp_v6mssdflt; maxmtu = tcp_maxmtu6(inc, NULL); thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); } else #endif { - mss = tcp_mssdflt; + mss = V_tcp_mssdflt; maxmtu = tcp_maxmtu(inc, NULL); thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ min_protoh = sizeof(struct tcpiphdr); --- /u/marko/p4/head/src/sys/netinet/tcp_output.c 2007-12-03 11:00:10.000000000 +0100 +++ src/sys/netinet/tcp_output.c 2007-12-10 11:26:13.000000000 +0100 @@ -37,6 +37,7 @@ #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_tcpdebug.h" +#include "opt_vimage.h" #include #include @@ -49,9 +50,12 @@ #include #include #include +#include +#include #include +#include #include #include #include @@ -86,37 +90,45 @@ extern struct mbuf *m_copypack(); #endif -int path_mtu_discovery = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW, - &path_mtu_discovery, 1, "Enable Path MTU Discovery"); +#ifndef VIMAGE +int path_mtu_discovery; +int ss_fltsz; +int ss_fltsz_local; +int tcp_do_newreno; +int tcp_do_tso; +int tcp_do_autosndbuf; +int tcp_autosndbuf_inc; +int tcp_autosndbuf_max; +#endif -int ss_fltsz = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowstart_flightsize, CTLFLAG_RW, - &ss_fltsz, 1, "Slow start flight size"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, path_mtu_discovery, + CTLFLAG_RW, path_mtu_discovery, 1, "Enable Path MTU Discovery"); -int ss_fltsz_local = 4; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, local_slowstart_flightsize, CTLFLAG_RW, - &ss_fltsz_local, 1, "Slow start flight size for local networks"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, + slowstart_flightsize, CTLFLAG_RW, + ss_fltsz, 1, "Slow start flight size"); -int 
tcp_do_newreno = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW, - &tcp_do_newreno, 0, "Enable NewReno Algorithms"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, + local_slowstart_flightsize, CTLFLAG_RW, + ss_fltsz_local, 1, "Slow start flight size for local networks"); -int tcp_do_tso = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW, - &tcp_do_tso, 0, "Enable TCP Segmentation Offload"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW, + tcp_do_newreno, 0, "Enable NewReno Algorithms"); -int tcp_do_autosndbuf = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_RW, - &tcp_do_autosndbuf, 0, "Enable automatic send buffer sizing"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW, + tcp_do_tso, 0, "Enable TCP Segmentation Offload"); -int tcp_autosndbuf_inc = 8*1024; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_RW, - &tcp_autosndbuf_inc, 0, "Incrementor step size of automatic send buffer"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, sendbuf_auto, + CTLFLAG_RW, + tcp_do_autosndbuf, 0, "Enable automatic send buffer sizing"); -int tcp_autosndbuf_max = 256*1024; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_RW, - &tcp_autosndbuf_max, 0, "Max size of automatic send buffer"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, sendbuf_inc, + CTLFLAG_RW, tcp_autosndbuf_inc, 0, + "Incrementor step size of automatic send buffer"); + +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, sendbuf_max, + CTLFLAG_RW, tcp_autosndbuf_max, 0, + "Max size of automatic send buffer"); /* @@ -125,6 +137,7 @@ int tcp_output(struct tcpcb *tp) { + INIT_VNET_INET(tp->t_inpcb->inp_vnet); struct socket *so = tp->t_inpcb->inp_socket; long len, recwin, sendwin; int off, flags, error; @@ -170,15 +183,15 @@ * Set the slow-start flight size depending on whether * this is a local network or not. 
*/ - int ss = ss_fltsz; + int ss = V_ss_fltsz; #ifdef INET6 if (isipv6) { if (in6_localaddr(&tp->t_inpcb->in6p_faddr)) - ss = ss_fltsz_local; + ss = V_ss_fltsz_local; } else #endif /* INET6 */ if (in_localaddr(tp->t_inpcb->inp_faddr)) - ss = ss_fltsz_local; + ss = V_ss_fltsz_local; tp->snd_cwnd = tp->t_maxseg * ss; } tp->t_flags &= ~TF_LASTIDLE; @@ -252,8 +265,8 @@ if (len > 0) { sack_rxmit = 1; sendalot = 1; - tcpstat.tcps_sack_rexmits++; - tcpstat.tcps_sack_rexmit_bytes += + V_tcpstat.tcps_sack_rexmits++; + V_tcpstat.tcps_sack_rexmit_bytes += min(len, tp->t_maxseg); } } @@ -428,14 +441,14 @@ * with congestion window. Requires another timer. Has to * wait for upcoming tcp timer rewrite. */ - if (tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) { + if (V_tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) { if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat && so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / 8 * 7) && - so->so_snd.sb_cc < tcp_autosndbuf_max && + so->so_snd.sb_cc < V_tcp_autosndbuf_max && sendwin >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) { if (!sbreserve_locked(&so->so_snd, - min(so->so_snd.sb_hiwat + tcp_autosndbuf_inc, - tcp_autosndbuf_max), so, curthread)) + min(so->so_snd.sb_hiwat + V_tcp_autosndbuf_inc, + V_tcp_autosndbuf_max), so, curthread)) so->so_snd.sb_flags &= ~SB_AUTOSIZE; } } @@ -464,7 +477,7 @@ ipsec_optlen = ipsec_hdrsiz_tcp(tp); #endif if (len > tp->t_maxseg) { - if ((tp->t_flags & TF_TSO) && tcp_do_tso && + if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && ((tp->t_flags & TF_SIGNATURE) == 0) && tp->rcv_numsacks == 0 && sack_rxmit == 0 && tp->t_inpcb->inp_options == NULL && @@ -754,13 +767,13 @@ u_int moff; if ((tp->t_flags & TF_FORCEDATA) && len == 1) - tcpstat.tcps_sndprobe++; + V_tcpstat.tcps_sndprobe++; else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { - tcpstat.tcps_sndrexmitpack++; - tcpstat.tcps_sndrexmitbyte += len; + V_tcpstat.tcps_sndrexmitpack++; + V_tcpstat.tcps_sndrexmitbyte += len; } else { - 
tcpstat.tcps_sndpack++; - tcpstat.tcps_sndbyte += len; + V_tcpstat.tcps_sndpack++; + V_tcpstat.tcps_sndbyte += len; } #ifdef notyet if ((m = m_copypack(so->so_snd.sb_mb, off, @@ -827,13 +840,13 @@ } else { SOCKBUF_UNLOCK(&so->so_snd); if (tp->t_flags & TF_ACKNOW) - tcpstat.tcps_sndacks++; + V_tcpstat.tcps_sndacks++; else if (flags & (TH_SYN|TH_FIN|TH_RST)) - tcpstat.tcps_sndctrl++; + V_tcpstat.tcps_sndctrl++; else if (SEQ_GT(tp->snd_up, tp->snd_una)) - tcpstat.tcps_sndurg++; + V_tcpstat.tcps_sndurg++; else - tcpstat.tcps_sndwinup++; + V_tcpstat.tcps_sndwinup++; MGETHDR(m, M_DONTWAIT, MT_DATA); if (m == NULL) { @@ -1031,7 +1044,7 @@ if (tp->t_rtttime == 0) { tp->t_rtttime = ticks; tp->t_rtseq = startseq; - tcpstat.tcps_segstimed++; + V_tcpstat.tcps_segstimed++; } } @@ -1129,7 +1142,7 @@ * Section 2. However the tcp hostcache migitates the problem * so it affects only the first tcp connection with a host. */ - if (path_mtu_discovery) + if (V_path_mtu_discovery) ip->ip_off |= IP_DF; error = ip_output(m, tp->t_inpcb->inp_options, NULL, @@ -1208,7 +1221,7 @@ return (error); } } - tcpstat.tcps_sndtotal++; + V_tcpstat.tcps_sndtotal++; /* * Data sent (as far as we can tell). 
@@ -1275,6 +1288,7 @@ int tcp_addoptions(struct tcpopt *to, u_char *optp) { + INIT_VNET_INET(curvnet); u_int mask, optlen = 0; for (mask = 1; mask < TOF_MAXOPT; mask <<= 1) { @@ -1372,7 +1386,7 @@ optlen += TCPOLEN_SACK; sack++; } - tcpstat.tcps_sack_send_blocks++; + V_tcpstat.tcps_sack_send_blocks++; break; } default: --- /u/marko/p4/head/src/sys/netinet/tcp_reass.c 2007-10-16 13:53:39.000000000 +0200 +++ src/sys/netinet/tcp_reass.c 2007-10-22 18:06:43.000000000 +0200 @@ -35,6 +35,7 @@ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_tcpdebug.h" +#include "opt_vimage.h" #include #include @@ -45,12 +46,14 @@ #include #include #include +#include #include #include #include +#include #include #include #include @@ -76,33 +79,37 @@ SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0, "TCP Segment Reassembly Queue"); +#ifndef VIMAGE static int tcp_reass_maxseg = 0; -SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxsegments, CTLFLAG_RDTUN, - &tcp_reass_maxseg, 0, +int tcp_reass_qsize = 0; +static int tcp_reass_maxqlen = 48; +static int tcp_reass_overflows = 0; +#endif + +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_reass, OID_AUTO, maxsegments, + CTLFLAG_RDTUN, tcp_reass_maxseg, 0, "Global maximum number of TCP Segments in Reassembly Queue"); -int tcp_reass_qsize = 0; -SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, cursegments, CTLFLAG_RD, - &tcp_reass_qsize, 0, +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_reass, OID_AUTO, cursegments, + CTLFLAG_RD, tcp_reass_qsize, 0, "Global number of TCP Segments currently in Reassembly Queue"); -static int tcp_reass_maxqlen = 48; -SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxqlen, CTLFLAG_RW, - &tcp_reass_maxqlen, 0, +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_reass, OID_AUTO, maxqlen, + CTLFLAG_RW, tcp_reass_maxqlen, 0, "Maximum number of TCP Segments per individual Reassembly Queue"); -static int tcp_reass_overflows = 0; -SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows, CTLFLAG_RD, - &tcp_reass_overflows, 0, 
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_reass, OID_AUTO, overflows, + CTLFLAG_RD, tcp_reass_overflows, 0, "Global number of TCP Segment Reassembly Queue Overflows"); /* Initialize TCP reassembly queue */ static void tcp_reass_zone_change(void *tag) { + INIT_VNET_INET(curvnet); - tcp_reass_maxseg = nmbclusters / 16; - uma_zone_set_max(tcp_reass_zone, tcp_reass_maxseg); + V_tcp_reass_maxseg = nmbclusters / 16; + uma_zone_set_max(tcp_reass_zone, V_tcp_reass_maxseg); } uma_zone_t tcp_reass_zone; @@ -110,13 +117,14 @@ void tcp_reass_init(void) { + INIT_VNET_INET(curvnet); - tcp_reass_maxseg = nmbclusters / 16; + V_tcp_reass_maxseg = nmbclusters / 16; TUNABLE_INT_FETCH("net.inet.tcp.reass.maxsegments", - &tcp_reass_maxseg); + &V_tcp_reass_maxseg); tcp_reass_zone = uma_zcreate("tcpreass", sizeof (struct tseg_qent), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); - uma_zone_set_max(tcp_reass_zone, tcp_reass_maxseg); + uma_zone_set_max(tcp_reass_zone, V_tcp_reass_maxseg); EVENTHANDLER_REGISTER(nmbclusters_change, tcp_reass_zone_change, NULL, EVENTHANDLER_PRI_ANY); } @@ -124,6 +132,7 @@ int tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m) { + INIT_VNET_INET(curvnet); struct tseg_qent *q; struct tseg_qent *p = NULL; struct tseg_qent *nq; @@ -154,10 +163,10 @@ * process the missing segment. 
*/ if (th->th_seq != tp->rcv_nxt && - (tcp_reass_qsize + 1 >= tcp_reass_maxseg || - tp->t_segqlen >= tcp_reass_maxqlen)) { - tcp_reass_overflows++; - tcpstat.tcps_rcvmemdrop++; + (V_tcp_reass_qsize + 1 >= V_tcp_reass_maxseg || + tp->t_segqlen >= V_tcp_reass_maxqlen)) { + V_tcp_reass_overflows++; + V_tcpstat.tcps_rcvmemdrop++; m_freem(m); *tlenp = 0; return (0); @@ -169,13 +178,13 @@ */ te = uma_zalloc(tcp_reass_zone, M_NOWAIT); if (te == NULL) { - tcpstat.tcps_rcvmemdrop++; + V_tcpstat.tcps_rcvmemdrop++; m_freem(m); *tlenp = 0; return (0); } tp->t_segqlen++; - tcp_reass_qsize++; + V_tcp_reass_qsize++; /* * Find a segment which begins after this one does. @@ -197,12 +206,12 @@ i = p->tqe_th->th_seq + p->tqe_len - th->th_seq; if (i > 0) { if (i >= *tlenp) { - tcpstat.tcps_rcvduppack++; - tcpstat.tcps_rcvdupbyte += *tlenp; + V_tcpstat.tcps_rcvduppack++; + V_tcpstat.tcps_rcvdupbyte += *tlenp; m_freem(m); uma_zfree(tcp_reass_zone, te); tp->t_segqlen--; - tcp_reass_qsize--; + V_tcp_reass_qsize--; /* * Try to present any queued data * at the left window edge to the user. 
@@ -216,8 +225,8 @@ th->th_seq += i; } } - tcpstat.tcps_rcvoopack++; - tcpstat.tcps_rcvoobyte += *tlenp; + V_tcpstat.tcps_rcvoopack++; + V_tcpstat.tcps_rcvoobyte += *tlenp; /* * While we overlap succeeding segments trim them or, @@ -239,7 +248,7 @@ m_freem(q->tqe_m); uma_zfree(tcp_reass_zone, q); tp->t_segqlen--; - tcp_reass_qsize--; + V_tcp_reass_qsize--; q = nq; } @@ -276,7 +285,7 @@ sbappendstream_locked(&so->so_rcv, q->tqe_m); uma_zfree(tcp_reass_zone, q); tp->t_segqlen--; - tcp_reass_qsize--; + V_tcp_reass_qsize--; q = nq; } while (q && q->tqe_th->th_seq == tp->rcv_nxt); ND6_HINT(tp); --- /u/marko/p4/head/src/sys/netinet/tcp_sack.c 2007-08-31 03:48:07.000000000 +0200 +++ src/sys/netinet/tcp_sack.c 2007-10-22 18:06:43.000000000 +0200 @@ -76,6 +76,7 @@ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_tcpdebug.h" +#include "opt_vimage.h" #include #include @@ -89,14 +90,17 @@ #include #include #include +#include #include /* before tcp_seq.h, for tcp_random18() */ #include +#include #include #include +#include #include #include #include @@ -123,25 +127,27 @@ extern struct uma_zone *sack_hole_zone; +#ifndef VIMAGE +int tcp_do_sack; +int tcp_sack_maxholes; +int tcp_sack_globalmaxholes; +int tcp_sack_globalholes; +#endif + SYSCTL_NODE(_net_inet_tcp, OID_AUTO, sack, CTLFLAG_RW, 0, "TCP SACK"); -int tcp_do_sack = 1; -SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, enable, CTLFLAG_RW, - &tcp_do_sack, 0, "Enable/Disable TCP SACK support"); -TUNABLE_INT("net.inet.tcp.sack.enable", &tcp_do_sack); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_sack, OID_AUTO, enable, + CTLFLAG_RW, tcp_do_sack, 0, "Enable/Disable TCP SACK support"); -static int tcp_sack_maxholes = 128; -SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, maxholes, CTLFLAG_RW, - &tcp_sack_maxholes, 0, +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_sack, OID_AUTO, maxholes, + CTLFLAG_RW, tcp_sack_maxholes, 0, "Maximum number of TCP SACK holes allowed per connection"); -static int tcp_sack_globalmaxholes = 65536; 
-SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, globalmaxholes, CTLFLAG_RW, - &tcp_sack_globalmaxholes, 0, +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_sack, OID_AUTO, globalmaxholes, + CTLFLAG_RW, tcp_sack_globalmaxholes, 0, "Global maximum number of TCP SACK holes"); -static int tcp_sack_globalholes = 0; -SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, globalholes, CTLFLAG_RD, - &tcp_sack_globalholes, 0, +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_sack, OID_AUTO, globalholes, + CTLFLAG_RD, tcp_sack_globalholes, 0, "Global number of TCP SACK holes currently allocated"); /* @@ -252,11 +258,12 @@ static struct sackhole * tcp_sackhole_alloc(struct tcpcb *tp, tcp_seq start, tcp_seq end) { + INIT_VNET_INET(tp->t_inpcb->inp_vnet); struct sackhole *hole; - if (tp->snd_numholes >= tcp_sack_maxholes || - tcp_sack_globalholes >= tcp_sack_globalmaxholes) { - tcpstat.tcps_sack_sboverflow++; + if (tp->snd_numholes >= V_tcp_sack_maxholes || + V_tcp_sack_globalholes >= V_tcp_sack_globalmaxholes) { + V_tcpstat.tcps_sack_sboverflow++; return NULL; } @@ -269,7 +276,7 @@ hole->rxmit = start; tp->snd_numholes++; - tcp_sack_globalholes++; + V_tcp_sack_globalholes++; return hole; } @@ -280,14 +287,15 @@ static void tcp_sackhole_free(struct tcpcb *tp, struct sackhole *hole) { + INIT_VNET_INET(tp->t_vnet); uma_zfree(sack_hole_zone, hole); tp->snd_numholes--; - tcp_sack_globalholes--; + V_tcp_sack_globalholes--; KASSERT(tp->snd_numholes >= 0, ("tp->snd_numholes >= 0")); - KASSERT(tcp_sack_globalholes >= 0, ("tcp_sack_globalholes >= 0")); + KASSERT(V_tcp_sack_globalholes >= 0, ("tcp_sack_globalholes >= 0")); } /* --- /u/marko/p4/head/src/sys/netinet/tcp_subr.c 2007-12-27 19:32:56.000000000 +0100 +++ src/sys/netinet/tcp_subr.c 2008-01-14 19:23:54.000000000 +0100 @@ -38,6 +38,7 @@ #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_tcpdebug.h" +#include "opt_vimage.h" #include #include @@ -55,12 +56,15 @@ #include #include #include +#include #include +#include #include #include +#include 
#include #include #include @@ -94,6 +98,7 @@ #include #endif #include +#include #ifdef IPSEC #include @@ -109,14 +114,33 @@ #include -int tcp_mssdflt = TCP_MSS; -SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW, - &tcp_mssdflt, 0, "Default TCP Maximum Segment Size"); +static int tcp_tcbhashsize = 0; +static int do_tcpdrain = 1; +static int tcp_inflight_debug = 0; + +#ifndef VIMAGE +int tcp_mssdflt; +int tcp_minmss; +int tcp_do_rfc1323; +static int icmp_may_rst; +static int tcp_isn_reseed_interval; +static int tcp_inflight_enable; +static int tcp_inflight_rttthresh; +static int tcp_inflight_min; +static int tcp_inflight_max; +static int tcp_inflight_stab; +static int nolocaltimewait; +#endif + +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, + CTLFLAG_RW, tcp_mssdflt, 0, "Default TCP Maximum Segment Size"); #ifdef INET6 -int tcp_v6mssdflt = TCP6_MSS; -SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, - CTLFLAG_RW, &tcp_v6mssdflt , 0, +#ifndef VIMAGE +int tcp_v6mssdflt; +#endif +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, + CTLFLAG_RW, tcp_v6mssdflt, 0, "Default TCP Maximum Segment Size for IPv6"); #endif @@ -128,38 +152,33 @@ * with packet generation and sending. Set to zero to disable MINMSS * checking. This setting prevents us from sending too small packets. 
*/ -int tcp_minmss = TCP_MINMSS; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_RW, - &tcp_minmss , 0, "Minmum TCP Maximum Segment Size"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, minmss, + CTLFLAG_RW, tcp_minmss , 0, "Minmum TCP Maximum Segment Size"); -int tcp_do_rfc1323 = 1; -SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW, - &tcp_do_rfc1323, 0, "Enable rfc1323 (high performance TCP) extensions"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, + CTLFLAG_RW, tcp_do_rfc1323, 0, + "Enable rfc1323 (high performance TCP) extensions"); static int tcp_log_debug = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW, &tcp_log_debug, 0, "Log errors caused by incoming TCP segments"); -static int tcp_tcbhashsize = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN, &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); -static int do_tcpdrain = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, - &do_tcpdrain, 0, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0, "Enable tcp_drain routine for extra help when low on mbufs"); -SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD, - &tcbinfo.ipi_count, 0, "Number of active PCBs"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, pcbcount, + CTLFLAG_RD, tcbinfo.ipi_count, 0, "Number of active PCBs"); -static int icmp_may_rst = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW, - &icmp_may_rst, 0, +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, icmp_may_rst, + CTLFLAG_RW, icmp_may_rst, 0, "Certain ICMP unreachable messages may abort connections in SYN_SENT"); -static int tcp_isn_reseed_interval = 0; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW, - &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, isn_reseed_interval, + CTLFLAG_RW, tcp_isn_reseed_interval, 
0, + "Seconds between reseeding of ISN secret"); /* * TCP bandwidth limiting sysctls. Note that the default lower bound of @@ -169,30 +188,31 @@ SYSCTL_NODE(_net_inet_tcp, OID_AUTO, inflight, CTLFLAG_RW, 0, "TCP inflight data limiting"); -static int tcp_inflight_enable = 1; -SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, enable, CTLFLAG_RW, - &tcp_inflight_enable, 0, "Enable automatic TCP inflight data limiting"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_inflight, OID_AUTO, enable, + CTLFLAG_RW, tcp_inflight_enable, 0, + "Enable automatic TCP inflight data limiting"); -static int tcp_inflight_debug = 0; SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, debug, CTLFLAG_RW, &tcp_inflight_debug, 0, "Debug TCP inflight calculations"); -static int tcp_inflight_rttthresh; -SYSCTL_PROC(_net_inet_tcp_inflight, OID_AUTO, rttthresh, CTLTYPE_INT|CTLFLAG_RW, - &tcp_inflight_rttthresh, 0, sysctl_msec_to_ticks, "I", +static int sysctl_tcp_inflight_rttthresh(SYSCTL_HANDLER_ARGS); +int sysctl_tcp_inflight_rttthresh(SYSCTL_HANDLER_ARGS) +{ + return (0); /* XXX MARKO REVISIT */ +} +SYSCTL_PROC(_net_inet_tcp_inflight, OID_AUTO, rttthresh, + CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_tcp_inflight_rttthresh, "I", "RTT threshold below which inflight will deactivate itself"); -static int tcp_inflight_min = 6144; -SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, min, CTLFLAG_RW, - &tcp_inflight_min, 0, "Lower-bound for TCP inflight window"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_inflight, OID_AUTO, min, + CTLFLAG_RW, tcp_inflight_min, 0, "Lower-bound for TCP inflight window"); -static int tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT; -SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, max, CTLFLAG_RW, - &tcp_inflight_max, 0, "Upper-bound for TCP inflight window"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_inflight, OID_AUTO, max, + CTLFLAG_RW, tcp_inflight_max, 0, "Upper-bound for TCP inflight window"); -static int tcp_inflight_stab = 20; -SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, stab, 
CTLFLAG_RW, - &tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_inflight, OID_AUTO, stab, + CTLFLAG_RW, tcp_inflight_stab, 0, + "Inflight Algorithm Stabilization 20 = 2 packets"); uma_zone_t sack_hole_zone; @@ -229,14 +249,15 @@ #define ISN_LOCK() mtx_lock(&isn_mtx) #define ISN_UNLOCK() mtx_unlock(&isn_mtx) +static struct uma_zone *tcp_ipi_zone; + /* * TCP initialization. */ static void tcp_zone_change(void *tag) { - - uma_zone_set_max(tcbinfo.ipi_zone, maxsockets); + uma_zone_set_max(tcp_ipi_zone, maxsockets); uma_zone_set_max(tcpcb_zone, maxsockets); tcp_tw_zone_change(); } @@ -253,6 +274,27 @@ void tcp_init(void) { + INIT_VNET_INET(curvnet); + +#ifdef VIMAGE + if (IS_DEFAULT_VNET(curvnet)) { +#endif + tcp_ipi_zone = uma_zcreate("inpcb", sizeof(struct inpcb), + NULL, NULL, tcp_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + uma_zone_set_max(tcp_ipi_zone, maxsockets); + /* + * These have to be type stable for the benefit of the timers. 
+ */ + tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + uma_zone_set_max(tcpcb_zone, maxsockets); + sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); +#ifdef VIMAGE + } +#endif + + tcp_tw_init(); int hashsize = TCBHASHSIZE; tcp_delacktime = TCPTV_DELACK; @@ -265,25 +307,59 @@ if (tcp_rexmit_min < 1) tcp_rexmit_min = 1; tcp_rexmit_slop = TCPTV_CPU_VAR; - tcp_inflight_rttthresh = TCPTV_INFLIGHT_RTTTHRESH; tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT; + V_path_mtu_discovery = 1; + V_ss_fltsz = 1; + V_ss_fltsz_local = 4; + V_tcp_do_newreno = 1; + V_tcp_do_tso = 1; + V_tcp_do_autosndbuf = 1; + V_tcp_autosndbuf_inc = 8*1024; + V_tcp_autosndbuf_max = 256*1024; + V_blackhole = 0; + V_tcp_delack_enabled = 1; + V_drop_synfin = 0; + V_tcp_do_rfc3042 = 1; + V_tcp_do_rfc3390 = 1; + V_tcp_insecure_rst = 0; + V_tcp_do_autorcvbuf = 1; + V_tcp_autorcvbuf_inc = 16*1024; + V_tcp_autorcvbuf_max = 256*1024; + V_tcp_inflight_rttthresh = TCPTV_INFLIGHT_RTTTHRESH; + V_tcp_mssdflt = TCP_MSS; +#ifdef INET6 + V_tcp_v6mssdflt = TCP6_MSS; +#endif + V_tcp_minmss = TCP_MINMSS; + V_tcp_do_rfc1323 = 1; + V_icmp_may_rst = 1; + V_tcp_isn_reseed_interval = 0; + V_tcp_inflight_enable = 1; + V_tcp_inflight_min = 6144; + V_tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT; + V_tcp_inflight_stab = 20; + V_nolocaltimewait = 0; + V_tcp_do_sack = 1; + V_tcp_sack_maxholes = 128; + V_tcp_sack_globalmaxholes = 65536; + V_tcp_sack_globalholes = 0; + TUNABLE_INT_FETCH("net.inet.tcp.sack.enable", &V_tcp_do_sack); - INP_INFO_LOCK_INIT(&tcbinfo, "tcp"); - LIST_INIT(&tcb); - tcbinfo.ipi_listhead = &tcb; + INP_INFO_LOCK_INIT(&V_tcbinfo, "tcp"); + LIST_INIT(&V_tcb); + V_tcbinfo.ipi_listhead = &V_tcb; TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize); if (!powerof2(hashsize)) { printf("WARNING: TCB hash size not a power of 2\n"); hashsize = 512; /* safe default */ } 
tcp_tcbhashsize = hashsize; - tcbinfo.ipi_hashbase = hashinit(hashsize, M_PCB, - &tcbinfo.ipi_hashmask); - tcbinfo.ipi_porthashbase = hashinit(hashsize, M_PCB, - &tcbinfo.ipi_porthashmask); - tcbinfo.ipi_zone = uma_zcreate("inpcb", sizeof(struct inpcb), - NULL, NULL, tcp_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); - uma_zone_set_max(tcbinfo.ipi_zone, maxsockets); + V_tcbinfo.ipi_hashbase = hashinit(hashsize, M_PCB, + &V_tcbinfo.ipi_hashmask); + V_tcbinfo.ipi_porthashbase = hashinit(hashsize, M_PCB, + &V_tcbinfo.ipi_porthashmask); + V_tcbinfo.ipi_zone = tcp_ipi_zone; + V_tcbinfo.ipi_vnet = curvnet; #ifdef INET6 #define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)) #else /* INET6 */ @@ -294,27 +370,44 @@ if (max_linkhdr + TCP_MINPROTOHDR > MHLEN) panic("tcp_init"); #undef TCP_MINPROTOHDR - /* - * These have to be type stable for the benefit of the timers. - */ - tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem), - NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); - uma_zone_set_max(tcpcb_zone, maxsockets); - tcp_tw_init(); + syncache_init(); tcp_hc_init(); + +#ifdef VIMAGE + if (!IS_DEFAULT_VNET(curvnet)) + return; +#endif + tcp_reass_init(); ISN_LOCK_INIT(); callout_init(&isn_callout, CALLOUT_MPSAFE); - tcp_isn_tick(NULL); + callout_reset(&isn_callout, 1, tcp_isn_tick, NULL); EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL, SHUTDOWN_PRI_DEFAULT); - sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole), - NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); EVENTHANDLER_REGISTER(maxsockets_change, tcp_zone_change, NULL, EVENTHANDLER_PRI_ANY); } +#ifdef VIMAGE +void +tcp_destroy(void) +{ + INIT_VNET_INET(curvnet); + + tcp_tw_destroy(); + tcp_hc_destroy(); + syncache_destroy(); + + /* XXX check that hashes are empty! 
*/ + hashdestroy(V_tcbinfo.ipi_hashbase, M_PCB, + V_tcbinfo.ipi_hashmask); + hashdestroy(V_tcbinfo.ipi_porthashbase, M_PCB, + V_tcbinfo.ipi_porthashmask); + INP_INFO_LOCK_DESTROY(&V_tcbinfo); +} +#endif + void tcp_fini(void *xtp) { @@ -416,6 +509,7 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, tcp_seq ack, tcp_seq seq, int flags) { + INIT_VNET_INET(curvnet); int tlen; int win = 0; struct ip *ip; @@ -512,8 +606,8 @@ { tlen += sizeof (struct tcpiphdr); ip->ip_len = tlen; - ip->ip_ttl = ip_defttl; - if (path_mtu_discovery) + ip->ip_ttl = V_ip_defttl; + if (V_path_mtu_discovery) ip->ip_off |= IP_DF; } m->m_len = tlen; @@ -582,6 +676,8 @@ struct tcpcb * tcp_newtcpcb(struct inpcb *inp) { + INIT_VNET_INET(inp->inp_vnet); + struct tcpcb_mem *tm; struct tcpcb *tp; #ifdef INET6 @@ -593,12 +689,15 @@ return (NULL); tp = &tm->tcb; tp->t_timers = &tm->tt; +#ifdef VIMAGE + tp->t_vnet = inp->inp_vnet; +#endif /* LIST_INIT(&tp->t_segq); */ /* XXX covered by M_ZERO */ tp->t_maxseg = tp->t_maxopd = #ifdef INET6 - isipv6 ? tcp_v6mssdflt : + isipv6 ? V_tcp_v6mssdflt : #endif /* INET6 */ - tcp_mssdflt; + V_tcp_mssdflt; /* Set up our timeouts. */ callout_init(&tp->t_timers->tt_rexmt, CALLOUT_MPSAFE); @@ -607,9 +706,9 @@ callout_init(&tp->t_timers->tt_2msl, CALLOUT_MPSAFE); callout_init(&tp->t_timers->tt_delack, CALLOUT_MPSAFE); - if (tcp_do_rfc1323) + if (V_tcp_do_rfc1323) tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); - if (tcp_do_sack) + if (V_tcp_do_sack) tp->t_flags |= TF_SACK_PERMIT; TAILQ_INIT(&tp->snd_holes); tp->t_inpcb = inp; /* XXX */ @@ -632,7 +731,7 @@ * because the socket may be bound to an IPv6 wildcard address, * which may match an IPv4-mapped IPv6 address. 
*/ - inp->inp_ip_ttl = ip_defttl; + inp->inp_ip_ttl = V_ip_defttl; inp->inp_ppcb = tp; return (tp); /* XXX */ } @@ -645,17 +744,18 @@ struct tcpcb * tcp_drop(struct tcpcb *tp, int errno) { + INIT_VNET_INET(tp->t_inpcb->inp_vnet); struct socket *so = tp->t_inpcb->inp_socket; - INP_INFO_WLOCK_ASSERT(&tcbinfo); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_LOCK_ASSERT(tp->t_inpcb); if (TCPS_HAVERCVDSYN(tp->t_state)) { tp->t_state = TCPS_CLOSED; (void) tcp_output_reset(tp); - tcpstat.tcps_drops++; + V_tcpstat.tcps_drops++; } else - tcpstat.tcps_conndrops++; + V_tcpstat.tcps_conndrops++; if (errno == ETIMEDOUT && tp->t_softerror) errno = tp->t_softerror; so->so_error = errno; @@ -665,6 +765,7 @@ void tcp_discardcb(struct tcpcb *tp) { + INIT_VNET_INET(tp->t_vnet); struct tseg_qent *q; struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; @@ -748,7 +849,7 @@ m_freem(q->tqe_m); uma_zfree(tcp_reass_zone, q); tp->t_segqlen--; - tcp_reass_qsize--; + V_tcp_reass_qsize--; } /* Disconnect offload device, if any. */ tcp_offload_detach(tp); @@ -766,17 +867,18 @@ struct tcpcb * tcp_close(struct tcpcb *tp) { + INIT_VNET_INET(tp->t_inpcb->inp_vnet); struct inpcb *inp = tp->t_inpcb; struct socket *so; - INP_INFO_WLOCK_ASSERT(&tcbinfo); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_LOCK_ASSERT(inp); /* Notify any offload devices of listener close */ if (tp->t_state == TCPS_LISTEN) tcp_offload_listen_close(tp); in_pcbdrop(inp); - tcpstat.tcps_closed++; + V_tcpstat.tcps_closed++; KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL")); so = inp->inp_socket; soisdisconnected(so); @@ -797,8 +899,9 @@ void tcp_drain(void) { - if (do_tcpdrain) { + VNET_ITERLOOP_BEGIN(); + INIT_VNET_INET(vnet_iter); struct inpcb *inpb; struct tcpcb *tcpb; struct tseg_qent *te; @@ -811,8 +914,8 @@ * where we're really low on mbufs, this is potentially * usefull. 
*/ - INP_INFO_RLOCK(&tcbinfo); - LIST_FOREACH(inpb, tcbinfo.ipi_listhead, inp_list) { + INP_INFO_RLOCK(&V_tcbinfo); + LIST_FOREACH(inpb, V_tcbinfo.ipi_listhead, inp_list) { if (inpb->inp_vflag & INP_TIMEWAIT) continue; INP_LOCK(inpb); @@ -823,13 +926,14 @@ m_freem(te->tqe_m); uma_zfree(tcp_reass_zone, te); tcpb->t_segqlen--; - tcp_reass_qsize--; + V_tcp_reass_qsize--; } tcp_clean_sackreport(tcpb); } INP_UNLOCK(inpb); } - INP_INFO_RUNLOCK(&tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); + VNET_ITERLOOP_END(); } } @@ -845,8 +949,11 @@ tcp_notify(struct inpcb *inp, int error) { struct tcpcb *tp; +#ifdef INVARIANTS + INIT_VNET_INET(inp->inp_vnet); +#endif - INP_INFO_WLOCK_ASSERT(&tcbinfo); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_LOCK_ASSERT(inp); if ((inp->inp_vflag & INP_TIMEWAIT) || @@ -888,6 +995,7 @@ static int tcp_pcblist(SYSCTL_HANDLER_ARGS) { + INIT_VNET_INET(curvnet); int error, i, m, n, pcb_count; struct inpcb *inp, **inp_list; inp_gen_t gencnt; @@ -899,7 +1007,7 @@ */ if (req->oldptr == NULL) { m = syncache_pcbcount(); - n = tcbinfo.ipi_count; + n = V_tcbinfo.ipi_count; req->oldidx = 2 * (sizeof xig) + ((m + n) + n/8) * sizeof(struct xtcpcb); return (0); @@ -911,10 +1019,10 @@ /* * OK, now we're committed to doing something. 
*/ - INP_INFO_RLOCK(&tcbinfo); - gencnt = tcbinfo.ipi_gencnt; - n = tcbinfo.ipi_count; - INP_INFO_RUNLOCK(&tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); + gencnt = V_tcbinfo.ipi_gencnt; + n = V_tcbinfo.ipi_count; + INP_INFO_RUNLOCK(&V_tcbinfo); m = syncache_pcbcount(); @@ -939,9 +1047,9 @@ if (inp_list == NULL) return (ENOMEM); - INP_INFO_RLOCK(&tcbinfo); - for (inp = LIST_FIRST(tcbinfo.ipi_listhead), i = 0; inp != NULL && i - < n; inp = LIST_NEXT(inp, inp_list)) { + INP_INFO_RLOCK(&V_tcbinfo); + for (inp = LIST_FIRST(V_tcbinfo.ipi_listhead), i = 0; + inp != NULL && i < n; inp = LIST_NEXT(inp, inp_list)) { INP_LOCK(inp); if (inp->inp_gencnt <= gencnt) { /* @@ -963,7 +1071,7 @@ } INP_UNLOCK(inp); } - INP_INFO_RUNLOCK(&tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); n = i; error = 0; @@ -1007,11 +1115,11 @@ * while we were processing this request, and it * might be necessary to retry. */ - INP_INFO_RLOCK(&tcbinfo); - xig.xig_gen = tcbinfo.ipi_gencnt; + INP_INFO_RLOCK(&V_tcbinfo); + xig.xig_gen = V_tcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; - xig.xig_count = tcbinfo.ipi_count + pcb_count; - INP_INFO_RUNLOCK(&tcbinfo); + xig.xig_count = V_tcbinfo.ipi_count + pcb_count; + INP_INFO_RUNLOCK(&V_tcbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); @@ -1024,6 +1132,7 @@ static int tcp_getcred(SYSCTL_HANDLER_ARGS) { + INIT_VNET_INET(curvnet); struct xucred xuc; struct sockaddr_in addrs[2]; struct inpcb *inp; @@ -1035,9 +1144,9 @@ error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); - INP_INFO_RLOCK(&tcbinfo); - inp = in_pcblookup_hash(&tcbinfo, addrs[1].sin_addr, addrs[1].sin_port, - addrs[0].sin_addr, addrs[0].sin_port, 0, NULL); + INP_INFO_RLOCK(&V_tcbinfo); + inp = in_pcblookup_hash(&V_tcbinfo, addrs[1].sin_addr, + addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, 0, NULL); if (inp == NULL) { error = ENOENT; goto outunlocked; @@ -1054,7 +1163,7 @@ out: INP_UNLOCK(inp); outunlocked: - INP_INFO_RUNLOCK(&tcbinfo); + 
INP_INFO_RUNLOCK(&V_tcbinfo); if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); @@ -1068,6 +1177,8 @@ static int tcp6_getcred(SYSCTL_HANDLER_ARGS) { + INIT_VNET_INET(curvnet); + INIT_VNET_INET6(curvnet); struct xucred xuc; struct sockaddr_in6 addrs[2]; struct inpcb *inp; @@ -1079,8 +1190,8 @@ error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); - if ((error = sa6_embedscope(&addrs[0], ip6_use_defzone)) != 0 || - (error = sa6_embedscope(&addrs[1], ip6_use_defzone)) != 0) { + if ((error = sa6_embedscope(&addrs[0], V_ip6_use_defzone)) != 0 || + (error = sa6_embedscope(&addrs[1], V_ip6_use_defzone)) != 0) { return (error); } if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) { @@ -1090,16 +1201,16 @@ return (EINVAL); } - INP_INFO_RLOCK(&tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); if (mapped == 1) - inp = in_pcblookup_hash(&tcbinfo, + inp = in_pcblookup_hash(&V_tcbinfo, *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12], addrs[1].sin6_port, *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12], addrs[0].sin6_port, 0, NULL); else - inp = in6_pcblookup_hash(&tcbinfo, + inp = in6_pcblookup_hash(&V_tcbinfo, &addrs[1].sin6_addr, addrs[1].sin6_port, &addrs[0].sin6_addr, addrs[0].sin6_port, 0, NULL); if (inp == NULL) { @@ -1118,7 +1229,7 @@ out: INP_UNLOCK(inp); outunlocked: - INP_INFO_RUNLOCK(&tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); @@ -1133,6 +1244,7 @@ void tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) { + INIT_VNET_INET(curvnet); struct ip *ip = vip; struct tcphdr *th; struct in_addr faddr; @@ -1150,7 +1262,7 @@ if (cmd == PRC_MSGSIZE) notify = tcp_mtudisc; - else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB || + else if (V_icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB || cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip) notify = tcp_drop_syn_sent; /* @@ -1177,8 +1289,8 @@ - offsetof(struct icmp, icmp_ip)); th 
= (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); - INP_INFO_WLOCK(&tcbinfo); - inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport, + INP_INFO_WLOCK(&V_tcbinfo); + inp = in_pcblookup_hash(&V_tcbinfo, faddr, th->th_dport, ip->ip_src, th->th_sport, 0, NULL); if (inp != NULL) { INP_LOCK(inp); @@ -1210,11 +1322,11 @@ if (!mtu) mtu = ip_next_mtu(ip->ip_len, 1); - if (mtu < max(296, (tcp_minmss) + if (mtu < max(296, V_tcp_minmss + sizeof(struct tcpiphdr))) mtu = 0; if (!mtu) - mtu = tcp_mssdflt + mtu = V_tcp_mssdflt + sizeof(struct tcpiphdr); /* * Only cache the the MTU if it @@ -1241,15 +1353,16 @@ #endif syncache_unreach(&inc, th); } - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); } else - in_pcbnotifyall(&tcbinfo, faddr, inetctlerrmap[cmd], notify); + in_pcbnotifyall(&V_tcbinfo, faddr, inetctlerrmap[cmd], notify); } #ifdef INET6 void tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d) { + INIT_VNET_INET(curvnet); struct tcphdr th; struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; struct ip6_hdr *ip6; @@ -1303,7 +1416,7 @@ bzero(&th, sizeof(th)); m_copydata(m, off, sizeof(*thp), (caddr_t)&th); - in6_pcbnotify(&tcbinfo, sa, th.th_dport, + in6_pcbnotify(&V_tcbinfo, sa, th.th_dport, (struct sockaddr *)ip6cp->ip6c_src, th.th_sport, cmd, NULL, notify); @@ -1312,11 +1425,11 @@ inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr; inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr; inc.inc_isipv6 = 1; - INP_INFO_WLOCK(&tcbinfo); + INP_INFO_WLOCK(&V_tcbinfo); syncache_unreach(&inc, &th); - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); } else - in6_pcbnotify(&tcbinfo, sa, 0, (const struct sockaddr *)sa6_src, + in6_pcbnotify(&V_tcbinfo, sa, 0, (const struct sockaddr *)sa6_src, 0, cmd, NULL, notify); } #endif /* INET6 */ @@ -1370,14 +1483,17 @@ #define ISN_STATIC_INCREMENT 4096 #define ISN_RANDOM_INCREMENT (4096 - 1) +#ifndef VIMAGE static u_char isn_secret[32]; static int isn_last_reseed; static u_int32_t isn_offset, isn_offset_old; static 
MD5_CTX isn_ctx; +#endif tcp_seq tcp_new_isn(struct tcpcb *tp) { + INIT_VNET_INET(tp->t_vnet); u_int32_t md5_buffer[4]; tcp_seq new_isn; @@ -1385,37 +1501,37 @@ ISN_LOCK(); /* Seed if this is the first use, reseed if requested. */ - if ((isn_last_reseed == 0) || ((tcp_isn_reseed_interval > 0) && - (((u_int)isn_last_reseed + (u_int)tcp_isn_reseed_interval*hz) + if ((V_isn_last_reseed == 0) || ((V_tcp_isn_reseed_interval > 0) && + (((u_int)V_isn_last_reseed + (u_int)V_tcp_isn_reseed_interval*hz) < (u_int)ticks))) { - read_random(&isn_secret, sizeof(isn_secret)); - isn_last_reseed = ticks; + read_random(&V_isn_secret, sizeof(V_isn_secret)); + V_isn_last_reseed = ticks; } /* Compute the md5 hash and return the ISN. */ - MD5Init(&isn_ctx); - MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short)); - MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short)); + MD5Init(&V_isn_ctx); + MD5Update(&V_isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short)); + MD5Update(&V_isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short)); #ifdef INET6 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) { - MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr, + MD5Update(&V_isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr, sizeof(struct in6_addr)); - MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr, + MD5Update(&V_isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr, sizeof(struct in6_addr)); } else #endif { - MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr, + MD5Update(&V_isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr, sizeof(struct in_addr)); - MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr, + MD5Update(&V_isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr, sizeof(struct in_addr)); } - MD5Update(&isn_ctx, (u_char *) &isn_secret, sizeof(isn_secret)); - MD5Final((u_char *) &md5_buffer, &isn_ctx); + MD5Update(&V_isn_ctx, (u_char *) &V_isn_secret, sizeof(V_isn_secret)); + MD5Final((u_char *) &md5_buffer, &V_isn_ctx); new_isn = (tcp_seq) 
md5_buffer[0]; - isn_offset += ISN_STATIC_INCREMENT + + V_isn_offset += ISN_STATIC_INCREMENT + (arc4random() & ISN_RANDOM_INCREMENT); - new_isn += isn_offset; + new_isn += V_isn_offset; ISN_UNLOCK(); return (new_isn); } @@ -1431,12 +1547,15 @@ u_int32_t projected_offset; ISN_LOCK(); - projected_offset = isn_offset_old + ISN_BYTES_PER_SECOND / 100; + VNET_ITERLOOP_BEGIN(); + INIT_VNET_INET(curvnet); + projected_offset = V_isn_offset_old + ISN_BYTES_PER_SECOND / 100; - if (SEQ_GT(projected_offset, isn_offset)) - isn_offset = projected_offset; + if (SEQ_GT(projected_offset, V_isn_offset)) + V_isn_offset = projected_offset; - isn_offset_old = isn_offset; + V_isn_offset_old = V_isn_offset; + VNET_ITERLOOP_END(); callout_reset(&isn_callout, hz/100, tcp_isn_tick, NULL); ISN_UNLOCK(); } @@ -1450,8 +1569,11 @@ tcp_drop_syn_sent(struct inpcb *inp, int errno) { struct tcpcb *tp; +#ifdef INVARIANTS + INIT_VNET_INET(inp->inp_vnet); +#endif - INP_INFO_WLOCK_ASSERT(&tcbinfo); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_LOCK_ASSERT(inp); if ((inp->inp_vflag & INP_TIMEWAIT) || @@ -1478,6 +1600,7 @@ struct inpcb * tcp_mtudisc(struct inpcb *inp, int errno) { + INIT_VNET_INET(inp->inp_vnet); struct tcpcb *tp; struct socket *so = inp->inp_socket; u_int maxmtu; @@ -1511,9 +1634,9 @@ if (!maxmtu) { tp->t_maxopd = tp->t_maxseg = #ifdef INET6 - isipv6 ? tcp_v6mssdflt : + isipv6 ? V_tcp_v6mssdflt : #endif /* INET6 */ - tcp_mssdflt; + V_tcp_mssdflt; return (inp); } mss = maxmtu - @@ -1562,7 +1685,7 @@ tp->t_maxseg = mss; - tcpstat.tcps_mturesent++; + V_tcpstat.tcps_mturesent++; tp->t_rtttime = 0; tp->snd_nxt = tp->snd_una; tcp_free_sackholes(tp); @@ -1748,6 +1871,7 @@ void tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq) { + INIT_VNET_INET(tp->t_vnet); u_long bw; u_long bwnd; int save_ticks; @@ -1758,7 +1882,8 @@ * If inflight_enable is disabled in the middle of a tcp connection, * make sure snd_bwnd is effectively disabled. 
*/ - if (tcp_inflight_enable == 0 || tp->t_rttlow < tcp_inflight_rttthresh) { + if (V_tcp_inflight_enable == 0 || + tp->t_rttlow < V_tcp_inflight_rttthresh) { tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_bandwidth = 0; return; @@ -1818,7 +1943,7 @@ * no other choice. */ #define USERTT ((tp->t_srtt + tp->t_rttbest) / 2) - bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) + tcp_inflight_stab * tp->t_maxseg / 10; + bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) + V_tcp_inflight_stab * tp->t_maxseg / 10; #undef USERTT if (tcp_inflight_debug > 0) { @@ -1834,10 +1959,10 @@ ); } } - if ((long)bwnd < tcp_inflight_min) - bwnd = tcp_inflight_min; - if (bwnd > tcp_inflight_max) - bwnd = tcp_inflight_max; + if ((long)bwnd < V_tcp_inflight_min) + bwnd = V_tcp_inflight_min; + if (bwnd > V_tcp_inflight_max) + bwnd = V_tcp_inflight_max; if ((long)bwnd < tp->t_maxseg * 2) bwnd = tp->t_maxseg * 2; tp->snd_bwnd = bwnd; @@ -1967,6 +2092,10 @@ static int sysctl_drop(SYSCTL_HANDLER_ARGS) { + INIT_VNET_INET(curvnet); +#ifdef INET6 + INIT_VNET_INET6(curvnet); +#endif /* addrs[0] is a foreign socket, addrs[1] is a local one. 
*/ struct sockaddr_storage addrs[2]; struct inpcb *inp; @@ -2013,10 +2142,10 @@ lin = (struct sockaddr_in *)&addrs[1]; break; } - error = sa6_embedscope(fin6, ip6_use_defzone); + error = sa6_embedscope(fin6, V_ip6_use_defzone); if (error) return (error); - error = sa6_embedscope(lin6, ip6_use_defzone); + error = sa6_embedscope(lin6, V_ip6_use_defzone); if (error) return (error); break; @@ -2031,17 +2160,17 @@ default: return (EINVAL); } - INP_INFO_WLOCK(&tcbinfo); + INP_INFO_WLOCK(&V_tcbinfo); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: - inp = in6_pcblookup_hash(&tcbinfo, &f6, fin6->sin6_port, + inp = in6_pcblookup_hash(&V_tcbinfo, &f6, fin6->sin6_port, &l6, lin6->sin6_port, 0, NULL); break; #endif case AF_INET: - inp = in_pcblookup_hash(&tcbinfo, fin->sin_addr, fin->sin_port, - lin->sin_addr, lin->sin_port, 0, NULL); + inp = in_pcblookup_hash(&V_tcbinfo, fin->sin_addr, + fin->sin_port, lin->sin_addr, lin->sin_port, 0, NULL); break; } if (inp != NULL) { @@ -2068,7 +2197,7 @@ INP_UNLOCK(inp); } else error = ESRCH; - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); return (error); } --- /u/marko/p4/head/src/sys/netinet/tcp_syncache.c 2007-12-27 19:32:57.000000000 +0100 +++ src/sys/netinet/tcp_syncache.c 2008-01-14 19:23:55.000000000 +0100 @@ -37,6 +37,7 @@ #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" +#include "opt_vimage.h" #include #include @@ -53,12 +54,15 @@ #include #include #include +#include #include +#include #include #include +#include #include #include #include @@ -96,6 +100,12 @@ #include +#ifdef TCP_OFFLOAD_DISABLE +#define TOEPCB_ISSET(sc) (0) +#else +#define TOEPCB_ISSET(sc) ((sc)->sc_toepcb != NULL) +#endif + static int tcp_syncookies = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies, CTLFLAG_RW, &tcp_syncookies, 0, @@ -106,65 +116,6 @@ &tcp_syncookiesonly, 0, "Use only TCP SYN cookies"); -#define SYNCOOKIE_SECRET_SIZE 8 /* dwords */ -#define SYNCOOKIE_LIFETIME 16 /* seconds */ - -struct syncache { - 
TAILQ_ENTRY(syncache) sc_hash; - struct in_conninfo sc_inc; /* addresses */ - int sc_rxttime; /* retransmit time */ - u_int16_t sc_rxmits; /* retransmit counter */ - - u_int32_t sc_tsreflect; /* timestamp to reflect */ - u_int32_t sc_ts; /* our timestamp to send */ - u_int32_t sc_tsoff; /* ts offset w/ syncookies */ - u_int32_t sc_flowlabel; /* IPv6 flowlabel */ - tcp_seq sc_irs; /* seq from peer */ - tcp_seq sc_iss; /* our ISS */ - struct mbuf *sc_ipopts; /* source route */ - - u_int16_t sc_peer_mss; /* peer's MSS */ - u_int16_t sc_wnd; /* advertised window */ - u_int8_t sc_ip_ttl; /* IPv4 TTL */ - u_int8_t sc_ip_tos; /* IPv4 TOS */ - u_int8_t sc_requested_s_scale:4, - sc_requested_r_scale:4; - u_int8_t sc_flags; -#define SCF_NOOPT 0x01 /* no TCP options */ -#define SCF_WINSCALE 0x02 /* negotiated window scaling */ -#define SCF_TIMESTAMP 0x04 /* negotiated timestamps */ - /* MSS is implicit */ -#define SCF_UNREACH 0x10 /* icmp unreachable received */ -#define SCF_SIGNATURE 0x20 /* send MD5 digests */ -#define SCF_SACK 0x80 /* send SACK option */ -#ifndef TCP_OFFLOAD_DISABLE - struct toe_usrreqs *sc_tu; /* TOE operations */ - void *sc_toepcb; /* TOE protocol block */ -#endif -#ifdef MAC - struct label *sc_label; /* MAC label reference */ -#endif -}; - -#ifdef TCP_OFFLOAD_DISABLE -#define TOEPCB_ISSET(sc) (0) -#else -#define TOEPCB_ISSET(sc) ((sc)->sc_toepcb != NULL) -#endif - - -struct syncache_head { - struct mtx sch_mtx; - TAILQ_HEAD(sch_head, syncache) sch_bucket; - struct callout sch_timer; - int sch_nextc; - u_int sch_length; - u_int sch_oddeven; - u_int32_t sch_secbits_odd[SYNCOOKIE_SECRET_SIZE]; - u_int32_t sch_secbits_even[SYNCOOKIE_SECRET_SIZE]; - u_int sch_reseed; /* time_uptime, seconds */ -}; - static void syncache_drop(struct syncache *, struct syncache_head *); static void syncache_free(struct syncache *); static void syncache_insert(struct syncache *, struct syncache_head *); @@ -193,50 +144,47 @@ #define TCP_SYNCACHE_HASHSIZE 512 #define 
TCP_SYNCACHE_BUCKETLIMIT 30 -struct tcp_syncache { - struct syncache_head *hashbase; - uma_zone_t zone; - u_int hashsize; - u_int hashmask; - u_int bucket_limit; - u_int cache_count; /* XXX: unprotected */ - u_int cache_limit; - u_int rexmt_limit; - u_int hash_secret; -}; +#ifndef VIMAGE static struct tcp_syncache tcp_syncache; +int tcp_sc_rst_sock_fail; +#endif SYSCTL_NODE(_net_inet_tcp, OID_AUTO, syncache, CTLFLAG_RW, 0, "TCP SYN cache"); -SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, bucketlimit, CTLFLAG_RDTUN, - &tcp_syncache.bucket_limit, 0, "Per-bucket hash limit for syncache"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_syncache, OID_AUTO, + bucketlimit, CTLFLAG_RDTUN, + tcp_syncache.bucket_limit, 0, "Per-bucket hash limit for syncache"); -SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, cachelimit, CTLFLAG_RDTUN, - &tcp_syncache.cache_limit, 0, "Overall entry limit for syncache"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_syncache, OID_AUTO, + cachelimit, CTLFLAG_RDTUN, + tcp_syncache.cache_limit, 0, "Overall entry limit for syncache"); -SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, count, CTLFLAG_RD, - &tcp_syncache.cache_count, 0, "Current number of entries in syncache"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_syncache, OID_AUTO, + count, CTLFLAG_RD, + tcp_syncache.cache_count, 0, "Current number of entries in syncache"); -SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, hashsize, CTLFLAG_RDTUN, - &tcp_syncache.hashsize, 0, "Size of TCP syncache hashtable"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_syncache, OID_AUTO, + hashsize, CTLFLAG_RDTUN, + tcp_syncache.hashsize, 0, "Size of TCP syncache hashtable"); -SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, rexmtlimit, CTLFLAG_RW, - &tcp_syncache.rexmt_limit, 0, "Limit on SYN/ACK retransmissions"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_syncache, OID_AUTO, + rexmtlimit, CTLFLAG_RW, + tcp_syncache.rexmt_limit, 0, "Limit on SYN/ACK retransmissions"); -int tcp_sc_rst_sock_fail = 1; 
-SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, rst_on_sock_fail, CTLFLAG_RW, - &tcp_sc_rst_sock_fail, 0, "Send reset on socket allocation failure"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_syncache, OID_AUTO, + rst_on_sock_fail, CTLFLAG_RW, + tcp_sc_rst_sock_fail, 0, "Send reset on socket allocation failure"); static MALLOC_DEFINE(M_SYNCACHE, "syncache", "TCP syncache"); #define SYNCACHE_HASH(inc, mask) \ - ((tcp_syncache.hash_secret ^ \ + ((V_tcp_syncache.hash_secret ^ \ (inc)->inc_faddr.s_addr ^ \ ((inc)->inc_faddr.s_addr >> 16) ^ \ (inc)->inc_fport ^ (inc)->inc_lport) & mask) #define SYNCACHE_HASH6(inc, mask) \ - ((tcp_syncache.hash_secret ^ \ + ((V_tcp_syncache.hash_secret ^ \ (inc)->inc6_faddr.s6_addr32[0] ^ \ (inc)->inc6_faddr.s6_addr32[3] ^ \ (inc)->inc_fport ^ (inc)->inc_lport) & mask) @@ -260,63 +208,85 @@ static void syncache_free(struct syncache *sc) { + INIT_VNET_INET(curvnet); + if (sc->sc_ipopts) (void) m_free(sc->sc_ipopts); #ifdef MAC mac_syncache_destroy(&sc->sc_label); #endif - uma_zfree(tcp_syncache.zone, sc); + uma_zfree(V_tcp_syncache.zone, sc); } void syncache_init(void) { + INIT_VNET_INET(curvnet); int i; - tcp_syncache.cache_count = 0; - tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE; - tcp_syncache.bucket_limit = TCP_SYNCACHE_BUCKETLIMIT; - tcp_syncache.rexmt_limit = SYNCACHE_MAXREXMTS; - tcp_syncache.hash_secret = arc4random(); + V_tcp_syncache.cache_count = 0; + V_tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE; + V_tcp_syncache.bucket_limit = TCP_SYNCACHE_BUCKETLIMIT; + V_tcp_syncache.rexmt_limit = SYNCACHE_MAXREXMTS; + V_tcp_syncache.hash_secret = arc4random(); + V_tcp_sc_rst_sock_fail = 1; TUNABLE_INT_FETCH("net.inet.tcp.syncache.hashsize", - &tcp_syncache.hashsize); + &V_tcp_syncache.hashsize); TUNABLE_INT_FETCH("net.inet.tcp.syncache.bucketlimit", - &tcp_syncache.bucket_limit); - if (!powerof2(tcp_syncache.hashsize) || tcp_syncache.hashsize == 0) { + &V_tcp_syncache.bucket_limit); + if (!powerof2(V_tcp_syncache.hashsize) || + 
V_tcp_syncache.hashsize == 0) { printf("WARNING: syncache hash size is not a power of 2.\n"); - tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE; + V_tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE; } - tcp_syncache.hashmask = tcp_syncache.hashsize - 1; + V_tcp_syncache.hashmask = V_tcp_syncache.hashsize - 1; /* Set limits. */ - tcp_syncache.cache_limit = - tcp_syncache.hashsize * tcp_syncache.bucket_limit; + V_tcp_syncache.cache_limit = + V_tcp_syncache.hashsize * V_tcp_syncache.bucket_limit; TUNABLE_INT_FETCH("net.inet.tcp.syncache.cachelimit", - &tcp_syncache.cache_limit); + &V_tcp_syncache.cache_limit); /* Allocate the hash table. */ - MALLOC(tcp_syncache.hashbase, struct syncache_head *, - tcp_syncache.hashsize * sizeof(struct syncache_head), + MALLOC(V_tcp_syncache.hashbase, struct syncache_head *, + V_tcp_syncache.hashsize * sizeof(struct syncache_head), M_SYNCACHE, M_WAITOK | M_ZERO); /* Initialize the hash buckets. */ - for (i = 0; i < tcp_syncache.hashsize; i++) { - TAILQ_INIT(&tcp_syncache.hashbase[i].sch_bucket); - mtx_init(&tcp_syncache.hashbase[i].sch_mtx, "tcp_sc_head", + for (i = 0; i < V_tcp_syncache.hashsize; i++) { +#ifdef VIMAGE + V_tcp_syncache.hashbase[i].sch_vnet = curvnet; +#endif + TAILQ_INIT(&V_tcp_syncache.hashbase[i].sch_bucket); + mtx_init(&V_tcp_syncache.hashbase[i].sch_mtx, "tcp_sc_head", NULL, MTX_DEF); - callout_init_mtx(&tcp_syncache.hashbase[i].sch_timer, - &tcp_syncache.hashbase[i].sch_mtx, 0); - tcp_syncache.hashbase[i].sch_length = 0; + callout_init_mtx(&V_tcp_syncache.hashbase[i].sch_timer, + &V_tcp_syncache.hashbase[i].sch_mtx, 0); + V_tcp_syncache.hashbase[i].sch_length = 0; } /* Create the syncache entry zone. */ - tcp_syncache.zone = uma_zcreate("syncache", sizeof(struct syncache), + /* XXX one zone for all vnets should do fine - revisit!!! 
*/ + V_tcp_syncache.zone = uma_zcreate("syncache", sizeof(struct syncache), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); - uma_zone_set_max(tcp_syncache.zone, tcp_syncache.cache_limit); + uma_zone_set_max(V_tcp_syncache.zone, V_tcp_syncache.cache_limit); } +#ifdef VIMAGE +void +syncache_destroy(void) +{ + INIT_VNET_INET(curvnet); + + /* XXX walk the cache, free remaining objects, stop timers */ + + uma_zdestroy(V_tcp_syncache.zone); + FREE(V_tcp_syncache.hashbase, M_SYNCACHE); +} +#endif + /* * Inserts a syncache entry into the specified bucket row. * Locks and unlocks the syncache_head autonomously. @@ -324,6 +294,7 @@ static void syncache_insert(struct syncache *sc, struct syncache_head *sch) { + INIT_VNET_INET(sch->sch_vnet); struct syncache *sc2; SCH_LOCK(sch); @@ -332,12 +303,12 @@ * Make sure that we don't overflow the per-bucket limit. * If the bucket is full, toss the oldest element. */ - if (sch->sch_length >= tcp_syncache.bucket_limit) { + if (sch->sch_length >= V_tcp_syncache.bucket_limit) { KASSERT(!TAILQ_EMPTY(&sch->sch_bucket), ("sch->sch_length incorrect")); sc2 = TAILQ_LAST(&sch->sch_bucket, sch_head); syncache_drop(sc2, sch); - tcpstat.tcps_sc_bucketoverflow++; + V_tcpstat.tcps_sc_bucketoverflow++; } /* Put it into the bucket. */ @@ -351,8 +322,8 @@ SCH_UNLOCK(sch); - tcp_syncache.cache_count++; - tcpstat.tcps_sc_added++; + V_tcp_syncache.cache_count++; + V_tcpstat.tcps_sc_added++; } /* @@ -362,6 +333,7 @@ static void syncache_drop(struct syncache *sc, struct syncache_head *sch) { + INIT_VNET_INET(sch->sch_vnet); SCH_LOCK_ASSERT(sch); @@ -373,7 +345,7 @@ sc->sc_tu->tu_syncache_event(TOE_SC_DROP, sc->sc_toepcb); #endif syncache_free(sc); - tcp_syncache.cache_count--; + V_tcp_syncache.cache_count--; } /* @@ -405,6 +377,8 @@ struct syncache *sc, *nsc; int tick = ticks; char *s; + CURVNET_SET(sch->sch_vnet); + INIT_VNET_INET(sch->sch_vnet); /* NB: syncache_head has already been locked by the callout. 
*/ SCH_LOCK_ASSERT(sch); @@ -429,7 +403,7 @@ sch->sch_nextc = sc->sc_rxttime; continue; } - if (sc->sc_rxmits > tcp_syncache.rexmt_limit) { + if (sc->sc_rxmits > V_tcp_syncache.rexmt_limit) { if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Retransmits exhausted, " "giving up and removing syncache entry\n", @@ -437,7 +411,7 @@ free(s, M_TCPLOG); } syncache_drop(sc, sch); - tcpstat.tcps_sc_stale++; + V_tcpstat.tcps_sc_stale++; continue; } if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { @@ -448,12 +422,13 @@ } (void) syncache_respond(sc); - tcpstat.tcps_sc_retransmitted++; + V_tcpstat.tcps_sc_retransmitted++; syncache_timeout(sc, sch, 0); } if (!TAILQ_EMPTY(&(sch)->sch_bucket)) callout_reset(&(sch)->sch_timer, (sch)->sch_nextc - tick, syncache_timer, (void *)(sch)); + CURVNET_RESTORE(); } /* @@ -463,13 +438,14 @@ struct syncache * syncache_lookup(struct in_conninfo *inc, struct syncache_head **schp) { + INIT_VNET_INET(curvnet); struct syncache *sc; struct syncache_head *sch; #ifdef INET6 if (inc->inc_isipv6) { - sch = &tcp_syncache.hashbase[ - SYNCACHE_HASH6(inc, tcp_syncache.hashmask)]; + sch = &V_tcp_syncache.hashbase[ + SYNCACHE_HASH6(inc, V_tcp_syncache.hashmask)]; *schp = sch; SCH_LOCK(sch); @@ -482,8 +458,8 @@ } else #endif { - sch = &tcp_syncache.hashbase[ - SYNCACHE_HASH(inc, tcp_syncache.hashmask)]; + sch = &V_tcp_syncache.hashbase[ + SYNCACHE_HASH(inc, V_tcp_syncache.hashmask)]; *schp = sch; SCH_LOCK(sch); @@ -510,6 +486,7 @@ void syncache_chkrst(struct in_conninfo *inc, struct tcphdr *th) { + INIT_VNET_INET(curvnet); struct syncache *sc; struct syncache_head *sch; char *s = NULL; @@ -525,7 +502,7 @@ if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Spurious RST with ACK, SYN or " "FIN flag set, segment ignored\n", s, __func__); - tcpstat.tcps_badrst++; + V_tcpstat.tcps_badrst++; goto done; } @@ -542,7 +519,7 @@ log(LOG_DEBUG, "%s; %s: Spurious RST without matching " "syncache entry (possibly 
syncookie only), " "segment ignored\n", s, __func__); - tcpstat.tcps_badrst++; + V_tcpstat.tcps_badrst++; goto done; } @@ -566,12 +543,12 @@ log(LOG_DEBUG, "%s; %s: Our SYN|ACK was rejected, " "connection attempt aborted by remote endpoint\n", s, __func__); - tcpstat.tcps_sc_reset++; + V_tcpstat.tcps_sc_reset++; } else if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: RST with invalid SEQ %u != IRS %u " "(+WND %u), segment ignored\n", s, __func__, th->th_seq, sc->sc_irs, sc->sc_wnd); - tcpstat.tcps_badrst++; + V_tcpstat.tcps_badrst++; } done: @@ -583,6 +560,7 @@ void syncache_badack(struct in_conninfo *inc) { + INIT_VNET_INET(curvnet); struct syncache *sc; struct syncache_head *sch; @@ -590,7 +568,7 @@ SCH_LOCK_ASSERT(sch); if (sc != NULL) { syncache_drop(sc, sch); - tcpstat.tcps_sc_badack++; + V_tcpstat.tcps_sc_badack++; } SCH_UNLOCK(sch); } @@ -598,6 +576,7 @@ void syncache_unreach(struct in_conninfo *inc, struct tcphdr *th) { + INIT_VNET_INET(curvnet); struct syncache *sc; struct syncache_head *sch; @@ -623,7 +602,7 @@ goto done; } syncache_drop(sc, sch); - tcpstat.tcps_sc_unreach++; + V_tcpstat.tcps_sc_unreach++; done: SCH_UNLOCK(sch); } @@ -634,12 +613,13 @@ static struct socket * syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) { + INIT_VNET_INET(lso->so_vnet); struct inpcb *inp = NULL; struct socket *so; struct tcpcb *tp; char *s; - INP_INFO_WLOCK_ASSERT(&tcbinfo); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* * Ok, create the full blown connection, and set things up @@ -654,7 +634,7 @@ * have the peer retransmit its SYN again after its * RTO and try again. 
*/ - tcpstat.tcps_listendrop++; + V_tcpstat.tcps_listendrop++; if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Socket create failed " "due to limits or memory shortage\n", @@ -819,7 +799,7 @@ INP_UNLOCK(inp); - tcpstat.tcps_accepts++; + V_tcpstat.tcps_accepts++; return (so); abort: @@ -841,6 +821,7 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, struct socket **lsop, struct mbuf *m) { + INIT_VNET_INET(curvnet); struct syncache *sc; struct syncache_head *sch; struct syncache scs; @@ -850,7 +831,7 @@ * Global TCP locks are held because we manipulate the PCB lists * and create a new socket. */ - INP_INFO_WLOCK_ASSERT(&tcbinfo); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); KASSERT((th->th_flags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK, ("%s: can handle only ACK", __func__)); @@ -888,7 +869,7 @@ /* Pull out the entry to unlock the bucket row. */ TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash); sch->sch_length--; - tcp_syncache.cache_count--; + V_tcp_syncache.cache_count--; SCH_UNLOCK(sch); } @@ -936,9 +917,9 @@ *lsop = syncache_socket(sc, *lsop, m); if (*lsop == NULL) - tcpstat.tcps_sc_aborted++; + V_tcpstat.tcps_sc_aborted++; else - tcpstat.tcps_sc_completed++; + V_tcpstat.tcps_sc_completed++; if (sc != &scs) syncache_free(sc); @@ -970,6 +951,7 @@ struct inpcb *inp, struct socket **lsop, struct mbuf *m, struct toe_usrreqs *tu, void *toepcb) { + INIT_VNET_INET(inp->inp_vnet); struct tcpcb *tp; struct socket *so; struct syncache *sc = NULL; @@ -986,7 +968,7 @@ #endif struct syncache scs; - INP_INFO_WLOCK_ASSERT(&tcbinfo); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_LOCK_ASSERT(inp); /* listen socket */ KASSERT((th->th_flags & (TH_RST|TH_ACK|TH_SYN)) == TH_SYN, ("%s: unexpected tcp flags", __func__)); @@ -1015,13 +997,13 @@ #ifdef MAC if (mac_syncache_init(&maclabel) != 0) { INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); goto done; } else mac_syncache_create(maclabel, inp); #endif 
INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); /* * Remember the IP options, if any. @@ -1051,7 +1033,7 @@ sc->sc_tu->tu_syncache_event(TOE_SC_ENTRY_PRESENT, sc->sc_toepcb); #endif - tcpstat.tcps_sc_dupsyn++; + V_tcpstat.tcps_sc_dupsyn++; if (ipopts) { /* * If we were remembering a previous source route, @@ -1088,24 +1070,24 @@ if (!TOEPCB_ISSET(sc) && syncache_respond(sc) == 0) { sc->sc_rxmits = 0; syncache_timeout(sc, sch, 1); - tcpstat.tcps_sndacks++; - tcpstat.tcps_sndtotal++; + V_tcpstat.tcps_sndacks++; + V_tcpstat.tcps_sndtotal++; } SCH_UNLOCK(sch); goto done; } - sc = uma_zalloc(tcp_syncache.zone, M_NOWAIT | M_ZERO); + sc = uma_zalloc(V_tcp_syncache.zone, M_NOWAIT | M_ZERO); if (sc == NULL) { /* * The zone allocator couldn't provide more entries. * Treat this as if the cache was full; drop the oldest * entry and insert the new one. */ - tcpstat.tcps_sc_zonefail++; + V_tcpstat.tcps_sc_zonefail++; if ((sc = TAILQ_LAST(&sch->sch_bucket, sch_head)) != NULL) syncache_drop(sc, sch); - sc = uma_zalloc(tcp_syncache.zone, M_NOWAIT | M_ZERO); + sc = uma_zalloc(V_tcp_syncache.zone, M_NOWAIT | M_ZERO); if (sc == NULL) { if (tcp_syncookies) { bzero(&scs, sizeof(scs)); @@ -1151,7 +1133,7 @@ win = imin(win, TCP_MAXWIN); sc->sc_wnd = win; - if (tcp_do_rfc1323) { + if (V_tcp_do_rfc1323) { /* * A timestamp received in a SYN makes * it ok to send timestamp requests and replies. @@ -1234,12 +1216,12 @@ syncache_free(sc); else if (sc != &scs) syncache_insert(sc, sch); /* locks and unlocks sch */ - tcpstat.tcps_sndacks++; - tcpstat.tcps_sndtotal++; + V_tcpstat.tcps_sndacks++; + V_tcpstat.tcps_sndtotal++; } else { if (sc != &scs) syncache_free(sc); - tcpstat.tcps_sc_dropped++; + V_tcpstat.tcps_sc_dropped++; } done: @@ -1258,6 +1240,7 @@ static int syncache_respond(struct syncache *sc) { + INIT_VNET_INET(curvnet); struct ip *ip = NULL; struct mbuf *m; struct tcphdr *th; @@ -1278,7 +1261,7 @@ /* Determine MSS we advertize to other end of connection. 
*/ mssopt = tcp_mssopt(&sc->sc_inc); if (sc->sc_peer_mss) - mssopt = max( min(sc->sc_peer_mss, mssopt), tcp_minmss); + mssopt = max( min(sc->sc_peer_mss, mssopt), V_tcp_minmss); /* XXX: Assume that the entire packet will fit in a header mbuf. */ KASSERT(max_linkhdr + tlen + TCP_MAXOLEN <= MHLEN, @@ -1332,7 +1315,7 @@ * 1) path_mtu_discovery is disabled * 2) the SCF_UNREACH flag has been set */ - if (path_mtu_discovery && ((sc->sc_flags & SCF_UNREACH) == 0)) + if (V_path_mtu_discovery && ((sc->sc_flags & SCF_UNREACH) == 0)) ip->ip_off |= IP_DF; th = (struct tcphdr *)(ip + 1); @@ -1507,6 +1490,7 @@ syncookie_generate(struct syncache_head *sch, struct syncache *sc, u_int32_t *flowlabel) { + INIT_VNET_INET(curvnet); MD5_CTX ctx; u_int32_t md5_buffer[MD5_DIGEST_LENGTH / sizeof(u_int32_t)]; u_int32_t data; @@ -1534,7 +1518,8 @@ off = sc->sc_iss & 0x7; /* iss was randomized before */ /* Maximum segment size calculation. */ - pmss = max( min(sc->sc_peer_mss, tcp_mssopt(&sc->sc_inc)), tcp_minmss); + pmss = max( min(sc->sc_peer_mss, tcp_mssopt(&sc->sc_inc)), + V_tcp_minmss); for (mss = sizeof(tcp_sc_msstab) / sizeof(int) - 1; mss > 0; mss--) if (tcp_sc_msstab[mss] <= pmss) break; @@ -1572,7 +1557,7 @@ sc->sc_tsoff = data - ticks; /* after XOR */ } - tcpstat.tcps_sc_sendcookie++; + V_tcpstat.tcps_sc_sendcookie++; return; } @@ -1581,6 +1566,7 @@ struct syncache *sc, struct tcpopt *to, struct tcphdr *th, struct socket *so) { + INIT_VNET_INET(curvnet); MD5_CTX ctx; u_int32_t md5_buffer[MD5_DIGEST_LENGTH / sizeof(u_int32_t)]; u_int32_t data = 0; @@ -1675,7 +1661,7 @@ sc->sc_rxmits = 0; sc->sc_peer_mss = tcp_sc_msstab[mss]; - tcpstat.tcps_sc_recvcookie++; + V_tcpstat.tcps_sc_recvcookie++; return (sc); } @@ -1688,12 +1674,13 @@ int syncache_pcbcount(void) { + INIT_VNET_INET(curvnet); struct syncache_head *sch; int count, i; - for (count = 0, i = 0; i < tcp_syncache.hashsize; i++) { + for (count = 0, i = 0; i < V_tcp_syncache.hashsize; i++) { /* No need to lock for a read. 
*/ - sch = &tcp_syncache.hashbase[i]; + sch = &V_tcp_syncache.hashbase[i]; count += sch->sch_length; } return count; @@ -1711,13 +1698,14 @@ int syncache_pcblist(struct sysctl_req *req, int max_pcbs, int *pcbs_exported) { + INIT_VNET_INET(curvnet); struct xtcpcb xt; struct syncache *sc; struct syncache_head *sch; int count, error, i; - for (count = 0, error = 0, i = 0; i < tcp_syncache.hashsize; i++) { - sch = &tcp_syncache.hashbase[i]; + for (count = 0, error = 0, i = 0; i < V_tcp_syncache.hashsize; i++) { + sch = &V_tcp_syncache.hashbase[i]; SCH_LOCK(sch); TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) { if (count >= max_pcbs) { --- /u/marko/p4/head/src/sys/netinet/tcp_syncache.h 2007-12-27 19:32:57.000000000 +0100 +++ src/sys/netinet/tcp_syncache.h 2008-01-14 19:23:55.000000000 +0100 @@ -1,6 +1,12 @@ /*- - * Copyright (c) 1982, 1986, 1993, 1994, 1995 - * The Regents of the University of California. All rights reserved. + * Copyright (c) 2001 McAfee, Inc. + * Copyright (c) 2006 Andre Oppermann, Internet Business Solutions AG + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Jonathan Lemon + * and McAfee Research, the Security Research Division of McAfee, Inc. under + * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the + * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -10,14 +16,11 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. 
* - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) @@ -26,27 +29,95 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)tcp_var.h 8.4 (Berkeley) 5/24/95 - * $FreeBSD: src/sys/netinet/tcp_syncache.h,v 1.2 2007/12/12 20:35:59 kmacy Exp $ + * XXX RCS ID */ #ifndef _NETINET_TCP_SYNCACHE_H_ #define _NETINET_TCP_SYNCACHE_H_ #ifdef _KERNEL -void syncache_init(void); -void syncache_unreach(struct in_conninfo *, struct tcphdr *); -int syncache_expand(struct in_conninfo *, struct tcpopt *, - struct tcphdr *, struct socket **, struct mbuf *); -void syncache_add(struct in_conninfo *, struct tcpopt *, - struct tcphdr *, struct inpcb *, struct socket **, struct mbuf *); -void syncache_offload_add(struct in_conninfo *, struct tcpopt *, - struct tcphdr *, struct inpcb *, struct socket **, - struct toe_usrreqs *tu, void *toepcb); -void syncache_chkrst(struct in_conninfo *, struct tcphdr *); -void syncache_badack(struct in_conninfo *); -int syncache_pcbcount(void); -int syncache_pcblist(struct sysctl_req *req, int max_pcbs, int *pcbs_exported); +void syncache_init(void); +#ifdef VIMAGE +void syncache_destroy(void); +#endif +void syncache_unreach(struct in_conninfo *, struct tcphdr *); +int syncache_expand(struct in_conninfo *, struct tcpopt *, + struct tcphdr *, struct socket **, struct mbuf *); +void syncache_add(struct 
in_conninfo *, struct tcpopt *, + struct tcphdr *, struct inpcb *, struct socket **, struct mbuf *); +void syncache_offload_add(struct in_conninfo *, struct tcpopt *, + struct tcphdr *, struct inpcb *, struct socket **, + struct toe_usrreqs *tu, void *toepcb); +void syncache_chkrst(struct in_conninfo *, struct tcphdr *); +void syncache_badack(struct in_conninfo *); +int syncache_pcbcount(void); +int syncache_pcblist(struct sysctl_req *req, int max_pcbs, + int *pcbs_exported); + +struct syncache { + TAILQ_ENTRY(syncache) sc_hash; + struct in_conninfo sc_inc; /* addresses */ + u_long sc_rxttime; /* retransmit time */ + u_int16_t sc_rxmits; /* retransmit counter */ + + u_int32_t sc_tsreflect; /* timestamp to reflect */ + u_int32_t sc_ts; /* our timestamp to send */ + u_int32_t sc_tsoff; /* ts offset w/ syncookies */ + u_int32_t sc_flowlabel; /* IPv6 flowlabel */ + tcp_seq sc_irs; /* seq from peer */ + tcp_seq sc_iss; /* our ISS */ + struct mbuf *sc_ipopts; /* source route */ + + u_int16_t sc_peer_mss; /* peer's MSS */ + u_int16_t sc_wnd; /* advertised window */ + u_int8_t sc_ip_ttl; /* IPv4 TTL */ + u_int8_t sc_ip_tos; /* IPv4 TOS */ + u_int8_t sc_requested_s_scale:4, + sc_requested_r_scale:4; + u_int8_t sc_flags; +#define SCF_NOOPT 0x01 /* no TCP options */ +#define SCF_WINSCALE 0x02 /* negotiated window scaling */ +#define SCF_TIMESTAMP 0x04 /* negotiated timestamps */ + /* MSS is implicit */ +#define SCF_UNREACH 0x10 /* icmp unreachable received */ +#define SCF_SIGNATURE 0x20 /* send MD5 digests */ +#define SCF_SACK 0x80 /* send SACK option */ +#ifndef TCP_OFFLOAD_DISABLE + struct toe_usrreqs *sc_tu; /* TOE operations */ + void *sc_toepcb; /* TOE protocol block */ +#endif +#ifdef MAC + struct label *sc_label; /* MAC label reference */ +#endif +}; + +#define SYNCOOKIE_SECRET_SIZE 8 /* dwords */ +#define SYNCOOKIE_LIFETIME 16 /* seconds */ + +struct syncache_head { + struct mtx sch_mtx; + TAILQ_HEAD(sch_head, syncache) sch_bucket; + struct vnet *sch_vnet; + struct 
callout sch_timer; + int sch_nextc; + u_int sch_length; + u_int sch_oddeven; + u_int32_t sch_secbits_odd[SYNCOOKIE_SECRET_SIZE]; + u_int32_t sch_secbits_even[SYNCOOKIE_SECRET_SIZE]; + u_int sch_reseed; /* time_uptime, seconds */ +}; + +struct tcp_syncache { + struct syncache_head *hashbase; + uma_zone_t zone; + u_int hashsize; + u_int hashmask; + u_int bucket_limit; + u_int cache_count; /* XXX: unprotected */ + u_int cache_limit; + u_int rexmt_limit; + u_int hash_secret; +}; #endif /* _KERNEL */ -#endif /* _NETINET_TCP_SYNCACHE_H_ */ +#endif /* !_NETINET_TCP_SYNCACHE_H_ */ --- /u/marko/p4/head/src/sys/netinet/tcp_timer.c 2007-10-16 13:53:40.000000000 +0200 +++ src/sys/netinet/tcp_timer.c 2007-10-22 18:06:44.000000000 +0200 @@ -34,6 +34,7 @@ #include "opt_inet6.h" #include "opt_tcpdebug.h" +#include "opt_vimage.h" #include #include @@ -45,9 +46,12 @@ #include #include #include +#include +#include #include +#include #include #include #include @@ -124,10 +128,13 @@ tcp_slowtimo(void) { + VNET_ITERLOOP_BEGIN(); + INIT_VNET_INET(vnet_iter); tcp_maxidle = tcp_keepcnt * tcp_keepintvl; - INP_INFO_WLOCK(&tcbinfo); + INP_INFO_WLOCK(&V_tcbinfo); (void) tcp_tw_2msl_scan(0); - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); + VNET_ITERLOOP_END(); } int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] = @@ -151,8 +158,10 @@ { struct tcpcb *tp = xtp; struct inpcb *inp; + CURVNET_SET(tp->t_vnet); + INIT_VNET_INET(tp->t_vnet); - INP_INFO_RLOCK(&tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); inp = tp->t_inpcb; /* * XXXRW: While this assert is in fact correct, bugs in the tcpcb @@ -163,22 +172,25 @@ */ if (inp == NULL) { tcp_timer_race++; - INP_INFO_RUNLOCK(&tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); return; } INP_LOCK(inp); - INP_INFO_RUNLOCK(&tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); if ((inp->inp_vflag & INP_DROPPED) || callout_pending(&tp->t_timers->tt_delack) || !callout_active(&tp->t_timers->tt_delack)) { INP_UNLOCK(inp); + CURVNET_RESTORE(); return; } 
callout_deactivate(&tp->t_timers->tt_delack); tp->t_flags |= TF_ACKNOW; - tcpstat.tcps_delack++; + V_tcpstat.tcps_delack++; (void) tcp_output(tp); INP_UNLOCK(inp); + CURVNET_RESTORE(); } void @@ -186,6 +198,8 @@ { struct tcpcb *tp = xtp; struct inpcb *inp; + CURVNET_SET(tp->t_vnet); + INIT_VNET_INET(tp->t_vnet); #ifdef TCPDEBUG int ostate; @@ -194,7 +208,7 @@ /* * XXXRW: Does this actually happen? */ - INP_INFO_WLOCK(&tcbinfo); + INP_INFO_WLOCK(&V_tcbinfo); inp = tp->t_inpcb; /* * XXXRW: While this assert is in fact correct, bugs in the tcpcb @@ -205,7 +219,8 @@ */ if (inp == NULL) { tcp_timer_race++; - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); return; } INP_LOCK(inp); @@ -213,7 +228,8 @@ if ((inp->inp_vflag & INP_DROPPED) || callout_pending(&tp->t_timers->tt_2msl) || !callout_active(&tp->t_timers->tt_2msl)) { INP_UNLOCK(tp->t_inpcb); - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_2msl); @@ -230,7 +246,7 @@ if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 && tp->t_inpcb && tp->t_inpcb->inp_socket && (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) { - tcpstat.tcps_finwait2_drops++; + V_tcpstat.tcps_finwait2_drops++; tp = tcp_close(tp); } else { if (tp->t_state != TCPS_TIME_WAIT && @@ -248,7 +264,8 @@ #endif if (tp != NULL) INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); } void @@ -257,12 +274,14 @@ struct tcpcb *tp = xtp; struct tcptemp *t_template; struct inpcb *inp; + CURVNET_SET(tp->t_vnet); + INIT_VNET_INET(tp->t_vnet); #ifdef TCPDEBUG int ostate; ostate = tp->t_state; #endif - INP_INFO_WLOCK(&tcbinfo); + INP_INFO_WLOCK(&V_tcbinfo); inp = tp->t_inpcb; /* * XXXRW: While this assert is in fact correct, bugs in the tcpcb @@ -273,14 +292,16 @@ */ if (inp == NULL) { tcp_timer_race++; - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); 
return; } INP_LOCK(inp); if ((inp->inp_vflag & INP_DROPPED) || callout_pending(&tp->t_timers->tt_keep) || !callout_active(&tp->t_timers->tt_keep)) { INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_keep); @@ -288,7 +309,7 @@ * Keep-alive timer went off; send something * or drop connection if idle for too long. */ - tcpstat.tcps_keeptimeo++; + V_tcpstat.tcps_keeptimeo++; if (tp->t_state < TCPS_ESTABLISHED) goto dropit; if ((always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && @@ -307,7 +328,7 @@ * by the protocol spec, this requires the * correspondent TCP to respond. */ - tcpstat.tcps_keepprobe++; + V_tcpstat.tcps_keepprobe++; t_template = tcpip_maketemplate(inp); if (t_template) { tcp_respond(tp, t_template->tt_ipgen, @@ -325,11 +346,12 @@ PRU_SLOWTIMO); #endif INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); return; dropit: - tcpstat.tcps_keepdrops++; + V_tcpstat.tcps_keepdrops++; tp = tcp_drop(tp, ETIMEDOUT); #ifdef TCPDEBUG @@ -339,7 +361,8 @@ #endif if (tp != NULL) INP_UNLOCK(tp->t_inpcb); - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); } void @@ -347,12 +370,14 @@ { struct tcpcb *tp = xtp; struct inpcb *inp; + CURVNET_SET(tp->t_vnet); + INIT_VNET_INET(tp->t_vnet); #ifdef TCPDEBUG int ostate; ostate = tp->t_state; #endif - INP_INFO_WLOCK(&tcbinfo); + INP_INFO_WLOCK(&V_tcbinfo); inp = tp->t_inpcb; /* * XXXRW: While this assert is in fact correct, bugs in the tcpcb @@ -363,14 +388,16 @@ */ if (inp == NULL) { tcp_timer_race++; - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); return; } INP_LOCK(inp); if ((inp->inp_vflag & INP_DROPPED) || callout_pending(&tp->t_timers->tt_persist) || !callout_active(&tp->t_timers->tt_persist)) { INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); return; } 
callout_deactivate(&tp->t_timers->tt_persist); @@ -378,7 +405,7 @@ * Persistance timer into zero window. * Force a byte to be output, if possible. */ - tcpstat.tcps_persisttimeo++; + V_tcpstat.tcps_persisttimeo++; /* * Hack: if the peer is dead/unreachable, we do not * time out if the window is closed. After a full @@ -389,7 +416,7 @@ if (tp->t_rxtshift == TCP_MAXRXTSHIFT && ((ticks - tp->t_rcvtime) >= tcp_maxpersistidle || (ticks - tp->t_rcvtime) >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { - tcpstat.tcps_persistdrop++; + V_tcpstat.tcps_persistdrop++; tp = tcp_drop(tp, ETIMEDOUT); goto out; } @@ -405,13 +432,16 @@ #endif if (tp != NULL) INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); } void tcp_timer_rexmt(void * xtp) { struct tcpcb *tp = xtp; + CURVNET_SET(tp->t_vnet); + INIT_VNET_INET(tp->t_vnet); int rexmt; int headlocked; struct inpcb *inp; @@ -420,7 +450,7 @@ ostate = tp->t_state; #endif - INP_INFO_WLOCK(&tcbinfo); + INP_INFO_WLOCK(&V_tcbinfo); headlocked = 1; inp = tp->t_inpcb; /* @@ -432,14 +462,16 @@ */ if (inp == NULL) { tcp_timer_race++; - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); return; } INP_LOCK(inp); if ((inp->inp_vflag & INP_DROPPED) || callout_pending(&tp->t_timers->tt_rexmt) || !callout_active(&tp->t_timers->tt_rexmt)) { INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_rexmt); @@ -451,12 +483,12 @@ */ if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { tp->t_rxtshift = TCP_MAXRXTSHIFT; - tcpstat.tcps_timeoutdrop++; + V_tcpstat.tcps_timeoutdrop++; tp = tcp_drop(tp, tp->t_softerror ? 
tp->t_softerror : ETIMEDOUT); goto out; } - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); headlocked = 0; if (tp->t_rxtshift == 1) { /* @@ -477,7 +509,7 @@ tp->t_flags &= ~TF_WASFRECOVERY; tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); } - tcpstat.tcps_rexmttimeo++; + V_tcpstat.tcps_rexmttimeo++; if (tp->t_state == TCPS_SYN_SENT) rexmt = TCP_REXMTVAL(tp) * tcp_syn_backoff[tp->t_rxtshift]; else @@ -562,7 +594,8 @@ if (tp != NULL) INP_UNLOCK(inp); if (headlocked) - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); } void --- /u/marko/p4/head/src/sys/netinet/tcp_timewait.c 2007-10-29 17:17:44.000000000 +0100 +++ src/sys/netinet/tcp_timewait.c 2007-12-10 11:26:13.000000000 +0100 @@ -36,6 +36,7 @@ #include "opt_inet6.h" #include "opt_mac.h" #include "opt_tcpdebug.h" +#include "opt_vimage.h" #include #include @@ -50,12 +51,14 @@ #include #include #include +#include #include #include #include +#include #include #include #include @@ -101,7 +104,9 @@ * queue pointers in each tcptw structure, are protected using the global * tcbinfo lock, which must be held over queue iteration and modification. */ +#ifndef VIMAGE static TAILQ_HEAD(, tcptw) twq_2msl; +#endif static void tcp_tw_2msl_reset(struct tcptw *, int); static void tcp_tw_2msl_stop(struct tcptw *); @@ -109,16 +114,17 @@ static int tcptw_auto_size(void) { + INIT_VNET_INET(curvnet); int halfrange; /* * Max out at half the ephemeral port range so that TIME_WAIT * sockets don't tie up too many ephemeral ports. */ - if (ipport_lastauto > ipport_firstauto) - halfrange = (ipport_lastauto - ipport_firstauto) / 2; + if (V_ipport_lastauto > V_ipport_firstauto) + halfrange = (V_ipport_lastauto - V_ipport_firstauto) / 2; else - halfrange = (ipport_firstauto - ipport_lastauto) / 2; + halfrange = (V_ipport_firstauto - V_ipport_lastauto) / 2; /* Protect against goofy port ranges smaller than 32. 
*/ return (imin(imax(halfrange, 32), maxsockets / 5)); } @@ -161,6 +167,14 @@ void tcp_tw_init(void) { + INIT_VNET_INET(curvnet); + + TAILQ_INIT(&V_twq_2msl); + +#ifdef VIMAGE + if (!IS_DEFAULT_VNET(curvnet)) + return; +#endif tcptw_zone = uma_zcreate("tcptw", sizeof(struct tcptw), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); @@ -169,9 +183,22 @@ uma_zone_set_max(tcptw_zone, tcptw_auto_size()); else uma_zone_set_max(tcptw_zone, maxtcptw); - TAILQ_INIT(&twq_2msl); } +#ifdef VIMAGE +void +tcp_tw_destroy(void) +{ + INIT_VNET_INET(curvnet); + struct tcptw *tw; + + INP_INFO_WLOCK(&V_tcbinfo); + while((tw = TAILQ_FIRST(&V_twq_2msl)) != NULL) + tcp_twclose(tw, 0); + INP_INFO_WUNLOCK(&V_tcbinfo); +} +#endif + /* * Move a TCP connection into TIME_WAIT state. * tcbinfo is locked. @@ -180,12 +207,15 @@ void tcp_twstart(struct tcpcb *tp) { +#if defined(INVARIANTS) || defined(INVARIANT_SUPPORT) + INIT_VNET_INET(tp->t_vnet); +#endif struct tcptw *tw; struct inpcb *inp = tp->t_inpcb; int acknow; struct socket *so; - INP_INFO_WLOCK_ASSERT(&tcbinfo); /* tcp_tw_2msl_reset(). */ + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* tcp_tw_2msl_reset(). */ INP_LOCK_ASSERT(inp); if (nolocaltimewait && in_localip(inp->inp_faddr)) { @@ -295,10 +325,11 @@ int tcp_twrecycleable(struct tcptw *tw) { + INIT_VNET_INET(curvnet); tcp_seq new_iss = tw->iss; tcp_seq new_irs = tw->irs; - INP_INFO_WLOCK_ASSERT(&tcbinfo); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); new_iss += (ticks - tw->t_starttime) * (ISN_BYTES_PER_SECOND / hz); new_irs += (ticks - tw->t_starttime) * (MS_ISN_BYTES_PER_SECOND / hz); @@ -317,6 +348,9 @@ tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th, struct mbuf *m, int tlen) { +#if defined(INVARIANTS) || defined(INVARIANT_SUPPORT) + INIT_VNET_INET(curvnet); +#endif struct tcptw *tw; int thflags; tcp_seq seq; @@ -327,7 +361,7 @@ #endif /* tcbinfo lock required for tcp_twclose(), tcp_tw_2msl_reset(). 
*/ - INP_INFO_WLOCK_ASSERT(&tcbinfo); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_LOCK_ASSERT(inp); /* @@ -453,6 +487,7 @@ void tcp_twclose(struct tcptw *tw, int reuse) { + INIT_VNET_INET(curvnet); struct socket *so; struct inpcb *inp; @@ -468,7 +503,7 @@ inp = tw->tw_inpcb; KASSERT((inp->inp_vflag & INP_TIMEWAIT), ("tcp_twclose: !timewait")); KASSERT(intotw(inp) == tw, ("tcp_twclose: inp_ppcb != tw")); - INP_INFO_WLOCK_ASSERT(&tcbinfo); /* tcp_tw_2msl_stop(). */ + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* tcp_tw_2msl_stop(). */ INP_LOCK_ASSERT(inp); tw->tw_inpcb = NULL; @@ -509,7 +544,7 @@ #endif in_pcbfree(inp); } - tcpstat.tcps_closed++; + V_tcpstat.tcps_closed++; crfree(tw->tw_cred); tw->tw_cred = NULL; if (reuse) @@ -520,6 +555,7 @@ int tcp_twrespond(struct tcptw *tw, int flags) { + INIT_VNET_INET(curvnet); struct inpcb *inp = tw->tw_inpcb; struct tcphdr *th; struct mbuf *m; @@ -596,48 +632,51 @@ m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); ip->ip_len = m->m_pkthdr.len; - if (path_mtu_discovery) + if (V_path_mtu_discovery) ip->ip_off |= IP_DF; error = ip_output(m, inp->inp_options, NULL, ((tw->tw_so_options & SO_DONTROUTE) ? 
IP_ROUTETOIF : 0), NULL, inp); } if (flags & TH_ACK) - tcpstat.tcps_sndacks++; + V_tcpstat.tcps_sndacks++; else - tcpstat.tcps_sndctrl++; - tcpstat.tcps_sndtotal++; + V_tcpstat.tcps_sndctrl++; + V_tcpstat.tcps_sndtotal++; return (error); } static void tcp_tw_2msl_reset(struct tcptw *tw, int rearm) { + INIT_VNET_INET(curvnet); - INP_INFO_WLOCK_ASSERT(&tcbinfo); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_LOCK_ASSERT(tw->tw_inpcb); if (rearm) - TAILQ_REMOVE(&twq_2msl, tw, tw_2msl); + TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl); tw->tw_time = ticks + 2 * tcp_msl; - TAILQ_INSERT_TAIL(&twq_2msl, tw, tw_2msl); + TAILQ_INSERT_TAIL(&V_twq_2msl, tw, tw_2msl); } static void tcp_tw_2msl_stop(struct tcptw *tw) { + INIT_VNET_INET(curvnet); - INP_INFO_WLOCK_ASSERT(&tcbinfo); - TAILQ_REMOVE(&twq_2msl, tw, tw_2msl); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl); } struct tcptw * tcp_tw_2msl_scan(int reuse) { + INIT_VNET_INET(curvnet); struct tcptw *tw; - INP_INFO_WLOCK_ASSERT(&tcbinfo); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); for (;;) { - tw = TAILQ_FIRST(&twq_2msl); + tw = TAILQ_FIRST(&V_twq_2msl); if (tw == NULL || (!reuse && tw->tw_time > ticks)) break; INP_LOCK(tw->tw_inpcb); --- /u/marko/p4/head/src/sys/netinet/tcp_usrreq.c 2008-01-28 23:53:56.000000000 +0100 +++ src/sys/netinet/tcp_usrreq.c 2008-02-27 18:26:24.000000000 +0100 @@ -38,6 +38,7 @@ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_tcpdebug.h" +#include "opt_vimage.h" #include #include @@ -53,14 +54,17 @@ #include #include #include +#include #ifdef DDB #include #endif +#include #include #include +#include #include #include #ifdef INET6 @@ -158,8 +162,11 @@ #ifdef INET6 int isipv6 = INP_CHECK_SOCKAF(so, AF_INET6) != 0; #endif +#ifdef INVARIANTS + INIT_VNET_INET(so->so_vnet); +#endif - INP_INFO_WLOCK_ASSERT(&tcbinfo); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_LOCK_ASSERT(inp); KASSERT(so->so_pcb == inp, ("tcp_detach: so_pcb != inp")); @@ -248,16 +255,17 @@ static void 
tcp_usr_detach(struct socket *so) { + INIT_VNET_INET(so->so_vnet); struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_detach: inp == NULL")); - INP_INFO_WLOCK(&tcbinfo); + INP_INFO_WLOCK(&V_tcbinfo); INP_LOCK(inp); KASSERT(inp->inp_socket != NULL, ("tcp_usr_detach: inp_socket == NULL")); tcp_detach(so, inp); - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); } /* @@ -266,6 +274,7 @@ static int tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { + INIT_VNET_INET(so->so_vnet); int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; @@ -283,7 +292,7 @@ return (EAFNOSUPPORT); TCPDEBUG0; - INP_INFO_WLOCK(&tcbinfo); + INP_INFO_WLOCK(&V_tcbinfo); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_bind: inp == NULL")); INP_LOCK(inp); @@ -297,7 +306,7 @@ out: TCPDEBUG2(PRU_BIND); INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); return (error); } @@ -306,6 +315,7 @@ static int tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { + INIT_VNET_INET(so->so_vnet); int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; @@ -323,7 +333,7 @@ return (EAFNOSUPPORT); TCPDEBUG0; - INP_INFO_WLOCK(&tcbinfo); + INP_INFO_WLOCK(&V_tcbinfo); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_bind: inp == NULL")); INP_LOCK(inp); @@ -353,7 +363,7 @@ out: TCPDEBUG2(PRU_BIND); INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); return (error); } #endif /* INET6 */ @@ -364,12 +374,13 @@ static int tcp_usr_listen(struct socket *so, int backlog, struct thread *td) { + INIT_VNET_INET(so->so_vnet); int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; TCPDEBUG0; - INP_INFO_WLOCK(&tcbinfo); + INP_INFO_WLOCK(&V_tcbinfo); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_listen: inp == NULL")); INP_LOCK(inp); @@ -393,7 +404,7 @@ out: TCPDEBUG2(PRU_LISTEN); INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); return (error); } @@ 
-401,12 +412,13 @@ static int tcp6_usr_listen(struct socket *so, int backlog, struct thread *td) { + INIT_VNET_INET(so->so_vnet); int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; TCPDEBUG0; - INP_INFO_WLOCK(&tcbinfo); + INP_INFO_WLOCK(&V_tcbinfo); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_listen: inp == NULL")); INP_LOCK(inp); @@ -433,7 +445,7 @@ out: TCPDEBUG2(PRU_LISTEN); INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); return (error); } #endif /* INET6 */ @@ -448,6 +460,7 @@ static int tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { + INIT_VNET_INET(so->so_vnet); int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; @@ -466,7 +479,7 @@ prison_remote_ip(td->td_ucred, 0, &sinp->sin_addr.s_addr); TCPDEBUG0; - INP_INFO_WLOCK(&tcbinfo); + INP_INFO_WLOCK(&V_tcbinfo); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_connect: inp == NULL")); INP_LOCK(inp); @@ -482,7 +495,7 @@ out: TCPDEBUG2(PRU_CONNECT); INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); return (error); } @@ -490,6 +503,7 @@ static int tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { + INIT_VNET_INET(so->so_vnet); int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; @@ -507,7 +521,7 @@ && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) return (EAFNOSUPPORT); - INP_INFO_WLOCK(&tcbinfo); + INP_INFO_WLOCK(&V_tcbinfo); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_connect: inp == NULL")); INP_LOCK(inp); @@ -543,7 +557,7 @@ out: TCPDEBUG2(PRU_CONNECT); INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); return (error); } #endif /* INET6 */ @@ -562,12 +576,13 @@ static int tcp_usr_disconnect(struct socket *so) { + INIT_VNET_INET(so->so_vnet); struct inpcb *inp; struct tcpcb *tp = NULL; int error = 0; TCPDEBUG0; - INP_INFO_WLOCK(&tcbinfo); + INP_INFO_WLOCK(&V_tcbinfo); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_disconnect: 
inp == NULL")); INP_LOCK(inp); @@ -581,7 +596,7 @@ out: TCPDEBUG2(PRU_DISCONNECT); INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); return (error); } @@ -593,6 +608,7 @@ static int tcp_usr_accept(struct socket *so, struct sockaddr **nam) { + INIT_VNET_INET(so->so_vnet); int error = 0; struct inpcb *inp = NULL; struct tcpcb *tp = NULL; @@ -605,7 +621,7 @@ inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_accept: inp == NULL")); - INP_INFO_RLOCK(&tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); INP_LOCK(inp); if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { error = ECONNABORTED; @@ -625,7 +641,7 @@ out: TCPDEBUG2(PRU_ACCEPT); INP_UNLOCK(inp); - INP_INFO_RUNLOCK(&tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); if (error == 0) *nam = in_sockaddr(port, &addr); return error; @@ -690,12 +706,13 @@ static int tcp_usr_shutdown(struct socket *so) { + INIT_VNET_INET(so->so_vnet); int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; TCPDEBUG0; - INP_INFO_WLOCK(&tcbinfo); + INP_INFO_WLOCK(&V_tcbinfo); inp = sotoinpcb(so); KASSERT(inp != NULL, ("inp == NULL")); INP_LOCK(inp); @@ -712,7 +729,7 @@ out: TCPDEBUG2(PRU_SHUTDOWN); INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); return (error); } @@ -756,6 +773,7 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { + INIT_VNET_INET(so->so_vnet); int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; @@ -775,7 +793,7 @@ * (2) PRUS_EOF is set, resulting in explicit close on the send. */ if ((nam != NULL) || (flags & PRUS_EOF)) { - INP_INFO_WLOCK(&tcbinfo); + INP_INFO_WLOCK(&V_tcbinfo); headlocked = 1; } inp = sotoinpcb(so); @@ -814,7 +832,7 @@ * initialize maxseg/maxopd using peer's cached * MSS. 
*/ - INP_INFO_WLOCK_ASSERT(&tcbinfo); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); #ifdef INET6 if (isipv6) error = tcp6_connect(tp, nam, td); @@ -831,12 +849,12 @@ * Close the send side of the connection after * the data is sent. */ - INP_INFO_WLOCK_ASSERT(&tcbinfo); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); socantsendmore(so); tcp_usrclosed(tp); } if (headlocked) { - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); headlocked = 0; } if (tp != NULL) { @@ -874,7 +892,7 @@ * initialize maxseg/maxopd using peer's cached * MSS. */ - INP_INFO_WLOCK_ASSERT(&tcbinfo); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); #ifdef INET6 if (isipv6) error = tcp6_connect(tp, nam, td); @@ -885,10 +903,10 @@ goto out; tp->snd_wnd = TTCP_CLIENT_SND_WND; tcp_mss(tp, -1); - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); headlocked = 0; } else if (nam) { - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); headlocked = 0; } tp->snd_up = tp->snd_una + so->so_snd.sb_cc; @@ -901,7 +919,7 @@ ((flags & PRUS_EOF) ? 
PRU_SEND_EOF : PRU_SEND)); INP_UNLOCK(inp); if (headlocked) - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); return (error); } @@ -911,6 +929,7 @@ static void tcp_usr_abort(struct socket *so) { + INIT_VNET_INET(so->so_vnet); struct inpcb *inp; struct tcpcb *tp = NULL; TCPDEBUG0; @@ -918,7 +937,7 @@ inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_abort: inp == NULL")); - INP_INFO_WLOCK(&tcbinfo); + INP_INFO_WLOCK(&V_tcbinfo); INP_LOCK(inp); KASSERT(inp->inp_socket != NULL, ("tcp_usr_abort: inp_socket == NULL")); @@ -940,7 +959,7 @@ inp->inp_vflag |= INP_SOCKREF; } INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); } /* @@ -949,6 +968,7 @@ static void tcp_usr_close(struct socket *so) { + INIT_VNET_INET(so->so_vnet); struct inpcb *inp; struct tcpcb *tp = NULL; TCPDEBUG0; @@ -956,7 +976,7 @@ inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_close: inp == NULL")); - INP_INFO_WLOCK(&tcbinfo); + INP_INFO_WLOCK(&V_tcbinfo); INP_LOCK(inp); KASSERT(inp->inp_socket != NULL, ("tcp_usr_close: inp_socket == NULL")); @@ -979,7 +999,7 @@ inp->inp_vflag |= INP_SOCKREF; } INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); } /* @@ -1084,8 +1104,9 @@ struct in_addr laddr; u_short lport; int error; + INIT_VNET_INET(so->so_vnet); - INP_INFO_WLOCK_ASSERT(&tcbinfo); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_LOCK_ASSERT(inp); if (inp->inp_lport == 0) { @@ -1120,7 +1141,7 @@ tp->request_r_scale++; soisconnecting(so); - tcpstat.tcps_connattempt++; + V_tcpstat.tcps_connattempt++; tp->t_state = TCPS_SYN_SENT; tcp_timer_activate(tp, TT_KEEP, tcp_keepinit); tp->iss = tcp_new_isn(tp); @@ -1139,8 +1160,9 @@ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam; struct in6_addr *addr6; int error; + INIT_VNET_INET(so->so_vnet); - INP_INFO_WLOCK_ASSERT(&tcbinfo); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_LOCK_ASSERT(inp); if (inp->inp_lport == 0) { @@ -1183,7 +1205,7 @@ tp->request_r_scale++; soisconnecting(so); - 
tcpstat.tcps_connattempt++; + V_tcpstat.tcps_connattempt++; tp->t_state = TCPS_SYN_SENT; tcp_timer_activate(tp, TT_KEEP, tcp_keepinit); tp->iss = tcp_new_isn(tp); @@ -1251,6 +1273,7 @@ int tcp_ctloutput(struct socket *so, struct sockopt *sopt) { + INIT_VNET_INET(so->so_vnet); int error, opt, optval; struct inpcb *inp; struct tcpcb *tp; @@ -1352,7 +1375,7 @@ INP_LOCK_RECHECK(inp); if (optval > 0 && optval <= tp->t_maxseg && - optval + 40 >= tcp_minmss) + optval + 40 >= V_tcp_minmss) tp->t_maxseg = optval; else error = EINVAL; @@ -1438,6 +1461,7 @@ static int tcp_attach(struct socket *so) { + INIT_VNET_INET(so->so_vnet); struct tcpcb *tp; struct inpcb *inp; int error; @@ -1452,10 +1476,10 @@ } so->so_rcv.sb_flags |= SB_AUTOSIZE; so->so_snd.sb_flags |= SB_AUTOSIZE; - INP_INFO_WLOCK(&tcbinfo); - error = in_pcballoc(so, &tcbinfo); + INP_INFO_WLOCK(&V_tcbinfo); + error = in_pcballoc(so, &V_tcbinfo); if (error) { - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); return (error); } inp = sotoinpcb(so); @@ -1480,12 +1504,12 @@ #ifdef INET6 } #endif - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); return (ENOBUFS); } tp->t_state = TCPS_CLOSED; INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); return (0); } @@ -1502,8 +1526,11 @@ { struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; +#ifdef INVARIANTS + INIT_VNET_INET(so->so_vnet); +#endif - INP_INFO_WLOCK_ASSERT(&tcbinfo); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_LOCK_ASSERT(inp); /* @@ -1540,8 +1567,11 @@ static void tcp_usrclosed(struct tcpcb *tp) { +#ifdef INVARIANTS + INIT_VNET_INET(tp->t_inpcb->inp_vnet); +#endif - INP_INFO_WLOCK_ASSERT(&tcbinfo); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_LOCK_ASSERT(tp->t_inpcb); switch (tp->t_state) { --- /u/marko/p4/head/src/sys/netinet/tcp_var.h 2008-02-27 18:29:13.000000000 +0100 +++ src/sys/netinet/tcp_var.h 2008-02-27 11:49:30.000000000 +0100 @@ -35,6 +35,8 @@ #include +struct vnet; + /* * Kernel variables for 
tcp. */ @@ -48,7 +50,6 @@ struct mbuf *tqe_m; /* mbuf contains packet */ }; LIST_HEAD(tsegqe_head, tseg_qent); -extern int tcp_reass_qsize; extern struct uma_zone *tcp_reass_zone; struct sackblk { @@ -183,6 +184,7 @@ u_char snd_scale; /* window scaling for send window */ u_char rcv_scale; /* window scaling for recv window */ u_char request_r_scale; /* pending window scaling */ + u_char snd_limited; /* segments limited transmitted */ u_int32_t ts_recent; /* timestamp echo data */ u_long ts_recent_age; /* when last updated */ u_int32_t ts_offset; /* our timestamp offset */ @@ -193,7 +195,7 @@ u_long snd_ssthresh_prev; /* ssthresh prior to retransmit */ tcp_seq snd_recover_prev; /* snd_recover prior to retransmit */ u_long t_badrxtwin; /* window for retransmit recovery */ - u_char snd_limited; /* segments limited transmitted */ + struct vnet *t_vnet; /* back pointer to parent vnet */ /* SACK related state */ int snd_numholes; /* number of holes seen by sender */ TAILQ_HEAD(sackhole_head, sackhole) snd_holes; @@ -498,10 +500,10 @@ MALLOC_DECLARE(M_TCPLOG); #endif +#ifndef VIMAGE extern struct inpcbhead tcb; /* head of queue of active tcpcb's */ extern struct inpcbinfo tcbinfo; extern struct tcpstat tcpstat; /* tcp statistics */ -extern int tcp_log_in_vain; extern int tcp_mssdflt; /* XXX */ extern int tcp_minmss; extern int tcp_delack_enabled; @@ -509,9 +511,28 @@ extern int path_mtu_discovery; extern int ss_fltsz; extern int ss_fltsz_local; - extern int tcp_do_sack; /* SACK enabled/disabled */ extern int tcp_sc_rst_sock_fail; /* RST on sock alloc failure */ +extern int tcp_sack_maxholes; +extern int tcp_sack_globalmaxholes; +extern int tcp_sack_globalholes; +extern int tcp_do_tso; +extern int tcp_do_autosndbuf; +extern int tcp_autosndbuf_max; +extern int tcp_autosndbuf_inc; +extern int tcp_autorcvbuf; +extern int tcp_do_autorcvbuf; +extern int tcp_autorcvbuf_inc; +extern int tcp_autorcvbuf_max; +extern int blackhole; +extern int drop_synfin; +extern int tcp_do_rfc3042; 
+extern int tcp_do_rfc3390; +extern int tcp_insecure_rst; +extern int tcp_reass_qsize; +#endif + +extern int tcp_log_in_vain; int tcp_addoptions(struct tcpopt *, u_char *); struct tcpcb * @@ -529,6 +550,7 @@ void tcp_drain(void); void tcp_fasttimo(void); void tcp_init(void); +void tcp_destroy(void); void tcp_fini(void *); char *tcp_log_addrs(struct in_conninfo *, struct tcphdr *, void *, const void *); @@ -549,6 +571,9 @@ void tcp_respond(struct tcpcb *, void *, struct tcphdr *, struct mbuf *, tcp_seq, tcp_seq, int); void tcp_tw_init(void); +#ifdef VIMAGE +void tcp_tw_destroy(void); +#endif void tcp_tw_zone_change(void); int tcp_twcheck(struct inpcb *, struct tcpopt *, struct tcphdr *, struct mbuf *, int); @@ -569,6 +594,7 @@ * All tcp_hc_* functions are IPv4 and IPv6 (via in_conninfo) */ void tcp_hc_init(void); +void tcp_hc_destroy(void); void tcp_hc_get(struct in_conninfo *, struct hc_metrics_lite *); u_long tcp_hc_getmtu(struct in_conninfo *); void tcp_hc_updatemtu(struct in_conninfo *, u_long); --- /u/marko/p4/head/src/sys/netinet/udp_usrreq.c 2007-10-29 17:17:44.000000000 +0100 +++ src/sys/netinet/udp_usrreq.c 2007-12-10 11:26:13.000000000 +0100 @@ -37,6 +37,7 @@ #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" +#include "opt_vimage.h" #include #include @@ -56,12 +57,15 @@ #include #include #include +#include #include +#include #include #include +#include #include #include #include @@ -82,6 +86,7 @@ #ifdef IPSEC #include +#include #endif #include @@ -127,26 +132,32 @@ SYSCTL_ULONG(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW, &udp_recvspace, 0, "Maximum space for incoming UDP datagrams"); +#ifndef VIMAGE struct inpcbhead udb; /* from udp_var.h */ struct inpcbinfo udbinfo; +#endif #ifndef UDBHASHSIZE #define UDBHASHSIZE 16 #endif +#ifndef VIMAGE struct udpstat udpstat; /* from udp_var.h */ -SYSCTL_STRUCT(_net_inet_udp, UDPCTL_STATS, stats, CTLFLAG_RW, &udpstat, - udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)"); +#endif 
+SYSCTL_V_STRUCT(V_NET, vnet_inet, _net_inet_udp, UDPCTL_STATS, stats, + CTLFLAG_RW, udpstat, udpstat, + "UDP statistics (struct udpstat, netinet/udp_var.h)"); static void udp_detach(struct socket *so); static int udp_output(struct inpcb *, struct mbuf *, struct sockaddr *, struct mbuf *, struct thread *); +static struct uma_zone *udp_ipi_zone; + static void udp_zone_change(void *tag) { - - uma_zone_set_max(udbinfo.ipi_zone, maxsockets); + uma_zone_set_max(udp_ipi_zone, maxsockets); } static int @@ -162,20 +173,44 @@ void udp_init(void) { + INIT_VNET_INET(curvnet); - INP_INFO_LOCK_INIT(&udbinfo, "udp"); - LIST_INIT(&udb); - udbinfo.ipi_listhead = &udb; - udbinfo.ipi_hashbase = hashinit(UDBHASHSIZE, M_PCB, - &udbinfo.ipi_hashmask); - udbinfo.ipi_porthashbase = hashinit(UDBHASHSIZE, M_PCB, - &udbinfo.ipi_porthashmask); - udbinfo.ipi_zone = uma_zcreate("udpcb", sizeof(struct inpcb), NULL, +#ifdef VIMAGE + if (IS_DEFAULT_VNET(curvnet)) { +#endif + udp_ipi_zone = uma_zcreate("udpcb", sizeof(struct inpcb), NULL, NULL, udp_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); - uma_zone_set_max(udbinfo.ipi_zone, maxsockets); + uma_zone_set_max(udp_ipi_zone, maxsockets); EVENTHANDLER_REGISTER(maxsockets_change, udp_zone_change, NULL, EVENTHANDLER_PRI_ANY); +#ifdef VIMAGE + } + V_udbinfo.ipi_vnet = curvnet; +#endif + + INP_INFO_LOCK_INIT(&V_udbinfo, "udp"); + LIST_INIT(&V_udb); + V_udbinfo.ipi_listhead = &V_udb; + V_udbinfo.ipi_hashbase = hashinit(UDBHASHSIZE, M_PCB, + &V_udbinfo.ipi_hashmask); + V_udbinfo.ipi_porthashbase = hashinit(UDBHASHSIZE, M_PCB, + &V_udbinfo.ipi_porthashmask); + V_udbinfo.ipi_zone = udp_ipi_zone; +} + +#ifdef VIMAGE +void +udp_destroy(void) +{ + INIT_VNET_INET(curvnet); + + hashdestroy(V_udbinfo.ipi_hashbase, M_PCB, + V_udbinfo.ipi_hashmask); + hashdestroy(V_udbinfo.ipi_porthashbase, M_PCB, + V_udbinfo.ipi_porthashmask); + INP_INFO_LOCK_DESTROY(&V_udbinfo); } +#endif /* * Subroutine of udp_input(), which appends the provided mbuf chain to the @@ -200,8 
+235,9 @@ #ifdef IPSEC /* Check AH/ESP integrity. */ if (ipsec4_in_reject(n, inp)) { + INIT_VNET_IPSEC(curvnet); m_freem(n); - ipsec4stat.in_polvio++; + V_ipsec4stat.in_polvio++; return; } #endif /* IPSEC */ @@ -240,11 +276,12 @@ so = inp->inp_socket; SOCKBUF_LOCK(&so->so_rcv); if (sbappendaddr_locked(&so->so_rcv, append_sa, n, opts) == 0) { + INIT_VNET_INET(so->so_vnet); SOCKBUF_UNLOCK(&so->so_rcv); m_freem(n); if (opts) m_freem(opts); - udpstat.udps_fullsock++; + V_udpstat.udps_fullsock++; } else sorwakeup_locked(so); } @@ -252,6 +289,7 @@ void udp_input(struct mbuf *m, int off) { + INIT_VNET_INET(curvnet); int iphlen = off; struct ip *ip; struct udphdr *uh; @@ -265,7 +303,7 @@ #endif ifp = m->m_pkthdr.rcvif; - udpstat.udps_ipackets++; + V_udpstat.udps_ipackets++; /* * Strip IP options, if any; should skip this, make available to @@ -283,7 +321,7 @@ ip = mtod(m, struct ip *); if (m->m_len < iphlen + sizeof(struct udphdr)) { if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == 0) { - udpstat.udps_hdrops++; + V_udpstat.udps_hdrops++; return; } ip = mtod(m, struct ip *); @@ -313,7 +351,7 @@ len = ntohs((u_short)uh->uh_ulen); if (ip->ip_len != len) { if (len > ip->ip_len || len < sizeof(struct udphdr)) { - udpstat.udps_badlen++; + V_udpstat.udps_badlen++; goto badunlocked; } m_adj(m, len - ip->ip_len); @@ -353,12 +391,12 @@ bcopy(b, ((struct ipovly *)ip)->ih_x1, 9); } if (uh_sum) { - udpstat.udps_badsum++; + V_udpstat.udps_badsum++; m_freem(m); return; } } else - udpstat.udps_nosum++; + V_udpstat.udps_nosum++; #ifdef IPFIREWALL_FORWARD /* @@ -382,14 +420,14 @@ } #endif - INP_INFO_RLOCK(&udbinfo); + INP_INFO_RLOCK(&V_udbinfo); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || in_broadcast(ip->ip_dst, ifp)) { struct inpcb *last; struct ip_moptions *imo; last = NULL; - LIST_FOREACH(inp, &udb, inp_list) { + LIST_FOREACH(inp, &V_udb, inp_list) { if (inp->inp_lport != uh->uh_dport) continue; #ifdef INET6 @@ -464,7 +502,7 @@ __func__); } #endif - udpstat.udps_filtermcast++; 
+ V_udpstat.udps_filtermcast++; blocked++; } } @@ -502,20 +540,20 @@ * to send an ICMP Port Unreachable for a broadcast * or multicast datgram.) */ - udpstat.udps_noportbcast++; + V_udpstat.udps_noportbcast++; goto badheadlocked; } udp_append(last, ip, m, iphlen + sizeof(struct udphdr), &udp_in); INP_UNLOCK(last); - INP_INFO_RUNLOCK(&udbinfo); + INP_INFO_RUNLOCK(&V_udbinfo); return; } /* * Locate pcb for datagram. */ - inp = in_pcblookup_hash(&udbinfo, ip->ip_src, uh->uh_sport, + inp = in_pcblookup_hash(&V_udbinfo, ip->ip_src, uh->uh_sport, ip->ip_dst, uh->uh_dport, 1, ifp); if (inp == NULL) { if (udp_log_in_vain) { @@ -527,9 +565,9 @@ buf, ntohs(uh->uh_dport), inet_ntoa(ip->ip_src), ntohs(uh->uh_sport)); } - udpstat.udps_noport++; + V_udpstat.udps_noport++; if (m->m_flags & (M_BCAST | M_MCAST)) { - udpstat.udps_noportbcast++; + V_udpstat.udps_noportbcast++; goto badheadlocked; } if (udp_blackhole) @@ -539,7 +577,7 @@ *ip = save_ip; ip->ip_len += iphlen; icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0); - INP_INFO_RUNLOCK(&udbinfo); + INP_INFO_RUNLOCK(&V_udbinfo); return; } @@ -551,13 +589,13 @@ goto badheadlocked; udp_append(inp, ip, m, iphlen + sizeof(struct udphdr), &udp_in); INP_UNLOCK(inp); - INP_INFO_RUNLOCK(&udbinfo); + INP_INFO_RUNLOCK(&V_udbinfo); return; badheadlocked: if (inp) INP_UNLOCK(inp); - INP_INFO_RUNLOCK(&udbinfo); + INP_INFO_RUNLOCK(&V_udbinfo); badunlocked: m_freem(m); } @@ -579,6 +617,7 @@ void udp_ctlinput(int cmd, struct sockaddr *sa, void *vip) { + INIT_VNET_INET(curvnet); struct ip *ip = vip; struct udphdr *uh; struct in_addr faddr; @@ -606,8 +645,8 @@ return; if (ip != NULL) { uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2)); - INP_INFO_RLOCK(&udbinfo); - inp = in_pcblookup_hash(&udbinfo, faddr, uh->uh_dport, + INP_INFO_RLOCK(&V_udbinfo); + inp = in_pcblookup_hash(&V_udbinfo, faddr, uh->uh_dport, ip->ip_src, uh->uh_sport, 0, NULL); if (inp != NULL) { INP_LOCK(inp); @@ -616,15 +655,16 @@ } INP_UNLOCK(inp); } - 
INP_INFO_RUNLOCK(&udbinfo); + INP_INFO_RUNLOCK(&V_udbinfo); } else - in_pcbnotifyall(&udbinfo, faddr, inetctlerrmap[cmd], + in_pcbnotifyall(&V_udbinfo, faddr, inetctlerrmap[cmd], udp_notify); } static int udp_pcblist(SYSCTL_HANDLER_ARGS) { + INIT_VNET_INET(curvnet); int error, i, n; struct inpcb *inp, **inp_list; inp_gen_t gencnt; @@ -635,7 +675,7 @@ * resource-intensive to repeat twice on every request. */ if (req->oldptr == 0) { - n = udbinfo.ipi_count; + n = V_udbinfo.ipi_count; req->oldidx = 2 * (sizeof xig) + (n + n/8) * sizeof(struct xinpcb); return (0); @@ -647,10 +687,10 @@ /* * OK, now we're committed to doing something. */ - INP_INFO_RLOCK(&udbinfo); - gencnt = udbinfo.ipi_gencnt; - n = udbinfo.ipi_count; - INP_INFO_RUNLOCK(&udbinfo); + INP_INFO_RLOCK(&V_udbinfo); + gencnt = V_udbinfo.ipi_gencnt; + n = V_udbinfo.ipi_count; + INP_INFO_RUNLOCK(&V_udbinfo); error = sysctl_wire_old_buffer(req, 2 * (sizeof xig) + n * sizeof(struct xinpcb)); @@ -669,8 +709,8 @@ if (inp_list == 0) return (ENOMEM); - INP_INFO_RLOCK(&udbinfo); - for (inp = LIST_FIRST(udbinfo.ipi_listhead), i = 0; inp && i < n; + INP_INFO_RLOCK(&V_udbinfo); + for (inp = LIST_FIRST(V_udbinfo.ipi_listhead), i = 0; inp && i < n; inp = LIST_NEXT(inp, inp_list)) { INP_LOCK(inp); if (inp->inp_gencnt <= gencnt && @@ -678,7 +718,7 @@ inp_list[i++] = inp; INP_UNLOCK(inp); } - INP_INFO_RUNLOCK(&udbinfo); + INP_INFO_RUNLOCK(&V_udbinfo); n = i; error = 0; @@ -706,11 +746,11 @@ * that something happened while we were processing this * request, and it might be necessary to retry. 
*/ - INP_INFO_RLOCK(&udbinfo); - xig.xig_gen = udbinfo.ipi_gencnt; + INP_INFO_RLOCK(&V_udbinfo); + xig.xig_gen = V_udbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; - xig.xig_count = udbinfo.ipi_count; - INP_INFO_RUNLOCK(&udbinfo); + xig.xig_count = V_udbinfo.ipi_count; + INP_INFO_RUNLOCK(&V_udbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); @@ -723,6 +763,7 @@ static int udp_getcred(SYSCTL_HANDLER_ARGS) { + INIT_VNET_INET(curvnet); struct xucred xuc; struct sockaddr_in addrs[2]; struct inpcb *inp; @@ -734,8 +775,8 @@ error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); - INP_INFO_RLOCK(&udbinfo); - inp = in_pcblookup_hash(&udbinfo, addrs[1].sin_addr, addrs[1].sin_port, + INP_INFO_RLOCK(&V_udbinfo); + inp = in_pcblookup_hash(&V_udbinfo, addrs[1].sin_addr, addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, 1, NULL); if (inp == NULL || inp->inp_socket == NULL) { error = ENOENT; @@ -746,7 +787,7 @@ goto out; cru2x(inp->inp_socket->so_cred, &xuc); out: - INP_INFO_RUNLOCK(&udbinfo); + INP_INFO_RUNLOCK(&V_udbinfo); if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); @@ -760,6 +801,7 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { + INIT_VNET_INET(inp->inp_vnet); struct udpiphdr *ui; int len = m->m_pkthdr.len; struct in_addr faddr, laddr; @@ -836,7 +878,7 @@ } if (src.sin_family == AF_INET || addr != NULL) { - INP_INFO_WLOCK(&udbinfo); + INP_INFO_WLOCK(&V_udbinfo); unlock_udbinfo = 1; } else unlock_udbinfo = 0; @@ -965,10 +1007,10 @@ ((struct ip *)ui)->ip_len = sizeof (struct udpiphdr) + len; ((struct ip *)ui)->ip_ttl = inp->inp_ip_ttl; /* XXX */ ((struct ip *)ui)->ip_tos = inp->inp_ip_tos; /* XXX */ - udpstat.udps_opackets++; + V_udpstat.udps_opackets++; if (unlock_udbinfo) - INP_INFO_WUNLOCK(&udbinfo); + INP_INFO_WUNLOCK(&V_udbinfo); error = ip_output(m, inp->inp_options, NULL, ipflags, inp->inp_moptions, inp); 
INP_UNLOCK(inp); @@ -977,7 +1019,7 @@ release: INP_UNLOCK(inp); if (unlock_udbinfo) - INP_INFO_WUNLOCK(&udbinfo); + INP_INFO_WUNLOCK(&V_udbinfo); m_freem(m); return (error); } @@ -985,11 +1027,12 @@ static void udp_abort(struct socket *so) { + INIT_VNET_INET(so->so_vnet); struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_abort: inp == NULL")); - INP_INFO_WLOCK(&udbinfo); + INP_INFO_WLOCK(&V_udbinfo); INP_LOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { in_pcbdisconnect(inp); @@ -997,12 +1040,13 @@ soisdisconnected(so); } INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&udbinfo); + INP_INFO_WUNLOCK(&V_udbinfo); } static int udp_attach(struct socket *so, int proto, struct thread *td) { + INIT_VNET_INET(so->so_vnet); struct inpcb *inp; int error; @@ -1011,17 +1055,17 @@ error = soreserve(so, udp_sendspace, udp_recvspace); if (error) return (error); - INP_INFO_WLOCK(&udbinfo); - error = in_pcballoc(so, &udbinfo); + INP_INFO_WLOCK(&V_udbinfo); + error = in_pcballoc(so, &V_udbinfo); if (error) { - INP_INFO_WUNLOCK(&udbinfo); + INP_INFO_WUNLOCK(&V_udbinfo); return (error); } inp = (struct inpcb *)so->so_pcb; - INP_INFO_WUNLOCK(&udbinfo); + INP_INFO_WUNLOCK(&V_udbinfo); inp->inp_vflag |= INP_IPV4; - inp->inp_ip_ttl = ip_defttl; + inp->inp_ip_ttl = V_ip_defttl; INP_UNLOCK(inp); return (0); } @@ -1029,27 +1073,29 @@ static int udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { + INIT_VNET_INET(so->so_vnet); struct inpcb *inp; int error; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_bind: inp == NULL")); - INP_INFO_WLOCK(&udbinfo); + INP_INFO_WLOCK(&V_udbinfo); INP_LOCK(inp); error = in_pcbbind(inp, nam, td->td_ucred); INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&udbinfo); + INP_INFO_WUNLOCK(&V_udbinfo); return (error); } static void udp_close(struct socket *so) { + INIT_VNET_INET(so->so_vnet); struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_close: inp == NULL")); - INP_INFO_WLOCK(&udbinfo); + INP_INFO_WLOCK(&V_udbinfo); 
INP_LOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { in_pcbdisconnect(inp); @@ -1057,23 +1103,24 @@ soisdisconnected(so); } INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&udbinfo); + INP_INFO_WUNLOCK(&V_udbinfo); } static int udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { + INIT_VNET_INET(so->so_vnet); struct inpcb *inp; int error; struct sockaddr_in *sin; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_connect: inp == NULL")); - INP_INFO_WLOCK(&udbinfo); + INP_INFO_WLOCK(&V_udbinfo); INP_LOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&udbinfo); + INP_INFO_WUNLOCK(&V_udbinfo); return (EISCONN); } sin = (struct sockaddr_in *)nam; @@ -1083,37 +1130,39 @@ if (error == 0) soisconnected(so); INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&udbinfo); + INP_INFO_WUNLOCK(&V_udbinfo); return (error); } static void udp_detach(struct socket *so) { + INIT_VNET_INET(so->so_vnet); struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_detach: inp == NULL")); KASSERT(inp->inp_faddr.s_addr == INADDR_ANY, ("udp_detach: not disconnected")); - INP_INFO_WLOCK(&udbinfo); + INP_INFO_WLOCK(&V_udbinfo); INP_LOCK(inp); in_pcbdetach(inp); in_pcbfree(inp); - INP_INFO_WUNLOCK(&udbinfo); + INP_INFO_WUNLOCK(&V_udbinfo); } static int udp_disconnect(struct socket *so) { + INIT_VNET_INET(so->so_vnet); struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_disconnect: inp == NULL")); - INP_INFO_WLOCK(&udbinfo); + INP_INFO_WLOCK(&V_udbinfo); INP_LOCK(inp); if (inp->inp_faddr.s_addr == INADDR_ANY) { - INP_INFO_WUNLOCK(&udbinfo); + INP_INFO_WUNLOCK(&V_udbinfo); INP_UNLOCK(inp); return (ENOTCONN); } @@ -1124,7 +1173,7 @@ so->so_state &= ~SS_ISCONNECTED; /* XXX */ SOCK_UNLOCK(so); INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&udbinfo); + INP_INFO_WUNLOCK(&V_udbinfo); return (0); } --- /u/marko/p4/head/src/sys/netinet/udp_var.h 2007-08-31 03:48:08.000000000 +0200 +++ src/sys/netinet/udp_var.h 2007-10-16 13:48:34.000000000 +0200 @@ -94,16 
+94,22 @@ SYSCTL_DECL(_net_inet_udp); extern struct pr_usrreqs udp_usrreqs; + +#ifndef VIMAGE extern struct inpcbhead udb; extern struct inpcbinfo udbinfo; +extern struct udpstat udpstat; +#endif extern u_long udp_sendspace; extern u_long udp_recvspace; -extern struct udpstat udpstat; extern int udp_blackhole; extern int udp_log_in_vain; void udp_ctlinput(int, struct sockaddr *, void *); void udp_init(void); +#ifdef VIMAGE +void udp_destroy(void); +#endif void udp_input(struct mbuf *, int); struct inpcb *udp_notify(struct inpcb *inp, int errno); int udp_shutdown(struct socket *so); --- /dev/null 2008-02-27 21:11:00.000000000 +0100 +++ src/sys/netinet/vinet.h 2007-10-05 12:27:19.000000000 +0200 @@ -0,0 +1,311 @@ +/*- + * Copyright (c) 2006 University of Zagreb + * Copyright (c) 2006 FreeBSD Foundation + * + * This software was developed by the University of Zagreb and the + * FreeBSD Foundation under sponsorship by the Stichting NLnet and the + * FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * XXX RCS tag goes here + */ + +#ifndef _NETINET_VINET_H_ +#define _NETINET_VINET_H_ + +#ifdef VIMAGE +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct vnet_inet { + struct in_ifaddrhashhead *_in_ifaddrhashtbl; + struct in_ifaddrhead _in_ifaddrhead; + u_long _in_ifaddrhmask; + struct in_multihead _in_multihead; + + int _arpt_keep; + int _arp_maxtries; + int _useloopback; + int _arp_proxyall; + int _subnetsarelocal; + int _sameprefixcarponly; + + int _ipforwarding; + int _ipfastforward_active; + int _ipsendredirects; + int _ip_defttl; + int _ip_keepfaith; + int _ip_sendsourcequench; + int _ip_do_randomid; + int _ip_checkinterface; + u_short _ip_id; + + uma_zone_t _ipq_zone; + int _nipq; /* Total # of reass queues */ + int _maxnipq; /* Admin. limit on # reass queues. 
*/ + int _maxfragsperpacket; + TAILQ_HEAD(ipqhead, ipq) _ipq[IPREASS_NHASH]; + + struct inpcbhead _tcb; /* head of queue of active tcpcb's */ + struct inpcbinfo _tcbinfo; + struct tcpstat _tcpstat; /* tcp statistics */ + struct tcp_hostcache _tcp_hostcache; + struct callout _tcp_hc_callout; + struct tcp_syncache _tcp_syncache; + TAILQ_HEAD(, tcptw) _twq_2msl; + + int _tcp_sc_rst_sock_fail; + int _tcp_mssdflt; + int _tcp_v6mssdflt; + int _tcp_minmss; + int _tcp_do_rfc1323; + int _icmp_may_rst; + int _tcp_isn_reseed_interval; + int _tcp_inflight_enable; + int _tcp_inflight_rttthresh; + int _tcp_inflight_min; + int _tcp_inflight_max; + int _tcp_inflight_stab; + int _nolocaltimewait; + int _path_mtu_discovery; + int _ss_fltsz; + int _ss_fltsz_local; + int _tcp_do_newreno; + int _tcp_do_tso; + int _tcp_do_autosndbuf; + int _tcp_autosndbuf_inc; + int _tcp_autosndbuf_max; + int _tcp_do_sack; + int _tcp_sack_maxholes; + int _tcp_sack_globalmaxholes; + int _tcp_sack_globalholes; + int _blackhole; + int _tcp_delack_enabled; + int _drop_synfin; + int _tcp_do_rfc3042; + int _tcp_do_rfc3390; + int _tcp_insecure_rst; + int _tcp_do_autorcvbuf; + int _tcp_autorcvbuf_inc; + int _tcp_autorcvbuf_max; + int _tcp_reass_maxseg; + int _tcp_reass_qsize; + int _tcp_reass_maxqlen; + int _tcp_reass_overflows; + + u_char _isn_secret[32]; + int _isn_last_reseed; + u_int32_t _isn_offset; + u_int32_t _isn_offset_old; + MD5_CTX _isn_ctx; + + struct inpcbhead _udb; + struct inpcbinfo _udbinfo; + struct udpstat _udpstat; + + struct inpcbhead _ripcb; + struct inpcbinfo _ripcbinfo; + struct socket *_ip_mrouter; + + struct socket *_ip_rsvpd; + int _ip_rsvp_on; + int _rsvp_on; + + struct icmpstat _icmpstat; + struct ipstat _ipstat; + struct igmpstat _igmpstat; + + SLIST_HEAD(, router_info) _router_info_head; + + int _rtq_timeout; + int _rtq_reallyold; + int _rtq_minreallyold; + int _rtq_toomany; + struct callout _rtq_timer; + + int _ipport_lowfirstauto; + int _ipport_lowlastauto; + int 
_ipport_firstauto; + int _ipport_lastauto; + int _ipport_hifirstauto; + int _ipport_hilastauto; + int _ipport_reservedhigh; + int _ipport_reservedlow; + int _ipport_randomized; + int _ipport_randomcps; + int _ipport_randomtime; + int _ipport_stoprandom; + int _ipport_tcpallocs; + int _ipport_tcplastcount; +}; +#endif + + +/* + * Symbol translation macros + */ +#define INIT_VNET_INET(vnet) \ + INIT_FROM_VNET(vnet, VNET_MOD_INET, struct vnet_inet, vnet_inet) + +#define VNET_INET(sym) VSYM(vnet_inet, sym) + +#define V_in_ifaddrhead VNET_INET(in_ifaddrhead) +#define V_in_ifaddrhashtbl VNET_INET(in_ifaddrhashtbl) +#define V_in_ifaddrhmask VNET_INET(in_ifaddrhmask) +#define V_in_multihead VNET_INET(in_multihead) + +#define V_llinfo_arp VNET_INET(llinfo_arp) +#define V_arpt_prune VNET_INET(arpt_prune) +#define V_arpt_keep VNET_INET(arpt_keep) +#define V_arp_maxtries VNET_INET(arp_maxtries) +#define V_useloopback VNET_INET(useloopback) +#define V_arp_proxyall VNET_INET(arp_proxyall) +#define V_subnetsarelocal VNET_INET(subnetsarelocal) +#define V_sameprefixcarponly VNET_INET(sameprefixcarponly) + +#define V_ipforwarding VNET_INET(ipforwarding) +#define V_ipfastforward_active VNET_INET(ipfastforward_active) +#define V_ipsendredirects VNET_INET(ipsendredirects) +#define V_ip_defttl VNET_INET(ip_defttl) +#define V_ip_keepfaith VNET_INET(ip_keepfaith) +#define V_ip_sendsourcequench VNET_INET(ip_sendsourcequench) +#define V_ip_id VNET_INET(ip_id) +#define V_ip_do_randomid VNET_INET(ip_do_randomid) +#define V_ip_checkinterface VNET_INET(ip_checkinterface) + +#define V_ipq VNET_INET(ipq) +#define V_ipq_zone VNET_INET(ipq_zone) +#define V_nipq VNET_INET(nipq) +#define V_maxnipq VNET_INET(maxnipq) +#define V_maxfragsperpacket VNET_INET(maxfragsperpacket) + +#define V_tcb VNET_INET(tcb) +#define V_tcbinfo VNET_INET(tcbinfo) +#define V_tcpstat VNET_INET(tcpstat) +#define V_twq_2msl VNET_INET(twq_2msl) +#define V_tcp_hostcache VNET_INET(tcp_hostcache) +#define V_tcp_hc_callout 
VNET_INET(tcp_hc_callout) +#define V_tcp_syncache VNET_INET(tcp_syncache) +#define V_tcp_sc_rst_sock_fail VNET_INET(tcp_sc_rst_sock_fail) + +#define V_tcp_mssdflt VNET_INET(tcp_mssdflt) +#define V_tcp_v6mssdflt VNET_INET(tcp_v6mssdflt) +#define V_tcp_minmss VNET_INET(tcp_minmss) +#define V_tcp_do_rfc1323 VNET_INET(tcp_do_rfc1323) +#define V_icmp_may_rst VNET_INET(icmp_may_rst) +#define V_tcp_isn_reseed_interval VNET_INET(tcp_isn_reseed_interval) +#define V_tcp_inflight_enable VNET_INET(tcp_inflight_enable) +#define V_tcp_inflight_rttthresh VNET_INET(tcp_inflight_rttthresh) +#define V_tcp_inflight_min VNET_INET(tcp_inflight_min) +#define V_tcp_inflight_max VNET_INET(tcp_inflight_max) +#define V_tcp_inflight_stab VNET_INET(tcp_inflight_stab) +#define V_nolocaltimewait VNET_INET(nolocaltimewait) +#define V_path_mtu_discovery VNET_INET(path_mtu_discovery) +#define V_ss_fltsz VNET_INET(ss_fltsz) +#define V_ss_fltsz_local VNET_INET(ss_fltsz_local) +#define V_tcp_do_newreno VNET_INET(tcp_do_newreno) +#define V_tcp_do_tso VNET_INET(tcp_do_tso) +#define V_tcp_do_autosndbuf VNET_INET(tcp_do_autosndbuf) +#define V_tcp_autosndbuf_inc VNET_INET(tcp_autosndbuf_inc) +#define V_tcp_autosndbuf_max VNET_INET(tcp_autosndbuf_max) +#define V_tcp_do_sack VNET_INET(tcp_do_sack) +#define V_tcp_sack_maxholes VNET_INET(tcp_sack_maxholes) +#define V_tcp_sack_globalmaxholes VNET_INET(tcp_sack_globalmaxholes) +#define V_tcp_sack_globalholes VNET_INET(tcp_sack_globalholes) +#define V_blackhole VNET_INET(blackhole) +#define V_tcp_delack_enabled VNET_INET(tcp_delack_enabled) +#define V_drop_synfin VNET_INET(drop_synfin) +#define V_tcp_do_rfc3042 VNET_INET(tcp_do_rfc3042) +#define V_tcp_do_rfc3390 VNET_INET(tcp_do_rfc3390) +#define V_tcp_insecure_rst VNET_INET(tcp_insecure_rst) +#define V_tcp_do_autorcvbuf VNET_INET(tcp_do_autorcvbuf) +#define V_tcp_autorcvbuf_inc VNET_INET(tcp_autorcvbuf_inc) +#define V_tcp_autorcvbuf_max VNET_INET(tcp_autorcvbuf_max) +#define V_tcp_reass_maxseg 
VNET_INET(tcp_reass_maxseg) +#define V_tcp_reass_qsize VNET_INET(tcp_reass_qsize) +#define V_tcp_reass_maxqlen VNET_INET(tcp_reass_maxqlen) +#define V_tcp_reass_overflows VNET_INET(tcp_reass_overflows) + +#define V_isn_secret VNET_INET(isn_secret) +#define V_isn_last_reseed VNET_INET(isn_last_reseed) +#define V_isn_offset VNET_INET(isn_offset) +#define V_isn_offset_old VNET_INET(isn_offset_old) +#define V_isn_ctx VNET_INET(isn_ctx) + +#define V_udb VNET_INET(udb) +#define V_udbinfo VNET_INET(udbinfo) +#define V_udpstat VNET_INET(udpstat) + +#define V_ripcb VNET_INET(ripcb) +#define V_ripcbinfo VNET_INET(ripcbinfo) +#define V_ip_mrouter VNET_INET(ip_mrouter) + +#define V_rsvp_on VNET_INET(rsvp_on) +#define V_ip_rsvp_on VNET_INET(ip_rsvp_on) +#define V_ip_rsvpd VNET_INET(ip_rsvpd) + +#define V_icmpstat VNET_INET(icmpstat) +#define V_ipstat VNET_INET(ipstat) +#define V_igmpstat VNET_INET(igmpstat) + +#define V_router_info_head VNET_INET(router_info_head) + +#define V_rtq_timeout VNET_INET(rtq_timeout) +#define V_rtq_reallyold VNET_INET(rtq_reallyold) +#define V_rtq_minreallyold VNET_INET(rtq_minreallyold) +#define V_rtq_toomany VNET_INET(rtq_toomany) +#define V_rtq_timer VNET_INET(rtq_timer) + +#define V_ipport_lowfirstauto VNET_INET(ipport_lowfirstauto) +#define V_ipport_lowlastauto VNET_INET(ipport_lowlastauto) +#define V_ipport_firstauto VNET_INET(ipport_firstauto) +#define V_ipport_lastauto VNET_INET(ipport_lastauto) +#define V_ipport_hifirstauto VNET_INET(ipport_hifirstauto) +#define V_ipport_hilastauto VNET_INET(ipport_hilastauto) +#define V_ipport_reservedhigh VNET_INET(ipport_reservedhigh) +#define V_ipport_reservedlow VNET_INET(ipport_reservedlow) +#define V_ipport_randomized VNET_INET(ipport_randomized) +#define V_ipport_randomcps VNET_INET(ipport_randomcps) +#define V_ipport_randomtime VNET_INET(ipport_randomtime) +#define V_ipport_stoprandom VNET_INET(ipport_stoprandom) +#define V_ipport_tcpallocs VNET_INET(ipport_tcpallocs) +#define V_ipport_tcplastcount 
VNET_INET(ipport_tcplastcount) + +#endif /* !_NETINET_VINET_H_ */ --- /u/marko/p4/head/src/sys/netinet6/dest6.c 2007-12-27 19:32:57.000000000 +0100 +++ src/sys/netinet6/dest6.c 2008-01-14 19:23:56.000000000 +0100 @@ -34,6 +34,7 @@ #include "opt_inet.h" #include "opt_inet6.h" +#include "opt_vimage.h" #include #include @@ -45,6 +46,8 @@ #include #include #include +#include +#include #include #include @@ -52,6 +55,7 @@ #include #include #include +#include #include #include @@ -61,6 +65,7 @@ int dest6_input(struct mbuf **mp, int *offp, int proto) { + INIT_VNET_INET6(curvnet); struct mbuf *m = *mp; int off = *offp, dstoptlen, optlen; struct ip6_dest *dstopts; @@ -93,7 +98,7 @@ for (optlen = 0; dstoptlen > 0; dstoptlen -= optlen, opt += optlen) { if (*opt != IP6OPT_PAD1 && (dstoptlen < IP6OPT_MINLEN || *(opt + 1) + 2 > dstoptlen)) { - ip6stat.ip6s_toosmall++; + V_ip6stat.ip6s_toosmall++; goto bad; } --- /u/marko/p4/head/src/sys/netinet6/frag6.c 2008-01-15 18:00:34.000000000 +0100 +++ src/sys/netinet6/frag6.c 2008-02-27 11:49:32.000000000 +0100 @@ -32,6 +32,8 @@ #include __FBSDID("$FreeBSD: src/sys/netinet6/frag6.c,v 1.35 2008/01/08 19:08:57 obrien Exp $"); +#include "opt_vimage.h" + #include #include #include @@ -43,6 +45,8 @@ #include #include #include +#include +#include #include #include @@ -50,6 +54,7 @@ #include #include #include +#include #include #include #include /* for ECN definitions */ @@ -72,9 +77,11 @@ /* * These fields all protected by ip6qlock. 
*/ +#ifndef VIMAGE static u_int frag6_nfragpackets; static u_int frag6_nfrags; static struct ip6q ip6q; /* ip6 reassemble queue */ +#endif #define IP6Q_LOCK_INIT() mtx_init(&ip6qlock, "ip6qlock", NULL, MTX_DEF); #define IP6Q_LOCK() mtx_lock(&ip6qlock) @@ -90,23 +97,28 @@ static void frag6_change(void *tag) { + INIT_VNET_INET6(curvnet); - ip6_maxfragpackets = nmbclusters / 4; - ip6_maxfrags = nmbclusters / 4; + V_ip6_maxfragpackets = nmbclusters / 4; + V_ip6_maxfrags = nmbclusters / 4; } void frag6_init(void) { + INIT_VNET_INET6(curvnet); - ip6_maxfragpackets = nmbclusters / 4; - ip6_maxfrags = nmbclusters / 4; + V_ip6q.ip6q_next = V_ip6q.ip6q_prev = &V_ip6q; + V_ip6_maxfragpackets = nmbclusters / 4; + V_ip6_maxfrags = nmbclusters / 4; +#ifdef VIMAGE + if (!IS_DEFAULT_VNET(curvnet)) + return; +#endif EVENTHANDLER_REGISTER(nmbclusters_change, frag6_change, NULL, EVENTHANDLER_PRI_ANY); IP6Q_LOCK_INIT(); - - ip6q.ip6q_next = ip6q.ip6q_prev = &ip6q; } /* @@ -144,6 +156,7 @@ int frag6_input(struct mbuf **mp, int *offp, int proto) { + INIT_VNET_INET6(curvnet); struct mbuf *m = *mp, *t; struct ip6_hdr *ip6; struct ip6_frag *ip6f; @@ -203,7 +216,7 @@ return IPPROTO_DONE; } - ip6stat.ip6s_fragments++; + V_ip6stat.ip6s_fragments++; in6_ifstat_inc(dstifp, ifs6_reass_reqd); /* offset now points to data portion */ @@ -216,18 +229,18 @@ * If maxfrag is 0, never accept fragments. * If maxfrag is -1, accept all fragments without limitation. 
*/ - if (ip6_maxfrags < 0) + if (V_ip6_maxfrags < 0) ; - else if (frag6_nfrags >= (u_int)ip6_maxfrags) + else if (V_frag6_nfrags >= (u_int)V_ip6_maxfrags) goto dropfrag; - for (q6 = ip6q.ip6q_next; q6 != &ip6q; q6 = q6->ip6q_next) + for (q6 = V_ip6q.ip6q_next; q6 != &V_ip6q; q6 = q6->ip6q_next) if (ip6f->ip6f_ident == q6->ip6q_ident && IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, &q6->ip6q_src) && IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &q6->ip6q_dst)) break; - if (q6 == &ip6q) { + if (q6 == &V_ip6q) { /* * the first fragment to arrive, create a reassembly queue. */ @@ -240,18 +253,18 @@ * If maxfragpackets is -1, accept all fragments without * limitation. */ - if (ip6_maxfragpackets < 0) + if (V_ip6_maxfragpackets < 0) ; - else if (frag6_nfragpackets >= (u_int)ip6_maxfragpackets) + else if (V_frag6_nfragpackets >= (u_int)V_ip6_maxfragpackets) goto dropfrag; - frag6_nfragpackets++; + V_frag6_nfragpackets++; q6 = (struct ip6q *)malloc(sizeof(struct ip6q), M_FTABLE, M_NOWAIT); if (q6 == NULL) goto dropfrag; bzero(q6, sizeof(*q6)); - frag6_insque(q6, &ip6q); + frag6_insque(q6, &V_ip6q); /* ip6q_nxt will be filled afterwards, from 1st fragment */ q6->ip6q_down = q6->ip6q_up = (struct ip6asfrag *)q6; @@ -465,12 +478,12 @@ * the most recently active fragmented packet. 
*/ frag6_enq(ip6af, af6->ip6af_up); - frag6_nfrags++; + V_frag6_nfrags++; q6->ip6q_nfrag++; #if 0 /* xxx */ - if (q6 != ip6q.ip6q_next) { + if (q6 != V_ip6q.ip6q_next) { frag6_remque(q6); - frag6_insque(q6, &ip6q); + frag6_insque(q6, &V_ip6q); } #endif next = 0; @@ -528,9 +541,9 @@ /* this comes with no copy if the boundary is on cluster */ if ((t = m_split(m, offset, M_DONTWAIT)) == NULL) { frag6_remque(q6); - frag6_nfrags -= q6->ip6q_nfrag; + V_frag6_nfrags -= q6->ip6q_nfrag; free(q6, M_FTABLE); - frag6_nfragpackets--; + V_frag6_nfragpackets--; goto dropfrag; } m_adj(t, sizeof(struct ip6_frag)); @@ -546,9 +559,9 @@ } frag6_remque(q6); - frag6_nfrags -= q6->ip6q_nfrag; + V_frag6_nfrags -= q6->ip6q_nfrag; free(q6, M_FTABLE); - frag6_nfragpackets--; + V_frag6_nfragpackets--; if (m->m_flags & M_PKTHDR) { /* Isn't it always true? */ int plen = 0; @@ -557,7 +570,7 @@ m->m_pkthdr.len = plen; } - ip6stat.ip6s_reassembled++; + V_ip6stat.ip6s_reassembled++; in6_ifstat_inc(dstifp, ifs6_reass_ok); /* @@ -573,7 +586,7 @@ dropfrag: IP6Q_UNLOCK(); in6_ifstat_inc(dstifp, ifs6_reass_fail); - ip6stat.ip6s_fragdropped++; + V_ip6stat.ip6s_fragdropped++; m_freem(m); return IPPROTO_DONE; } @@ -585,6 +598,7 @@ void frag6_freef(struct ip6q *q6) { + INIT_VNET_INET6(curvnet); struct ip6asfrag *af6, *down6; IP6Q_LOCK_ASSERT(); @@ -617,9 +631,9 @@ free(af6, M_FTABLE); } frag6_remque(q6); - frag6_nfrags -= q6->ip6q_nfrag; + V_frag6_nfrags -= q6->ip6q_nfrag; free(q6, M_FTABLE); - frag6_nfragpackets--; + V_frag6_nfragpackets--; } /* @@ -688,13 +702,15 @@ #endif IP6Q_LOCK(); - q6 = ip6q.ip6q_next; + VNET_ITERLOOP_BEGIN() + INIT_VNET_INET6(curvnet); + q6 = V_ip6q.ip6q_next; if (q6) - while (q6 != &ip6q) { + while (q6 != &V_ip6q) { --q6->ip6q_ttl; q6 = q6->ip6q_next; if (q6->ip6q_prev->ip6q_ttl == 0) { - ip6stat.ip6s_fragtimeout++; + V_ip6stat.ip6s_fragtimeout++; /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */ frag6_freef(q6->ip6q_prev); } @@ -704,12 +720,13 @@ * (due to the limit being lowered), 
drain off * enough to get down to the new limit. */ - while (frag6_nfragpackets > (u_int)ip6_maxfragpackets && - ip6q.ip6q_prev) { - ip6stat.ip6s_fragoverflow++; + while (V_frag6_nfragpackets > (u_int)V_ip6_maxfragpackets && + V_ip6q.ip6q_prev) { + V_ip6stat.ip6s_fragoverflow++; /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */ - frag6_freef(ip6q.ip6q_prev); + frag6_freef(V_ip6q.ip6q_prev); } + VNET_ITERLOOP_END() IP6Q_UNLOCK(); #if 0 @@ -718,9 +735,9 @@ * make sure we notice eventually, even if forwarding only for one * destination and the cache is never replaced. */ - if (ip6_forward_rt.ro_rt) { - RTFREE(ip6_forward_rt.ro_rt); - ip6_forward_rt.ro_rt = 0; + if (V_ip6_forward_rt.ro_rt) { + RTFREE(V_ip6_forward_rt.ro_rt); + V_ip6_forward_rt.ro_rt = 0; } if (ipsrcchk_rt.ro_rt) { RTFREE(ipsrcchk_rt.ro_rt); @@ -738,10 +755,13 @@ if (IP6Q_TRYLOCK() == 0) return; - while (ip6q.ip6q_next != &ip6q) { - ip6stat.ip6s_fragdropped++; + VNET_ITERLOOP_BEGIN() + INIT_VNET_INET6(curvnet); + while (V_ip6q.ip6q_next != &V_ip6q) { + V_ip6stat.ip6s_fragdropped++; /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */ - frag6_freef(ip6q.ip6q_next); + frag6_freef(V_ip6q.ip6q_next); } + VNET_ITERLOOP_END() IP6Q_UNLOCK(); } --- /u/marko/p4/head/src/sys/netinet6/icmp6.c 2008-01-15 18:01:01.000000000 +0100 +++ src/sys/netinet6/icmp6.c 2008-02-27 11:49:33.000000000 +0100 @@ -66,6 +66,7 @@ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" +#include "opt_vimage.h" #include #include @@ -81,18 +82,22 @@ #include #include #include +#include +#include #include #include #include #include +#include #include #include #include #include #include #include +#include #include #include #include @@ -112,10 +117,14 @@ extern struct inpcbinfo ripcbinfo; extern struct inpcbhead ripcb; +#ifndef VIMAGE extern int icmp6errppslim; -static int icmp6errpps_count = 0; +static int icmp6errpps_count; +#endif /* !VIMAGE */ static struct timeval icmp6errppslim_last; +#ifndef VIMAGE extern int icmp6_nodeinfo; 
+#endif /* !VIMAGE */ static void icmp6_errcount(struct icmp6errstat *, int, int); static int icmp6_rip6_input(struct mbuf **, int); @@ -135,6 +144,8 @@ void icmp6_init(void) { + INIT_VNET_INET6(curvnet); + V_icmp6errpps_count = 0; mld6_init(); } @@ -203,6 +214,7 @@ icmp6_error2(struct mbuf *m, int type, int code, int param, struct ifnet *ifp) { + INIT_VNET_INET6(curvnet); struct ip6_hdr *ip6; if (ifp == NULL) @@ -234,20 +246,21 @@ void icmp6_error(struct mbuf *m, int type, int code, int param) { + INIT_VNET_INET6(curvnet); struct ip6_hdr *oip6, *nip6; struct icmp6_hdr *icmp6; u_int preplen; int off; int nxt; - icmp6stat.icp6s_error++; + V_icmp6stat.icp6s_error++; /* count per-type-code statistics */ - icmp6_errcount(&icmp6stat.icp6s_outerrhist, type, code); + icmp6_errcount(&V_icmp6stat.icp6s_outerrhist, type, code); #ifdef M_DECRYPTED /*not openbsd*/ if (m->m_flags & M_DECRYPTED) { - icmp6stat.icp6s_canterror++; + V_icmp6stat.icp6s_canterror++; goto freeit; } #endif @@ -305,7 +318,7 @@ IP6_EXTHDR_GET(icp, struct icmp6_hdr *, m, off, sizeof(*icp)); if (icp == NULL) { - icmp6stat.icp6s_tooshort++; + V_icmp6stat.icp6s_tooshort++; return; } #endif @@ -316,7 +329,7 @@ * Special case: for redirect (which is * informational) we must not send icmp6 error. */ - icmp6stat.icp6s_canterror++; + V_icmp6stat.icp6s_canterror++; goto freeit; } else { /* ICMPv6 informational - send the error */ @@ -329,7 +342,7 @@ /* Finally, do rate limitation check. 
*/ if (icmp6_ratelimit(&oip6->ip6_src, type, code)) { - icmp6stat.icp6s_toofreq++; + V_icmp6stat.icp6s_toofreq++; goto freeit; } @@ -370,7 +383,7 @@ */ m->m_pkthdr.rcvif = NULL; - icmp6stat.icp6s_outhist[type]++; + V_icmp6stat.icp6s_outhist[type]++; icmp6_reflect(m, sizeof(struct ip6_hdr)); /* header order: IPv6 - ICMPv6 */ return; @@ -388,6 +401,9 @@ int icmp6_input(struct mbuf **mp, int *offp, int proto) { + INIT_VNET_INET6(curvnet); + /* XXX this below is WRONG - MARKO */ + INIT_VPROCG(TD_TO_VPROCG(curthread)); struct mbuf *m = *mp, *n; struct ip6_hdr *ip6, *nip6; struct icmp6_hdr *icmp6, *nicmp6; @@ -408,7 +424,7 @@ ip6 = mtod(m, struct ip6_hdr *); if (icmp6len < sizeof(struct icmp6_hdr)) { - icmp6stat.icp6s_tooshort++; + V_icmp6stat.icp6s_tooshort++; goto freeit; } @@ -420,7 +436,7 @@ #else IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6)); if (icmp6 == NULL) { - icmp6stat.icp6s_tooshort++; + V_icmp6stat.icp6s_tooshort++; return IPPROTO_DONE; } #endif @@ -431,7 +447,7 @@ "ICMP6 checksum error(%d|%x) %s\n", icmp6->icmp6_type, sum, ip6_sprintf(ip6bufs, &ip6->ip6_src))); - icmp6stat.icp6s_checksum++; + V_icmp6stat.icp6s_checksum++; goto freeit; } @@ -451,7 +467,7 @@ } } - icmp6stat.icp6s_inhist[icmp6->icmp6_type]++; + V_icmp6stat.icp6s_inhist[icmp6->icmp6_type]++; icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_msg); if (icmp6->icmp6_type < ICMP6_INFOMSG_MASK) icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_error); @@ -585,8 +601,8 @@ nicmp6->icmp6_type = ICMP6_ECHO_REPLY; nicmp6->icmp6_code = 0; if (n) { - icmp6stat.icp6s_reflect++; - icmp6stat.icp6s_outhist[ICMP6_ECHO_REPLY]++; + V_icmp6stat.icp6s_reflect++; + V_icmp6stat.icp6s_outhist[ICMP6_ECHO_REPLY]++; icmp6_reflect(n, noff); } break; @@ -631,7 +647,7 @@ { enum { WRU, FQDN } mode; - if (!icmp6_nodeinfo) + if (!V_icmp6_nodeinfo) break; if (icmp6len == sizeof(struct icmp6_hdr) + 4) @@ -641,7 +657,7 @@ else goto badlen; -#define hostnamelen strlen(hostname) +#define hostnamelen strlen(V_hostname) if
(mode == FQDN) { #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_nodeinfo), @@ -660,7 +676,7 @@ * XXX: this combination of flags is pointless, * but should we keep this for compatibility? */ - if ((icmp6_nodeinfo & 5) != 5) + if ((V_icmp6_nodeinfo & 5) != 5) break; if (code != 0) @@ -707,7 +723,7 @@ bcopy(icmp6, nicmp6, sizeof(struct icmp6_hdr)); p = (u_char *)(nicmp6 + 1); bzero(p, 4); - bcopy(hostname, p + 4, maxhlen); /* meaningless TTL */ + bcopy(V_hostname, p + 4, maxhlen); /* meaningless TTL */ noff = sizeof(struct ip6_hdr); n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) + 4 + maxhlen; @@ -716,8 +732,8 @@ } #undef hostnamelen if (n) { - icmp6stat.icp6s_reflect++; - icmp6stat.icp6s_outhist[ICMP6_WRUREPLY]++; + V_icmp6stat.icp6s_reflect++; + V_icmp6stat.icp6s_outhist[ICMP6_WRUREPLY]++; icmp6_reflect(n, noff); } break; @@ -838,11 +854,11 @@ break; badcode: - icmp6stat.icp6s_badcode++; + V_icmp6stat.icp6s_badcode++; break; badlen: - icmp6stat.icp6s_badlen++; + V_icmp6stat.icp6s_badlen++; break; } @@ -859,6 +875,7 @@ static int icmp6_notify_error(struct mbuf **mp, int off, int icmp6len, int code) { + INIT_VNET_INET6(curvnet); struct mbuf *m = *mp; struct icmp6_hdr *icmp6; struct ip6_hdr *eip6; @@ -866,7 +883,7 @@ struct sockaddr_in6 icmp6src, icmp6dst; if (icmp6len < sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr)) { - icmp6stat.icp6s_tooshort++; + V_icmp6stat.icp6s_tooshort++; goto freeit; } #ifndef PULLDOWN_TEST @@ -877,7 +894,7 @@ IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6) + sizeof(struct ip6_hdr)); if (icmp6 == NULL) { - icmp6stat.icp6s_tooshort++; + V_icmp6stat.icp6s_tooshort++; return (-1); } #endif @@ -912,7 +929,7 @@ IP6_EXTHDR_GET(eh, struct ip6_ext *, m, eoff, sizeof(*eh)); if (eh == NULL) { - icmp6stat.icp6s_tooshort++; + V_icmp6stat.icp6s_tooshort++; return (-1); } #endif @@ -940,7 +957,7 @@ IP6_EXTHDR_GET(rth, struct ip6_rthdr *, m, eoff, sizeof(*rth)); if (rth == NULL) { - 
icmp6stat.icp6s_tooshort++; + V_icmp6stat.icp6s_tooshort++; return (-1); } #endif @@ -966,7 +983,7 @@ struct ip6_rthdr0 *, m, eoff, rthlen); if (rth0 == NULL) { - icmp6stat.icp6s_tooshort++; + V_icmp6stat.icp6s_tooshort++; return (-1); } #endif @@ -988,7 +1005,7 @@ IP6_EXTHDR_GET(fh, struct ip6_frag *, m, eoff, sizeof(*fh)); if (fh == NULL) { - icmp6stat.icp6s_tooshort++; + V_icmp6stat.icp6s_tooshort++; return (-1); } #endif @@ -1023,7 +1040,7 @@ IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6) + sizeof(struct ip6_hdr)); if (icmp6 == NULL) { - icmp6stat.icp6s_tooshort++; + V_icmp6stat.icp6s_tooshort++; return (-1); } #endif @@ -1090,6 +1107,7 @@ void icmp6_mtudisc_update(struct ip6ctlparam *ip6cp, int validated) { + INIT_VNET_INET6(curvnet); struct in6_addr *dst = ip6cp->ip6c_finaldst; struct icmp6_hdr *icmp6 = ip6cp->ip6c_icmp6; struct mbuf *m = ip6cp->ip6c_m; /* will be necessary for scope issue */ @@ -1127,7 +1145,7 @@ if (mtu < tcp_maxmtu6(&inc, NULL)) { tcp_hc_updatemtu(&inc, mtu); - icmp6stat.icp6s_pmtuchg++; + V_icmp6stat.icp6s_pmtuchg++; } } @@ -1142,10 +1160,13 @@ * - joins NI group address at in6_ifattach() time only, does not cope * with hostname changes by sethostname(3) */ -#define hostnamelen strlen(hostname) +#define hostnamelen strlen(V_hostname) static struct mbuf * ni6_input(struct mbuf *m, int off) { + INIT_VNET_INET6(curvnet); + /* XXX this below is WRONG - MARKO */ + INIT_VPROCG(TD_TO_VPROCG(curthread)); struct icmp6_nodeinfo *ni6, *nni6; struct mbuf *n = NULL; u_int16_t qtype; @@ -1184,7 +1205,7 @@ * link-local (note that site-local unicast was deprecated and * ULA is defined as global scope-wise) */ - if ((icmp6_nodeinfo & ICMP6_NODEINFO_GLOBALOK) == 0 && + if ((V_icmp6_nodeinfo & ICMP6_NODEINFO_GLOBALOK) == 0 && !IN6_IS_ADDR_LOOPBACK(&ip6->ip6_src) && !IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_src)) goto bad; @@ -1206,7 +1227,7 @@ goto bad; /* XXX impossible */ if ((ia6->ia6_flags & IN6_IFF_TEMPORARY) && - !(icmp6_nodeinfo &
ICMP6_NODEINFO_TMPADDROK)) { + !(V_icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK)) { nd6log((LOG_DEBUG, "ni6_input: ignore node info to " "a temporary address in %s:%d", __FILE__, __LINE__)); @@ -1296,7 +1317,7 @@ * wildcard match, if gethostname(3) side has * truncated hostname. */ - n = ni6_nametodns(hostname, hostnamelen, 0); + n = ni6_nametodns(V_hostname, hostnamelen, 0); if (!n || n->m_next || n->m_len == 0) goto bad; IP6_EXTHDR_GET(subj, char *, m, @@ -1321,12 +1342,12 @@ /* refuse based on configuration. XXX ICMP6_NI_REFUSED? */ switch (qtype) { case NI_QTYPE_FQDN: - if ((icmp6_nodeinfo & ICMP6_NODEINFO_FQDNOK) == 0) + if ((V_icmp6_nodeinfo & ICMP6_NODEINFO_FQDNOK) == 0) goto bad; break; case NI_QTYPE_NODEADDR: case NI_QTYPE_IPV4ADDR: - if ((icmp6_nodeinfo & ICMP6_NODEINFO_NODEADDROK) == 0) + if ((V_icmp6_nodeinfo & ICMP6_NODEINFO_NODEADDROK) == 0) goto bad; break; } @@ -1420,7 +1441,7 @@ /* * XXX do we really have FQDN in variable "hostname"? */ - n->m_next = ni6_nametodns(hostname, hostnamelen, oldfqdn); + n->m_next = ni6_nametodns(V_hostname, hostnamelen, oldfqdn); if (n->m_next == NULL) goto bad; /* XXX we assume that n->m_next is not a chain */ @@ -1627,6 +1648,8 @@ ni6_addrs(struct icmp6_nodeinfo *ni6, struct mbuf *m, struct ifnet **ifpp, struct in6_addr *subj) { + INIT_VNET_NET(curvnet); + INIT_VNET_INET6(curvnet); struct ifnet *ifp; struct in6_ifaddr *ifa6; struct ifaddr *ifa; @@ -1649,7 +1672,7 @@ } IFNET_RLOCK(); - for (ifp = TAILQ_FIRST(&ifnet); ifp; ifp = TAILQ_NEXT(ifp, if_list)) { + for (ifp = TAILQ_FIRST(&V_ifnet); ifp; ifp = TAILQ_NEXT(ifp, if_list)) { addrsofif = 0; TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { if (ifa->ifa_addr->sa_family != AF_INET6) @@ -1696,7 +1719,7 @@ (niflags & NI_NODEADDR_FLAG_ANYCAST) == 0) continue; /* we need only unicast addresses */ if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && - (icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK) == 0) { + (V_icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK) == 0) { continue; } 
addrsofif++; /* count the address */ @@ -1718,7 +1741,9 @@ ni6_store_addrs(struct icmp6_nodeinfo *ni6, struct icmp6_nodeinfo *nni6, struct ifnet *ifp0, int resid) { - struct ifnet *ifp = ifp0 ? ifp0 : TAILQ_FIRST(&ifnet); + INIT_VNET_NET(curvnet); + INIT_VNET_INET6(curvnet); + struct ifnet *ifp = ifp0 ? ifp0 : TAILQ_FIRST(&V_ifnet); struct in6_ifaddr *ifa6; struct ifaddr *ifa; struct ifnet *ifp_dep = NULL; @@ -1782,7 +1807,7 @@ (niflags & NI_NODEADDR_FLAG_ANYCAST) == 0) continue; if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && - (icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK) == 0) { + (V_icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK) == 0) { continue; } @@ -1857,6 +1882,8 @@ static int icmp6_rip6_input(struct mbuf **mp, int off) { + INIT_VNET_INET(curvnet); + INIT_VNET_INET6(curvnet); struct mbuf *m = *mp; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct in6pcb *in6p; @@ -1889,8 +1916,8 @@ return (IPPROTO_DONE); } - INP_INFO_RLOCK(&ripcbinfo); - LIST_FOREACH(in6p, &ripcb, inp_list) { + INP_INFO_RLOCK(&V_ripcbinfo); + LIST_FOREACH(in6p, &V_ripcb, inp_list) { INP_LOCK(in6p); if ((in6p->inp_vflag & INP_IPV6) == 0) { docontinue: @@ -2006,9 +2033,9 @@ INP_UNLOCK(last); } else { m_freem(m); - ip6stat.ip6s_delivered--; + V_ip6stat.ip6s_delivered--; } - INP_INFO_RUNLOCK(&ripcbinfo); + INP_INFO_RUNLOCK(&V_ripcbinfo); return IPPROTO_DONE; } @@ -2019,6 +2046,7 @@ void icmp6_reflect(struct mbuf *m, size_t off) { + INIT_VNET_INET6(curvnet); struct ip6_hdr *ip6; struct icmp6_hdr *icmp6; struct in6_ifaddr *ia; @@ -2151,7 +2179,7 @@ /* XXX: This may not be the outgoing interface */ ip6->ip6_hlim = ND_IFINFO(m->m_pkthdr.rcvif)->chlim; } else - ip6->ip6_hlim = ip6_defhlim; + ip6->ip6_hlim = V_ip6_defhlim; icmp6->icmp6_cksum = 0; icmp6->icmp6_cksum = in6_cksum(m, IPPROTO_ICMPV6, @@ -2198,6 +2226,7 @@ void icmp6_redirect_input(struct mbuf *m, int off) { + INIT_VNET_INET6(curvnet); struct ifnet *ifp; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct nd_redirect *nd_rd; @@ 
-2224,9 +2253,9 @@ return; /* XXX if we are router, we don't update route by icmp6 redirect */ - if (ip6_forwarding) + if (V_ip6_forwarding) goto freeit; - if (!icmp6_rediraccept) + if (!V_icmp6_rediraccept) goto freeit; #ifndef PULLDOWN_TEST @@ -2235,7 +2264,7 @@ #else IP6_EXTHDR_GET(nd_rd, struct nd_redirect *, m, off, icmp6len); if (nd_rd == NULL) { - icmp6stat.icp6s_tooshort++; + V_icmp6stat.icp6s_tooshort++; return; } #endif @@ -2398,13 +2427,14 @@ return; bad: - icmp6stat.icp6s_badredirect++; + V_icmp6stat.icp6s_badredirect++; m_freem(m); } void icmp6_redirect_output(struct mbuf *m0, struct rtentry *rt) { + INIT_VNET_INET6(curvnet); struct ifnet *ifp; /* my outgoing interface */ struct in6_addr *ifp_ll6; struct in6_addr *router_ll6; @@ -2417,10 +2447,10 @@ struct ifnet *outif = NULL; struct sockaddr_in6 src_sa; - icmp6_errcount(&icmp6stat.icp6s_outerrhist, ND_REDIRECT, 0); + icmp6_errcount(&V_icmp6stat.icp6s_outerrhist, ND_REDIRECT, 0); /* if we are not router, we don't send icmp6 redirect */ - if (!ip6_forwarding) + if (!V_ip6_forwarding) goto fail; /* sanity check */ @@ -2672,7 +2702,7 @@ icmp6_ifstat_inc(outif, ifs6_out_msg); icmp6_ifstat_inc(outif, ifs6_out_redirect); } - icmp6stat.icp6s_outhist[ND_REDIRECT]++; + V_icmp6stat.icp6s_outhist[ND_REDIRECT]++; return; @@ -2771,13 +2801,14 @@ icmp6_ratelimit(const struct in6_addr *dst, const int type, const int code) { + INIT_VNET_INET6(curvnet); int ret; ret = 0; /* okay to send */ /* PPS limit */ - if (!ppsratecheck(&icmp6errppslim_last, &icmp6errpps_count, - icmp6errppslim)) { + if (!ppsratecheck(&icmp6errppslim_last, &V_icmp6errpps_count, + V_icmp6errppslim)) { /* The packet is subject to rate limit */ ret++; } --- /u/marko/p4/head/src/sys/netinet6/in6.c 2008-01-28 23:53:56.000000000 +0100 +++ src/sys/netinet6/in6.c 2008-02-27 11:49:35.000000000 +0100 @@ -65,6 +65,7 @@ #include "opt_inet.h" #include "opt_inet6.h" +#include "opt_vimage.h" #include #include @@ -78,7 +79,9 @@ #include #include #include 
+#include +#include #include #include #include @@ -92,6 +95,7 @@ #include #include +#include #include #include #include @@ -230,6 +234,7 @@ void in6_ifremloop(struct ifaddr *ifa) { + INIT_VNET_INET6(curvnet); struct in6_ifaddr *ia; struct rtentry *rt; int ia_count = 0; @@ -249,7 +254,7 @@ * (probably p2p) interfaces. * XXX: we should avoid such a configuration in IPv6... */ - for (ia = in6_ifaddr; ia; ia = ia->ia_next) { + for (ia = V_in6_ifaddr; ia; ia = ia->ia_next) { if (IN6_ARE_ADDR_EQUAL(IFA_IN6(ifa), &ia->ia_addr.sin6_addr)) { ia_count++; if (ia_count > 1) @@ -321,6 +326,7 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { + INIT_VNET_INET6(curvnet); struct in6_ifreq *ifr = (struct in6_ifreq *)data; struct in6_ifaddr *ia = NULL; struct in6_aliasreq *ifra = (struct in6_aliasreq *)data; @@ -733,7 +739,7 @@ * (when required). */ if ((ia->ia6_flags & IN6_IFF_AUTOCONF) && - ip6_use_tempaddr && pr->ndpr_refcnt == 1) { + V_ip6_use_tempaddr && pr->ndpr_refcnt == 1) { int e; if ((e = in6_tmpifadd(ia, 1, 0)) != 0) { log(LOG_NOTICE, "in6_control: failed " @@ -794,6 +800,8 @@ in6_update_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int flags) { + INIT_VNET_INET6(ifp->if_vnet); + INIT_VPROCG(TD_TO_VPROCG(curthread)); /* XXX V_hostname needs this */ int error = 0, hostIsNew = 0, plen = -1; struct in6_ifaddr *oia; struct sockaddr_in6 dst6; @@ -939,12 +947,12 @@ ia->ia_ifa.ifa_netmask = (struct sockaddr *)&ia->ia_prefixmask; ia->ia_ifp = ifp; - if ((oia = in6_ifaddr) != NULL) { + if ((oia = V_in6_ifaddr) != NULL) { for ( ; oia->ia_next; oia = oia->ia_next) continue; oia->ia_next = ia; } else - in6_ifaddr = ia; + V_in6_ifaddr = ia; ia->ia_ifa.ifa_refcnt = 1; TAILQ_INSERT_TAIL(&ifp->if_addrlist, &ia->ia_ifa, ifa_list); @@ -1149,7 +1157,7 @@ /* * join node information group address */ -#define hostnamelen strlen(hostname) +#define hostnamelen strlen(V_hostname) delay = 0; if ((flags & 
IN6_IFAUPDATE_DADDELAY)) { /* @@ -1159,7 +1167,7 @@ delay = arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz); } - if (in6_nigroup(ifp, hostname, hostnamelen, &mltaddr.sin6_addr) + if (in6_nigroup(ifp, V_hostname, hostnamelen, &mltaddr.sin6_addr) == 0) { imm = in6_joingroup(ifp, &mltaddr.sin6_addr, &error, delay); /* XXX jinmei */ @@ -1319,14 +1327,15 @@ static void in6_unlink_ifa(struct in6_ifaddr *ia, struct ifnet *ifp) { + INIT_VNET_INET6(ifp->if_vnet); struct in6_ifaddr *oia; int s = splnet(); TAILQ_REMOVE(&ifp->if_addrlist, &ia->ia_ifa, ifa_list); oia = ia; - if (oia == (ia = in6_ifaddr)) - in6_ifaddr = ia->ia_next; + if (oia == (ia = V_in6_ifaddr)) + V_in6_ifaddr = ia->ia_next; else { while (ia->ia_next && (ia->ia_next != oia)) ia = ia->ia_next; @@ -1885,12 +1894,13 @@ int in6_localaddr(struct in6_addr *in6) { + INIT_VNET_INET6(curvnet); struct in6_ifaddr *ia; if (IN6_IS_ADDR_LOOPBACK(in6) || IN6_IS_ADDR_LINKLOCAL(in6)) return 1; - for (ia = in6_ifaddr; ia; ia = ia->ia_next) { + for (ia = V_in6_ifaddr; ia; ia = ia->ia_next) { if (IN6_ARE_MASKED_ADDR_EQUAL(in6, &ia->ia_addr.sin6_addr, &ia->ia_prefixmask.sin6_addr)) { return 1; @@ -1903,9 +1913,10 @@ int in6_is_addr_deprecated(struct sockaddr_in6 *sa6) { + INIT_VNET_INET6(curvnet); struct in6_ifaddr *ia; - for (ia = in6_ifaddr; ia; ia = ia->ia_next) { + for (ia = V_in6_ifaddr; ia; ia = ia->ia_next) { if (IN6_ARE_ADDR_EQUAL(&ia->ia_addr.sin6_addr, &sa6->sin6_addr) && (ia->ia6_flags & IN6_IFF_DEPRECATED) != 0) @@ -1995,6 +2006,7 @@ struct in6_ifaddr * in6_ifawithifp(struct ifnet *ifp, struct in6_addr *dst) { + INIT_VNET_INET6(curvnet); int dst_scope = in6_addrscope(dst), blen = -1, tlen; struct ifaddr *ifa; struct in6_ifaddr *besta = 0; @@ -2018,7 +2030,7 @@ if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DETACHED) continue; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DEPRECATED) { - if (ip6_use_deprecated) + if (V_ip6_use_deprecated) dep[0] = (struct in6_ifaddr *)ifa; continue; } @@ -2052,7 +2064,7 @@ 
if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DETACHED) continue; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DEPRECATED) { - if (ip6_use_deprecated) + if (V_ip6_use_deprecated) dep[1] = (struct in6_ifaddr *)ifa; continue; } @@ -2143,11 +2155,14 @@ void in6_setmaxmtu(void) { + INIT_VNET_NET(curvnet); + INIT_VNET_INET6(curvnet); unsigned long maxmtu = 0; struct ifnet *ifp; IFNET_RLOCK(); - for (ifp = TAILQ_FIRST(&ifnet); ifp; ifp = TAILQ_NEXT(ifp, if_list)) { + for (ifp = TAILQ_FIRST(&V_ifnet); ifp; + ifp = TAILQ_NEXT(ifp, if_list)) { /* this function can be called during ifnet initialization */ if (!ifp->if_afdata[AF_INET6]) continue; @@ -2157,7 +2172,7 @@ } IFNET_RUNLOCK(); if (maxmtu) /* update only when maxmtu is positive */ - in6_maxmtu = maxmtu; + V_in6_maxmtu = maxmtu; } /* --- /u/marko/p4/head/src/sys/netinet6/in6_gif.c 2007-12-27 19:33:01.000000000 +0100 +++ src/sys/netinet6/in6_gif.c 2008-01-14 19:23:57.000000000 +0100 @@ -34,6 +34,7 @@ #include "opt_inet.h" #include "opt_inet6.h" +#include "opt_vimage.h" #include #include @@ -44,8 +45,8 @@ #include #include #include - #include +#include #include #include @@ -57,6 +58,7 @@ #endif #include #ifdef INET6 +#include #include #include #include @@ -74,12 +76,15 @@ struct ifnet *); extern struct domain inet6domain; -struct ip6protosw in6_gif_protosw = -{ SOCK_RAW, &inet6domain, 0/* IPPROTO_IPV[46] */, PR_ATOMIC|PR_ADDR, - in6_gif_input, rip6_output, 0, rip6_ctloutput, - 0, - 0, 0, 0, 0, - &rip6_usrreqs +struct ip6protosw in6_gif_protosw = { + .pr_type = SOCK_RAW, + .pr_domain = &inet6domain, + .pr_protocol = 0/* IPPROTO_IPV[46] */, + .pr_flags = PR_ATOMIC|PR_ADDR, + .pr_input = in6_gif_input, + .pr_output = rip6_output, + .pr_ctloutput = rip6_ctloutput, + .pr_usrreqs = &rip6_usrreqs }; int @@ -87,6 +92,7 @@ int family, /* family of the packet to be encapsulate */ struct mbuf *m) { + INIT_VNET_GIF(ifp->if_vnet); struct gif_softc *sc = ifp->if_softc; struct sockaddr_in6 *dst = (struct sockaddr_in6 
*)&sc->gif_ro6.ro_dst; struct sockaddr_in6 *sin6_src = (struct sockaddr_in6 *)sc->gif_psrc; @@ -175,7 +181,7 @@ ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_plen = htons((u_short)m->m_pkthdr.len); ip6->ip6_nxt = proto; - ip6->ip6_hlim = ip6_gif_hlim; + ip6->ip6_hlim = V_ip6_gif_hlim; ip6->ip6_src = sin6_src->sin6_addr; /* bidirectional configured tunnel mode */ if (!IN6_IS_ADDR_UNSPECIFIED(&sin6_dst->sin6_addr)) @@ -246,6 +252,7 @@ int in6_gif_input(struct mbuf **mp, int *offp, int proto) { + INIT_VNET_INET6(curvnet); struct mbuf *m = *mp; struct ifnet *gifp = NULL; struct gif_softc *sc; @@ -258,14 +265,14 @@ sc = (struct gif_softc *)encap_getarg(m); if (sc == NULL) { m_freem(m); - ip6stat.ip6s_nogif++; + V_ip6stat.ip6s_nogif++; return IPPROTO_DONE; } gifp = GIF2IFP(sc); if (gifp == NULL || (gifp->if_flags & IFF_UP) == 0) { m_freem(m); - ip6stat.ip6s_nogif++; + V_ip6stat.ip6s_nogif++; return IPPROTO_DONE; } @@ -320,7 +327,7 @@ break; default: - ip6stat.ip6s_nogif++; + V_ip6stat.ip6s_nogif++; m_freem(m); return IPPROTO_DONE; } --- /u/marko/p4/head/src/sys/netinet6/in6_ifattach.c 2008-01-15 18:01:18.000000000 +0100 +++ src/sys/netinet6/in6_ifattach.c 2008-02-27 11:49:36.000000000 +0100 @@ -32,6 +32,8 @@ #include __FBSDID("$FreeBSD: src/sys/netinet6/in6_ifattach.c,v 1.41 2008/01/08 19:08:57 obrien Exp $"); +#include "opt_vimage.h" + #include #include #include @@ -40,18 +42,22 @@ #include #include #include +#include +#include #include #include #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -60,18 +66,13 @@ #include #include -unsigned long in6_maxmtu = 0; - -#ifdef IP6_AUTO_LINKLOCAL -int ip6_auto_linklocal = IP6_AUTO_LINKLOCAL; -#else -int ip6_auto_linklocal = 1; /* enable by default */ -#endif - +#ifndef VIMAGE +unsigned long in6_maxmtu; +int ip6_auto_linklocal; struct callout in6_tmpaddrtimer_ch; - -extern struct inpcbinfo udbinfo; extern struct inpcbinfo ripcbinfo; +extern struct inpcbinfo udbinfo; +#endif 
static int get_rand_ifid(struct ifnet *, struct in6_addr *); static int generate_tmp_ifid(u_int8_t *, const u_int8_t *, u_int8_t *); @@ -103,9 +104,10 @@ static int get_rand_ifid(struct ifnet *ifp, struct in6_addr *in6) { + INIT_VPROCG(TD_TO_VPROCG(curthread)); /* XXX V_hostname needs this */ MD5_CTX ctxt; u_int8_t digest[16]; - int hostnamelen = strlen(hostname); + int hostnamelen = strlen(V_hostname); #if 0 /* we need at least several letters as seed for ifid */ @@ -116,7 +118,7 @@ /* generate 8 bytes of pseudo-random value. */ bzero(&ctxt, sizeof(ctxt)); MD5Init(&ctxt); - MD5Update(&ctxt, hostname, hostnamelen); + MD5Update(&ctxt, V_hostname, hostnamelen); MD5Final(digest, &ctxt); /* assumes sizeof(digest) > sizeof(ifid) */ @@ -135,6 +137,7 @@ static int generate_tmp_ifid(u_int8_t *seed0, const u_int8_t *seed1, u_int8_t *ret) { + INIT_VNET_INET6(curvnet); MD5_CTX ctxt; u_int8_t seed[16], digest[16], nullbuf[8]; u_int32_t val32; @@ -354,6 +357,8 @@ get_ifid(struct ifnet *ifp0, struct ifnet *altifp, struct in6_addr *in6) { + INIT_VNET_NET(ifp0->if_vnet); + INIT_VNET_INET6(ifp0->if_vnet); struct ifnet *ifp; /* first, try to get it from the interface itself */ @@ -372,7 +377,7 @@ /* next, try to get it from some other hardware interface */ IFNET_RLOCK(); - for (ifp = ifnet.tqh_first; ifp; ifp = ifp->if_list.tqe_next) { + for (ifp = V_ifnet.tqh_first; ifp; ifp = ifp->if_list.tqe_next) { if (ifp == ifp0) continue; if (in6_get_hw_ifid(ifp, in6) != 0) @@ -417,6 +422,7 @@ static int in6_ifattach_linklocal(struct ifnet *ifp, struct ifnet *altifp) { + INIT_VNET_INET6(curvnet); struct in6_ifaddr *ia; struct in6_aliasreq ifra; struct nd_prefixctl pr0; @@ -533,6 +539,7 @@ static int in6_ifattach_loopback(struct ifnet *ifp) { + INIT_VNET_INET6(curvnet); struct in6_aliasreq ifra; int error; @@ -644,6 +651,7 @@ void in6_ifattach(struct ifnet *ifp, struct ifnet *altifp) { + INIT_VNET_INET6(ifp->if_vnet); struct in6_ifaddr *ia; struct in6_addr in6; @@ -698,7 +706,7 @@ /* * assign 
a link-local address, if there's none. */ - if (ip6_auto_linklocal && ifp->if_type != IFT_BRIDGE) { + if (V_ip6_auto_linklocal && ifp->if_type != IFT_BRIDGE) { ia = in6ifa_ifpforlinklocal(ifp, 0); if (ia == NULL) { if (in6_ifattach_linklocal(ifp, altifp) == 0) { @@ -714,8 +722,8 @@ #endif /* update dynamically. */ - if (in6_maxmtu < ifp->if_mtu) - in6_maxmtu = ifp->if_mtu; + if (V_in6_maxmtu < ifp->if_mtu) + V_in6_maxmtu = ifp->if_mtu; } /* @@ -726,6 +734,9 @@ void in6_ifdetach(struct ifnet *ifp) { + INIT_VNET_NET(ifp->if_vnet); + INIT_VNET_INET(ifp->if_vnet); + INIT_VNET_INET6(ifp->if_vnet); struct in6_ifaddr *ia, *oia; struct ifaddr *ifa, *next; struct rtentry *rt; @@ -780,8 +791,8 @@ /* also remove from the IPv6 address chain(itojun&jinmei) */ oia = ia; - if (oia == (ia = in6_ifaddr)) - in6_ifaddr = ia->ia_next; + if (oia == (ia = V_in6_ifaddr)) + V_in6_ifaddr = ia->ia_next; else { while (ia->ia_next && (ia->ia_next != oia)) ia = ia->ia_next; @@ -797,8 +808,8 @@ IFAFREE(&oia->ia_ifa); } - in6_pcbpurgeif0(&udbinfo, ifp); - in6_pcbpurgeif0(&ripcbinfo, ifp); + in6_pcbpurgeif0(&V_udbinfo, ifp); + in6_pcbpurgeif0(&V_ripcbinfo, ifp); /* leave from all multicast groups joined */ in6_purgemaddrs(ifp); @@ -821,15 +832,15 @@ /* XXX: should not fail */ return; /* XXX grab lock first to avoid LOR */ - if (rt_tables[AF_INET6] != NULL) { - RADIX_NODE_HEAD_LOCK(rt_tables[AF_INET6]); + if (V_rt_tables[AF_INET6] != NULL) { + RADIX_NODE_HEAD_LOCK(V_rt_tables[AF_INET6]); rt = rtalloc1((struct sockaddr *)&sin6, 0, 0UL); if (rt) { if (rt->rt_ifp == ifp) rtexpunge(rt); RTFREE_LOCKED(rt); } - RADIX_NODE_HEAD_UNLOCK(rt_tables[AF_INET6]); + RADIX_NODE_HEAD_UNLOCK(V_rt_tables[AF_INET6]); } } @@ -859,19 +870,22 @@ } void -in6_tmpaddrtimer(void *ignored_arg) +in6_tmpaddrtimer(void *arg) { + CURVNET_SET((struct vnet *) arg); + INIT_VNET_NET(curvnet); + INIT_VNET_INET6(curvnet); struct nd_ifinfo *ndi; u_int8_t nullbuf[8]; struct ifnet *ifp; - int s = splnet(); - 
callout_reset(&in6_tmpaddrtimer_ch, - (ip6_temp_preferred_lifetime - ip6_desync_factor - - ip6_temp_regen_advance) * hz, in6_tmpaddrtimer, NULL); + callout_reset(&V_in6_tmpaddrtimer_ch, + (V_ip6_temp_preferred_lifetime - V_ip6_desync_factor - + V_ip6_temp_regen_advance) * hz, in6_tmpaddrtimer, arg); bzero(nullbuf, sizeof(nullbuf)); - for (ifp = TAILQ_FIRST(&ifnet); ifp; ifp = TAILQ_NEXT(ifp, if_list)) { + for (ifp = TAILQ_FIRST(&V_ifnet); ifp; + ifp = TAILQ_NEXT(ifp, if_list)) { ndi = ND_IFINFO(ifp); if (bcmp(ndi->randomid, nullbuf, sizeof(nullbuf)) != 0) { /* @@ -883,7 +897,7 @@ } } - splx(s); + CURVNET_RESTORE(); } static void --- /u/marko/p4/head/src/sys/netinet6/in6_pcb.c 2008-01-15 18:01:21.000000000 +0100 +++ src/sys/netinet6/in6_pcb.c 2008-02-27 11:49:37.000000000 +0100 @@ -67,6 +67,7 @@ #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" +#include "opt_vimage.h" #include #include @@ -82,19 +83,24 @@ #include #include #include +#include #include +#include #include #include #include +#include #include #include #include #include #include #include + +#include #include #include #include @@ -115,6 +121,8 @@ in6_pcbbind(register struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) { + INIT_VNET_INET6(inp->inp_vnet); + INIT_VNET_INET(inp->inp_vnet); struct socket *so = inp->inp_socket; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)NULL; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; @@ -124,7 +132,7 @@ INP_INFO_WLOCK_ASSERT(pcbinfo); INP_LOCK_ASSERT(inp); - if (!in6_ifaddr) /* XXX broken! */ + if (!V_in6_ifaddr) /* XXX broken! 
*/ return (EADDRNOTAVAIL); if (inp->inp_lport || !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) return (EINVAL); @@ -142,7 +150,7 @@ if (nam->sa_family != AF_INET6) return (EAFNOSUPPORT); - if ((error = sa6_embedscope(sin6, ip6_use_defzone)) != 0) + if ((error = sa6_embedscope(sin6, V_ip6_use_defzone)) != 0) return(error); lport = sin6->sin6_port; @@ -179,8 +187,8 @@ struct inpcb *t; /* GROSS */ - if (ntohs(lport) <= ipport_reservedhigh && - ntohs(lport) >= ipport_reservedlow && + if (ntohs(lport) <= V_ipport_reservedhigh && + ntohs(lport) >= V_ipport_reservedlow && priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0)) return (EACCES); @@ -282,6 +290,7 @@ in6_pcbladdr(register struct inpcb *inp, struct sockaddr *nam, struct in6_addr **plocal_addr6) { + INIT_VNET_INET6(inp->inp_vnet); register struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam; int error = 0; struct ifnet *ifp = NULL; @@ -297,12 +306,12 @@ if (sin6->sin6_port == 0) return (EADDRNOTAVAIL); - if (sin6->sin6_scope_id == 0 && !ip6_use_defzone) + if (sin6->sin6_scope_id == 0 && !V_ip6_use_defzone) scope_ambiguous = 1; - if ((error = sa6_embedscope(sin6, ip6_use_defzone)) != 0) + if ((error = sa6_embedscope(sin6, V_ip6_use_defzone)) != 0) return(error); - if (in6_ifaddr) { + if (V_in6_ifaddr) { /* * If the destination address is UNSPECIFIED addr, * use the loopback addr, e.g ::1. 
--- /u/marko/p4/head/src/sys/netinet6/in6_proto.c 2008-01-15 18:01:23.000000000 +0100 +++ src/sys/netinet6/in6_proto.c 2008-02-27 11:49:39.000000000 +0100 @@ -69,6 +69,7 @@ #include "opt_ipstealth.h" #include "opt_carp.h" #include "opt_sctp.h" +#include "opt_vimage.h" #include #include @@ -79,10 +80,12 @@ #include #include #include +#include #include #include #include +#include #include #include @@ -91,6 +94,7 @@ #include #include #include +#include #include #include @@ -140,6 +144,9 @@ .pr_domain = &inet6domain, .pr_protocol = IPPROTO_IPV6, .pr_init = ip6_init, +#ifdef VIMAGE + .pr_destroy = ip6_destroy, +#endif .pr_slowtimo = frag6_slowtimo, .pr_drain = frag6_drain, .pr_usrreqs = &nousrreqs, @@ -340,21 +347,9 @@ }; extern int in6_inithead(void **, int); - -struct domain inet6domain = { - .dom_family = AF_INET6, - .dom_name = "internet6", - .dom_protosw = (struct protosw *)inet6sw, - .dom_protoswNPROTOSW = (struct protosw *) - &inet6sw[sizeof(inet6sw)/sizeof(inet6sw[0])], - .dom_rtattach = in6_inithead, - .dom_rtoffset = offsetof(struct sockaddr_in6, sin6_addr) << 3, - .dom_maxrtkey = sizeof(struct sockaddr_in6), - .dom_ifattach = in6_domifattach, - .dom_ifdetach = in6_domifdetach -}; - -DOMAIN_SET(inet6); +#ifdef VIMAGE +extern int in6_detachhead(void **, int); +#endif /* * Internet configuration info @@ -371,29 +366,31 @@ #define IPV6_SENDREDIRECTS 1 #endif -int ip6_forwarding = IPV6FORWARDING; /* act as router? */ -int ip6_sendredirects = IPV6_SENDREDIRECTS; -int ip6_defhlim = IPV6_DEFHLIM; -int ip6_defmcasthlim = IPV6_DEFAULT_MULTICAST_HOPS; -int ip6_accept_rtadv = 0; /* "IPV6FORWARDING ? 0 : 1" is dangerous */ +#ifndef VIMAGE +int ip6_forwarding; /* act as router? */ +int ip6_sendredirects; +int ip6_defhlim; +int ip6_defmcasthlim; +int ip6_accept_rtadv; /* "IPV6FORWARDING ? 
0 : 1" is dangerous */ int ip6_maxfragpackets; /* initialized in frag6.c:frag6_init() */ -int ip6_maxfrags; /* initialized in frag6.c:frag6_init() */ -int ip6_log_interval = 5; -int ip6_hdrnestlimit = 15; /* How many header options will we process? */ -int ip6_dad_count = 1; /* DupAddrDetectionTransmits */ -int ip6_auto_flowlabel = 1; -int ip6_gif_hlim = 0; -int ip6_use_deprecated = 1; /* allow deprecated addr (RFC2462 5.5.4) */ -int ip6_rr_prune = 5; /* router renumbering prefix +int ip6_maxfrags; /* initialized in frag6.c:frag6_init() */ +int ip6_log_interval; +int ip6_hdrnestlimit; /* How many header options will we process? */ +int ip6_dad_count; /* DupAddrDetectionTransmits */ +int ip6_auto_flowlabel; +int ip6_gif_hlim = 0; +int ip6_use_deprecated; /* allow deprecated addr (RFC2462 5.5.4) */ +int ip6_rr_prune; /* router renumbering prefix * walk list every 5 sec. */ -int ip6_mcast_pmtu = 0; /* enable pMTU discovery for multicast? */ -int ip6_v6only = 1; +int ip6_mcast_pmtu; /* enable pMTU discovery for multicast? */ +int ip6_v6only; -int ip6_keepfaith = 0; -time_t ip6_log_time = (time_t)0L; +int ip6_keepfaith; +time_t ip6_log_time; #ifdef IPSTEALTH -int ip6stealth = 0; +int ip6stealth; #endif +#endif /* !VIMAGE */ /* icmp6 */ /* @@ -401,8 +398,10 @@ * XXX: what if we don't define INET? Should we define pmtu6_expire * or so? 
(jinmei@kame.net 19990310) */ -int pmtu_expire = 60*10; -int pmtu_probe = 60*2; +#ifndef VIMAGE +int pmtu_expire; +int pmtu_probe; +#endif /* raw IP6 parameters */ /* @@ -411,20 +410,21 @@ #define RIPV6SNDQ 8192 #define RIPV6RCVQ 8192 -u_long rip6_sendspace = RIPV6SNDQ; -u_long rip6_recvspace = RIPV6RCVQ; +#ifndef VIMAGE +u_long rip6_sendspace; +u_long rip6_recvspace; /* ICMPV6 parameters */ -int icmp6_rediraccept = 1; /* accept and process redirects */ -int icmp6_redirtimeout = 10 * 60; /* 10 minutes */ -int icmp6errppslim = 100; /* 100pps */ +int icmp6_rediraccept; /* accept and process redirects */ +int icmp6_redirtimeout; +int icmp6errppslim; /* control how to respond to NI queries */ -int icmp6_nodeinfo = (ICMP6_NODEINFO_FQDNOK|ICMP6_NODEINFO_NODEADDROK); +int icmp6_nodeinfo; /* UDP on IP6 parameters */ -int udp6_sendspace = 9216; /* really max datagram size */ -int udp6_recvspace = 40 * (1024 + sizeof(struct sockaddr_in6)); - /* 40 1K datagrams */ +int udp6_sendspace; /* really max datagram size */ +int udp6_recvspace; +#endif /* !VIMAGE */ /* * sysctl related items. 
@@ -446,124 +446,201 @@ /* net.inet6.ip6 */ static int +#ifdef VIMAGE +sysctl_ip6_temppltime(SYSCTL_HANDLER_V_ARGS) +#else sysctl_ip6_temppltime(SYSCTL_HANDLER_ARGS) +#endif { + INIT_VNET_INET6(curvnet); +#ifdef VIMAGE + SYSCTL_RESOLVE_V_ARG1(); +#endif int error = 0; int old; error = SYSCTL_OUT(req, arg1, sizeof(int)); if (error || !req->newptr) return (error); - old = ip6_temp_preferred_lifetime; + old = V_ip6_temp_preferred_lifetime; error = SYSCTL_IN(req, arg1, sizeof(int)); - if (ip6_temp_preferred_lifetime < - ip6_desync_factor + ip6_temp_regen_advance) { - ip6_temp_preferred_lifetime = old; + if (V_ip6_temp_preferred_lifetime < + V_ip6_desync_factor + V_ip6_temp_regen_advance) { + V_ip6_temp_preferred_lifetime = old; return (EINVAL); } return (error); } static int +#ifdef VIMAGE +sysctl_ip6_tempvltime(SYSCTL_HANDLER_V_ARGS) +#else sysctl_ip6_tempvltime(SYSCTL_HANDLER_ARGS) +#endif { + INIT_VNET_INET6(curvnet); +#ifdef VIMAGE + SYSCTL_RESOLVE_V_ARG1(); +#endif int error = 0; int old; error = SYSCTL_OUT(req, arg1, sizeof(int)); if (error || !req->newptr) return (error); - old = ip6_temp_valid_lifetime; + old = V_ip6_temp_valid_lifetime; error = SYSCTL_IN(req, arg1, sizeof(int)); - if (ip6_temp_valid_lifetime < ip6_temp_preferred_lifetime) { - ip6_temp_preferred_lifetime = old; + if (V_ip6_temp_valid_lifetime < V_ip6_temp_preferred_lifetime) { + V_ip6_temp_preferred_lifetime = old; return (EINVAL); } return (error); } -SYSCTL_INT(_net_inet6_ip6, IPV6CTL_FORWARDING, - forwarding, CTLFLAG_RW, &ip6_forwarding, 0, ""); -SYSCTL_INT(_net_inet6_ip6, IPV6CTL_SENDREDIRECTS, - redirect, CTLFLAG_RW, &ip6_sendredirects, 0, ""); -SYSCTL_INT(_net_inet6_ip6, IPV6CTL_DEFHLIM, - hlim, CTLFLAG_RW, &ip6_defhlim, 0, ""); -SYSCTL_STRUCT(_net_inet6_ip6, IPV6CTL_STATS, stats, CTLFLAG_RD, - &ip6stat, ip6stat, ""); -SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGPACKETS, - maxfragpackets, CTLFLAG_RW, &ip6_maxfragpackets, 0, ""); -SYSCTL_INT(_net_inet6_ip6, IPV6CTL_ACCEPT_RTADV, - 
accept_rtadv, CTLFLAG_RW, &ip6_accept_rtadv, 0, ""); -SYSCTL_INT(_net_inet6_ip6, IPV6CTL_KEEPFAITH, - keepfaith, CTLFLAG_RW, &ip6_keepfaith, 0, ""); -SYSCTL_INT(_net_inet6_ip6, IPV6CTL_LOG_INTERVAL, - log_interval, CTLFLAG_RW, &ip6_log_interval, 0, ""); -SYSCTL_INT(_net_inet6_ip6, IPV6CTL_HDRNESTLIMIT, - hdrnestlimit, CTLFLAG_RW, &ip6_hdrnestlimit, 0, ""); -SYSCTL_INT(_net_inet6_ip6, IPV6CTL_DAD_COUNT, - dad_count, CTLFLAG_RW, &ip6_dad_count, 0, ""); -SYSCTL_INT(_net_inet6_ip6, IPV6CTL_AUTO_FLOWLABEL, - auto_flowlabel, CTLFLAG_RW, &ip6_auto_flowlabel, 0, ""); -SYSCTL_INT(_net_inet6_ip6, IPV6CTL_DEFMCASTHLIM, - defmcasthlim, CTLFLAG_RW, &ip6_defmcasthlim, 0, ""); -SYSCTL_INT(_net_inet6_ip6, IPV6CTL_GIF_HLIM, - gifhlim, CTLFLAG_RW, &ip6_gif_hlim, 0, ""); +SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_FORWARDING, + forwarding, CTLFLAG_RW, ip6_forwarding, 0, ""); +SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_SENDREDIRECTS, + redirect, CTLFLAG_RW, ip6_sendredirects, 0, ""); +SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_DEFHLIM, + hlim, CTLFLAG_RW, ip6_defhlim, 0, ""); +SYSCTL_V_STRUCT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_STATS, stats, + CTLFLAG_RD, ip6stat, ip6stat, ""); +SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_MAXFRAGPACKETS, + maxfragpackets, CTLFLAG_RW, ip6_maxfragpackets, 0, ""); +SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_ACCEPT_RTADV, + accept_rtadv, CTLFLAG_RW, ip6_accept_rtadv, 0, ""); +SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_KEEPFAITH, + keepfaith, CTLFLAG_RW, ip6_keepfaith, 0, ""); +SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_LOG_INTERVAL, + log_interval, CTLFLAG_RW, ip6_log_interval, 0, ""); +SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_HDRNESTLIMIT, + hdrnestlimit, CTLFLAG_RW, ip6_hdrnestlimit, 0, ""); +SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_DAD_COUNT, + dad_count, CTLFLAG_RW, ip6_dad_count, 0, ""); +SYSCTL_V_INT(V_NET, vnet_inet6, 
_net_inet6_ip6, IPV6CTL_AUTO_FLOWLABEL, + auto_flowlabel, CTLFLAG_RW, ip6_auto_flowlabel, 0, ""); +SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_DEFMCASTHLIM, + defmcasthlim, CTLFLAG_RW, ip6_defmcasthlim, 0, ""); SYSCTL_STRING(_net_inet6_ip6, IPV6CTL_KAME_VERSION, kame_version, CTLFLAG_RD, __KAME_VERSION, 0, ""); -SYSCTL_INT(_net_inet6_ip6, IPV6CTL_USE_DEPRECATED, - use_deprecated, CTLFLAG_RW, &ip6_use_deprecated, 0, ""); -SYSCTL_INT(_net_inet6_ip6, IPV6CTL_RR_PRUNE, - rr_prune, CTLFLAG_RW, &ip6_rr_prune, 0, ""); -SYSCTL_INT(_net_inet6_ip6, IPV6CTL_USETEMPADDR, - use_tempaddr, CTLFLAG_RW, &ip6_use_tempaddr, 0, ""); -SYSCTL_OID(_net_inet6_ip6, IPV6CTL_TEMPPLTIME, temppltime, - CTLTYPE_INT|CTLFLAG_RW, &ip6_temp_preferred_lifetime, 0, - sysctl_ip6_temppltime, "I", ""); -SYSCTL_OID(_net_inet6_ip6, IPV6CTL_TEMPVLTIME, tempvltime, - CTLTYPE_INT|CTLFLAG_RW, &ip6_temp_valid_lifetime, 0, - sysctl_ip6_tempvltime, "I", ""); -SYSCTL_INT(_net_inet6_ip6, IPV6CTL_V6ONLY, - v6only, CTLFLAG_RW, &ip6_v6only, 0, ""); +SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_USE_DEPRECATED, + use_deprecated, CTLFLAG_RW, ip6_use_deprecated, 0, ""); +SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_RR_PRUNE, + rr_prune, CTLFLAG_RW, ip6_rr_prune, 0, ""); +SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_USETEMPADDR, + use_tempaddr, CTLFLAG_RW, ip6_use_tempaddr, 0, ""); +SYSCTL_V_OID(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_TEMPPLTIME, temppltime, + CTLTYPE_INT|CTLFLAG_RW, ip6_temp_preferred_lifetime, 0, + sysctl_ip6_temppltime, "I", ""); +SYSCTL_V_OID(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_TEMPVLTIME, tempvltime, + CTLTYPE_INT|CTLFLAG_RW, ip6_temp_valid_lifetime, 0, + sysctl_ip6_tempvltime, "I", ""); +SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_V6ONLY, + v6only, CTLFLAG_RW, ip6_v6only, 0, ""); +#ifndef VIMAGE TUNABLE_INT("net.inet6.ip6.auto_linklocal", &ip6_auto_linklocal); -SYSCTL_INT(_net_inet6_ip6, IPV6CTL_AUTO_LINKLOCAL, - auto_linklocal, 
CTLFLAG_RW, &ip6_auto_linklocal, 0, ""); -SYSCTL_STRUCT(_net_inet6_ip6, IPV6CTL_RIP6STATS, rip6stats, CTLFLAG_RD, - &rip6stat, rip6stat, ""); -SYSCTL_INT(_net_inet6_ip6, IPV6CTL_PREFER_TEMPADDR, - prefer_tempaddr, CTLFLAG_RW, &ip6_prefer_tempaddr, 0, ""); -SYSCTL_INT(_net_inet6_ip6, IPV6CTL_USE_DEFAULTZONE, - use_defaultzone, CTLFLAG_RW, &ip6_use_defzone, 0,""); -SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGS, - maxfrags, CTLFLAG_RW, &ip6_maxfrags, 0, ""); -SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MCAST_PMTU, - mcast_pmtu, CTLFLAG_RW, &ip6_mcast_pmtu, 0, ""); +#endif +SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_AUTO_LINKLOCAL, + auto_linklocal, CTLFLAG_RW, ip6_auto_linklocal, 0, ""); +SYSCTL_V_STRUCT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_RIP6STATS, + rip6stats, CTLFLAG_RD, rip6stat, rip6stat, ""); +SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_PREFER_TEMPADDR, + prefer_tempaddr, CTLFLAG_RW, ip6_prefer_tempaddr, 0, ""); +SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_USE_DEFAULTZONE, + use_defaultzone, CTLFLAG_RW, ip6_use_defzone, 0,""); +SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_MAXFRAGS, + maxfrags, CTLFLAG_RW, ip6_maxfrags, 0, ""); +SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_MCAST_PMTU, + mcast_pmtu, CTLFLAG_RW, ip6_mcast_pmtu, 0, ""); #ifdef IPSTEALTH -SYSCTL_INT(_net_inet6_ip6, IPV6CTL_STEALTH, stealth, CTLFLAG_RW, - &ip6stealth, 0, ""); +SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_STEALTH, stealth, CTLFLAG_RW, +ip6stealth, 0, ""); #endif /* net.inet6.icmp6 */ -SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_REDIRACCEPT, - rediraccept, CTLFLAG_RW, &icmp6_rediraccept, 0, ""); -SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_REDIRTIMEOUT, - redirtimeout, CTLFLAG_RW, &icmp6_redirtimeout, 0, ""); -SYSCTL_STRUCT(_net_inet6_icmp6, ICMPV6CTL_STATS, stats, CTLFLAG_RD, - &icmp6stat, icmp6stat, ""); -SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_PRUNE, - nd6_prune, CTLFLAG_RW, &nd6_prune, 0, ""); -SYSCTL_INT(_net_inet6_icmp6, 
ICMPV6CTL_ND6_DELAY, - nd6_delay, CTLFLAG_RW, &nd6_delay, 0, ""); -SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_UMAXTRIES, - nd6_umaxtries, CTLFLAG_RW, &nd6_umaxtries, 0, ""); -SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_MMAXTRIES, - nd6_mmaxtries, CTLFLAG_RW, &nd6_mmaxtries, 0, ""); -SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_USELOOPBACK, - nd6_useloopback, CTLFLAG_RW, &nd6_useloopback, 0, ""); -SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_NODEINFO, - nodeinfo, CTLFLAG_RW, &icmp6_nodeinfo, 0, ""); -SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ERRPPSLIMIT, - errppslimit, CTLFLAG_RW, &icmp6errppslim, 0, ""); -SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_MAXNUDHINT, - nd6_maxnudhint, CTLFLAG_RW, &nd6_maxnudhint, 0, ""); -SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_DEBUG, - nd6_debug, CTLFLAG_RW, &nd6_debug, 0, ""); +SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_REDIRACCEPT, + rediraccept, CTLFLAG_RW, icmp6_rediraccept, 0, ""); +SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_REDIRTIMEOUT, + redirtimeout, CTLFLAG_RW, icmp6_redirtimeout, 0, ""); +SYSCTL_V_STRUCT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_STATS, + stats, CTLFLAG_RD, icmp6stat, icmp6stat, ""); +SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_ND6_PRUNE, + nd6_prune, CTLFLAG_RW, nd6_prune, 0, ""); +SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_ND6_DELAY, + nd6_delay, CTLFLAG_RW, nd6_delay, 0, ""); +SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_ND6_UMAXTRIES, + nd6_umaxtries, CTLFLAG_RW, nd6_umaxtries, 0, ""); +SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_ND6_MMAXTRIES, + nd6_mmaxtries, CTLFLAG_RW, nd6_mmaxtries, 0, ""); +SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_ND6_USELOOPBACK, + nd6_useloopback, CTLFLAG_RW, nd6_useloopback, 0, ""); +SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_NODEINFO, + nodeinfo, CTLFLAG_RW, icmp6_nodeinfo, 0, ""); +SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_ERRPPSLIMIT, 
+ errppslimit, CTLFLAG_RW, icmp6errppslim, 0, ""); +SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_ND6_MAXNUDHINT, + nd6_maxnudhint, CTLFLAG_RW, nd6_maxnudhint, 0, ""); +SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_ND6_DEBUG, + nd6_debug, CTLFLAG_RW, nd6_debug, 0, ""); + +static void +ip6_dom_init(void) +{ + INIT_VNET_INET6(curvnet); + + V_ip6_forwarding = IPV6FORWARDING; + V_ip6_sendredirects = IPV6_SENDREDIRECTS; + V_ip6_defhlim = IPV6_DEFHLIM; + V_ip6_defmcasthlim = IPV6_DEFAULT_MULTICAST_HOPS; + V_ip6_accept_rtadv = 0; + V_ip6_log_interval = 5; + V_ip6_hdrnestlimit = 15; + V_ip6_dad_count = 1; + V_ip6_auto_flowlabel = 1; + V_ip6_use_deprecated = 1; + V_ip6_rr_prune = 5; + V_ip6_mcast_pmtu = 0; + V_ip6_v6only = 1; + V_ip6_keepfaith = 0; + V_ip6_log_time = (time_t)0L; +#ifdef IPSTEALTH + V_ip6stealth = 0; +#endif + V_pmtu_expire = 60*10; + V_pmtu_probe = 60*2; + V_rip6_sendspace = RIPV6SNDQ; + V_rip6_recvspace = RIPV6RCVQ; + + /* ICMPV6 parameters */ + V_icmp6_rediraccept = 1; + V_icmp6_redirtimeout = 10 * 60; /* 10 minutes */ + V_icmp6errppslim = 100; /* 100pps */ + /* control how to respond to NI queries */ + V_icmp6_nodeinfo = (ICMP6_NODEINFO_FQDNOK|ICMP6_NODEINFO_NODEADDROK); + + /* UDP on IP6 parameters */ + V_udp6_sendspace = 9216; /* really max datagram size */ + V_udp6_recvspace = 40 * (1024 + sizeof(struct sockaddr_in6)); + /* 40 1K datagrams */ + +} + +struct domain inet6domain = { + .dom_family = AF_INET6, + .dom_name = "internet6", + .dom_protosw = (struct protosw *)inet6sw, + .dom_protoswNPROTOSW = (struct protosw *) + &inet6sw[sizeof(inet6sw)/sizeof(inet6sw[0])], + .dom_rtattach = in6_inithead, +#ifdef VIMAGE + .dom_rtdetach = in6_detachhead, +#endif + .dom_rtoffset = offsetof(struct sockaddr_in6, sin6_addr) << 3, + .dom_maxrtkey = sizeof(struct sockaddr_in6), + .dom_ifattach = in6_domifattach, + .dom_ifdetach = in6_domifdetach, + .dom_init = ip6_dom_init +}; + +DOMAIN_SET(inet6); --- 
/u/marko/p4/head/src/sys/netinet6/in6_rmx.c 2008-01-15 18:01:23.000000000 +0100 +++ src/sys/netinet6/in6_rmx.c 2008-02-27 18:01:23.000000000 +0100 @@ -75,6 +75,8 @@ #include __FBSDID("$FreeBSD: src/sys/netinet6/in6_rmx.c,v 1.20 2008/01/08 19:08:57 obrien Exp $"); +#include "opt_vimage.h" + #include #include #include @@ -85,13 +87,17 @@ #include #include #include +#include +#include #include #include #include #include #include +#include + #include #include @@ -104,6 +110,9 @@ #include extern int in6_inithead(void **head, int off); +#ifdef VIMAGE +extern int in6_detachhead(void **head, int off); +#endif #define RTPRF_OURS RTF_PROTO3 /* set on routes we manage */ @@ -321,20 +330,25 @@ } #define RTQ_TIMEOUT 60*10 /* run no less than once every ten minutes */ -static int rtq_timeout = RTQ_TIMEOUT; -static struct callout rtq_timer; +#ifndef VIMAGE +static int rtq_timeout6; +static struct callout rtq_timer6; +#endif static void in6_rtqtimo(void *rock) { - struct radix_node_head *rnh = rock; + CURVNET_SET_QUIET((struct vnet *) rock); + INIT_VNET_NET((struct vnet *) rock); + INIT_VNET_INET6((struct vnet *) rock); + struct radix_node_head *rnh = V_rt_tables[AF_INET6]; struct rtqk_arg arg; struct timeval atv; static time_t last_adjusted_timeout = 0; arg.found = arg.killed = 0; arg.rnh = rnh; - arg.nextstop = time_uptime + rtq_timeout; + arg.nextstop = time_uptime + V_rtq_timeout6; arg.draining = arg.updating = 0; RADIX_NODE_HEAD_LOCK(rnh); rnh->rnh_walktree(rnh, in6_rtqkill, &arg); @@ -349,7 +363,7 @@ * hard. 
*/ if ((arg.found - arg.killed > rtq_toomany) - && (time_uptime - last_adjusted_timeout >= rtq_timeout) + && (time_uptime - last_adjusted_timeout >= V_rtq_timeout6) && rtq_reallyold > rtq_minreallyold) { rtq_reallyold = 2*rtq_reallyold / 3; if (rtq_reallyold < rtq_minreallyold) { @@ -370,7 +384,8 @@ atv.tv_usec = 0; atv.tv_sec = arg.nextstop - time_uptime; - callout_reset(&rtq_timer, tvtohz(&atv), in6_rtqtimo, rock); + callout_reset(&V_rtq_timer6, tvtohz(&atv), in6_rtqtimo, rock); + CURVNET_RESTORE(); } /* @@ -380,7 +395,9 @@ struct radix_node_head *rnh; time_t nextstop; }; +#ifndef VIMAGE static struct callout rtq_mtutimer; +#endif static int in6_mtuexpire(struct radix_node *rn, void *rock) @@ -409,7 +426,10 @@ static void in6_mtutimo(void *rock) { - struct radix_node_head *rnh = rock; + CURVNET_SET_QUIET((struct vnet *) rock); + INIT_VNET_NET((struct vnet *) rock); + INIT_VNET_INET6((struct vnet *) rock); + struct radix_node_head *rnh = V_rt_tables[AF_INET6]; struct mtuex_arg arg; struct timeval atv; @@ -426,14 +446,16 @@ arg.nextstop = time_uptime + 30; /* last resort */ atv.tv_sec = 30; } - callout_reset(&rtq_mtutimer, tvtohz(&atv), in6_mtutimo, rock); + callout_reset(&V_rtq_mtutimer, tvtohz(&atv), in6_mtutimo, rock); + CURVNET_RESTORE(); } #if 0 void in6_rtqdrain(void) { - struct radix_node_head *rnh = rt_tables[AF_INET6]; + INIT_VNET_NET(curvnet); + struct radix_node_head *rnh = V_rt_tables[AF_INET6]; struct rtqk_arg arg; arg.found = arg.killed = 0; @@ -453,21 +475,36 @@ int in6_inithead(void **head, int off) { + INIT_VNET_NET(curvnet); + INIT_VNET_INET6(curvnet); struct radix_node_head *rnh; if (!rn_inithead(head, off)) return 0; - if (head != (void **)&rt_tables[AF_INET6]) /* BOGUS! */ + if (head != (void **)&V_rt_tables[AF_INET6]) /* BOGUS! 
*/ return 1; /* only do this for the real routing table */ + V_rtq_timeout6 = RTQ_TIMEOUT; rnh = *head; rnh->rnh_addaddr = in6_addroute; rnh->rnh_matchaddr = in6_matroute; rnh->rnh_close = in6_clsroute; - callout_init(&rtq_timer, CALLOUT_MPSAFE); - in6_rtqtimo(rnh); /* kick off timeout first time */ - callout_init(&rtq_mtutimer, CALLOUT_MPSAFE); - in6_mtutimo(rnh); /* kick off timeout first time */ + callout_init(&V_rtq_timer6, CALLOUT_MPSAFE); + callout_init(&V_rtq_mtutimer, CALLOUT_MPSAFE); + in6_rtqtimo(curvnet); /* kick off timeout first time */ + in6_mtutimo(curvnet); /* kick off timeout first time */ + return 1; +} + +#ifdef VIMAGE +int +in6_detachhead(void **head, int off) +{ + INIT_VNET_INET6(curvnet); + + callout_drain(&V_rtq_timer6); + callout_drain(&V_rtq_mtutimer); return 1; } +#endif --- /u/marko/p4/head/src/sys/netinet6/in6_src.c 2008-01-28 23:53:56.000000000 +0100 +++ src/sys/netinet6/in6_src.c 2008-02-27 11:49:41.000000000 +0100 @@ -65,6 +65,7 @@ #include "opt_inet.h" #include "opt_inet6.h" +#include "opt_vimage.h" #include #include @@ -81,15 +82,20 @@ #include #include #include +#include +#include #include #include +#include #include #include #include #include #include + +#include #include #include #include @@ -111,9 +117,11 @@ #define ADDRSEL_XUNLOCK() sx_xunlock(&addrsel_sxlock) #define ADDR_LABEL_NOTAPP (-1) +#ifndef VIMAGE struct in6_addrpolicy defaultaddrpolicy; int ip6_prefer_tempaddr = 0; +#endif static int selectroute __P((struct sockaddr_in6 *, struct ip6_pktopts *, struct ip6_moptions *, struct route_in6 *, struct ifnet **, @@ -138,9 +146,9 @@ * an entry to the caller for later use. 
*/ #define REPLACE(r) do {\ - if ((r) < sizeof(ip6stat.ip6s_sources_rule) / \ - sizeof(ip6stat.ip6s_sources_rule[0])) /* check for safety */ \ - ip6stat.ip6s_sources_rule[(r)]++; \ + if ((r) < sizeof(V_ip6stat.ip6s_sources_rule) / \ + sizeof(V_ip6stat.ip6s_sources_rule[0])) /* check for safety */ \ + V_ip6stat.ip6s_sources_rule[(r)]++; \ /* { \ char ip6buf[INET6_ADDRSTRLEN], ip6b[INET6_ADDRSTRLEN]; \ printf("in6_selectsrc: replace %s with %s by %d\n", ia_best ? ip6_sprintf(ip6buf, &ia_best->ia_addr.sin6_addr) : "none", ip6_sprintf(ip6b, &ia->ia_addr.sin6_addr), (r)); \ @@ -148,9 +156,9 @@ goto replace; \ } while(0) #define NEXT(r) do {\ - if ((r) < sizeof(ip6stat.ip6s_sources_rule) / \ - sizeof(ip6stat.ip6s_sources_rule[0])) /* check for safety */ \ - ip6stat.ip6s_sources_rule[(r)]++; \ + if ((r) < sizeof(V_ip6stat.ip6s_sources_rule) / \ + sizeof(V_ip6stat.ip6s_sources_rule[0])) /* check for safety */ \ + V_ip6stat.ip6s_sources_rule[(r)]++; \ /* { \ char ip6buf[INET6_ADDRSTRLEN], ip6b[INET6_ADDRSTRLEN]; \ printf("in6_selectsrc: keep %s against %s by %d\n", ia_best ? 
ip6_sprintf(ip6buf, &ia_best->ia_addr.sin6_addr) : "none", ip6_sprintf(ip6b, &ia->ia_addr.sin6_addr), (r)); \ @@ -158,9 +166,9 @@ goto next; /* XXX: we can't use 'continue' here */ \ } while(0) #define BREAK(r) do { \ - if ((r) < sizeof(ip6stat.ip6s_sources_rule) / \ - sizeof(ip6stat.ip6s_sources_rule[0])) /* check for safety */ \ - ip6stat.ip6s_sources_rule[(r)]++; \ + if ((r) < sizeof(V_ip6stat.ip6s_sources_rule) / \ + sizeof(V_ip6stat.ip6s_sources_rule[0])) /* check for safety */ \ + V_ip6stat.ip6s_sources_rule[(r)]++; \ goto out; /* XXX: we can't use 'break' here */ \ } while(0) @@ -169,6 +177,7 @@ struct ip6_moptions *mopts, struct route_in6 *ro, struct in6_addr *laddr, struct ifnet **ifpp, int *errorp) { + INIT_VNET_INET6(curvnet); struct in6_addr dst; struct ifnet *ifp = NULL; struct in6_ifaddr *ia = NULL, *ia_best = NULL; @@ -251,7 +260,7 @@ if (*errorp != 0) return (NULL); - for (ia = in6_ifaddr; ia; ia = ia->ia_next) { + for (ia = V_in6_ifaddr; ia; ia = ia->ia_next) { int new_scope = -1, new_matchlen = -1; struct in6_addrpolicy *new_policy = NULL; u_int32_t srczone, osrczone, dstzone; @@ -280,7 +289,7 @@ (IN6_IFF_NOTREADY | IN6_IFF_ANYCAST | IN6_IFF_DETACHED))) { continue; } - if (!ip6_use_deprecated && IFA6_IS_DEPRECATED(ia)) + if (!V_ip6_use_deprecated && IFA6_IS_DEPRECATED(ia)) continue; /* Rule 1: Prefer same address */ @@ -351,7 +360,7 @@ */ if (opts == NULL || opts->ip6po_prefer_tempaddr == IP6PO_TEMPADDR_SYSTEM) { - prefer_tempaddr = ip6_prefer_tempaddr; + prefer_tempaddr = V_ip6_prefer_tempaddr; } else if (opts->ip6po_prefer_tempaddr == IP6PO_TEMPADDR_NOTPREFER) { prefer_tempaddr = 0; @@ -442,6 +451,8 @@ struct ifnet **retifp, struct rtentry **retrt, int clone, int norouteok) { + INIT_VNET_NET(curvnet); + INIT_VNET_INET6(curvnet); int error = 0; struct ifnet *ifp = NULL; struct rtentry *rt = NULL; @@ -623,7 +634,7 @@ error = EHOSTUNREACH; } if (error == EHOSTUNREACH) - ip6stat.ip6s_noroute++; + V_ip6stat.ip6s_noroute++; if (retifp != NULL) *retifp 
= ifp; @@ -716,6 +727,7 @@ int in6_selecthlim(struct in6pcb *in6p, struct ifnet *ifp) { + INIT_VNET_INET6(curvnet); if (in6p && in6p->in6p_hops >= 0) return (in6p->in6p_hops); @@ -736,9 +748,9 @@ if (lifp) return (ND_IFINFO(lifp)->chlim); } else - return (ip6_defhlim); + return (V_ip6_defhlim); } - return (ip6_defhlim); + return (V_ip6_defhlim); } /* @@ -748,6 +760,7 @@ int in6_pcbsetport(struct in6_addr *laddr, struct inpcb *inp, struct ucred *cred) { + INIT_VNET_INET(curvnet); struct socket *so = inp->inp_socket; u_int16_t lport = 0, first, last, *lastport; int count, error = 0, wild = 0; @@ -763,19 +776,19 @@ inp->inp_flags |= INP_ANONPORT; if (inp->inp_flags & INP_HIGHPORT) { - first = ipport_hifirstauto; /* sysctl */ - last = ipport_hilastauto; + first = V_ipport_hifirstauto; /* sysctl */ + last = V_ipport_hilastauto; lastport = &pcbinfo->ipi_lasthi; } else if (inp->inp_flags & INP_LOWPORT) { error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0); if (error) return error; - first = ipport_lowfirstauto; /* 1023 */ - last = ipport_lowlastauto; /* 600 */ + first = V_ipport_lowfirstauto; /* 1023 */ + last = V_ipport_lowlastauto; /* 600 */ lastport = &pcbinfo->ipi_lastlow; } else { - first = ipport_firstauto; /* sysctl */ - last = ipport_lastauto; + first = V_ipport_firstauto; /* sysctl */ + last = V_ipport_lastauto; lastport = &pcbinfo->ipi_lastport; } /* @@ -842,26 +855,34 @@ void addrsel_policy_init(void) { - ADDRSEL_LOCK_INIT(); - ADDRSEL_SXLOCK_INIT(); + INIT_VNET_INET6(curvnet); init_policy_queue(); /* initialize the "last resort" policy */ - bzero(&defaultaddrpolicy, sizeof(defaultaddrpolicy)); - defaultaddrpolicy.label = ADDR_LABEL_NOTAPP; + bzero(&V_defaultaddrpolicy, sizeof(V_defaultaddrpolicy)); + V_defaultaddrpolicy.label = ADDR_LABEL_NOTAPP; + +#ifdef VIMAGE + if (!IS_DEFAULT_VNET(curvnet)) + return; +#endif + + ADDRSEL_LOCK_INIT(); + ADDRSEL_SXLOCK_INIT(); } static struct in6_addrpolicy * lookup_addrsel_policy(struct sockaddr_in6 *key) { + 
INIT_VNET_INET6(curvnet); struct in6_addrpolicy *match = NULL; ADDRSEL_LOCK(); match = match_addrsel_policy(key); if (match == NULL) - match = &defaultaddrpolicy; + match = &V_defaultaddrpolicy; else match->use++; ADDRSEL_UNLOCK(); @@ -939,20 +960,22 @@ struct in6_addrpolicy ape_policy; }; -TAILQ_HEAD(addrsel_policyhead, addrsel_policyent); - -struct addrsel_policyhead addrsel_policytab; +#ifndef VIMAGE +TAILQ_HEAD(, addrsel_policyent) addrsel_policytab; +#endif static void init_policy_queue(void) { + INIT_VNET_INET6(curvnet); - TAILQ_INIT(&addrsel_policytab); + TAILQ_INIT(&V_addrsel_policytab); } static int add_addrsel_policyent(struct in6_addrpolicy *newpolicy) { + INIT_VNET_INET6(curvnet); struct addrsel_policyent *new, *pol; MALLOC(new, struct addrsel_policyent *, sizeof(*new), M_IFADDR, @@ -961,7 +984,7 @@ ADDRSEL_LOCK(); /* duplication check */ - TAILQ_FOREACH(pol, &addrsel_policytab, ape_entry) { + TAILQ_FOREACH(pol, &V_addrsel_policytab, ape_entry) { if (IN6_ARE_ADDR_EQUAL(&newpolicy->addr.sin6_addr, &pol->ape_policy.addr.sin6_addr) && IN6_ARE_ADDR_EQUAL(&newpolicy->addrmask.sin6_addr, @@ -978,7 +1001,7 @@ /* XXX: should validate entry */ new->ape_policy = *newpolicy; - TAILQ_INSERT_TAIL(&addrsel_policytab, new, ape_entry); + TAILQ_INSERT_TAIL(&V_addrsel_policytab, new, ape_entry); ADDRSEL_UNLOCK(); ADDRSEL_XUNLOCK(); @@ -988,13 +1011,14 @@ static int delete_addrsel_policyent(struct in6_addrpolicy *key) { + INIT_VNET_INET6(curvnet); struct addrsel_policyent *pol; ADDRSEL_XLOCK(); ADDRSEL_LOCK(); /* search for the entry in the table */ - TAILQ_FOREACH(pol, &addrsel_policytab, ape_entry) { + TAILQ_FOREACH(pol, &V_addrsel_policytab, ape_entry) { if (IN6_ARE_ADDR_EQUAL(&key->addr.sin6_addr, &pol->ape_policy.addr.sin6_addr) && IN6_ARE_ADDR_EQUAL(&key->addrmask.sin6_addr, @@ -1008,7 +1032,7 @@ return (ESRCH); } - TAILQ_REMOVE(&addrsel_policytab, pol, ape_entry); + TAILQ_REMOVE(&V_addrsel_policytab, pol, ape_entry); ADDRSEL_UNLOCK(); ADDRSEL_XUNLOCK(); @@ -1019,11 
+1043,12 @@ walk_addrsel_policy(int (*callback)(struct in6_addrpolicy *, void *), void *w) { + INIT_VNET_INET6(curvnet); struct addrsel_policyent *pol; int error = 0; ADDRSEL_SLOCK(); - TAILQ_FOREACH(pol, &addrsel_policytab, ape_entry) { + TAILQ_FOREACH(pol, &V_addrsel_policytab, ape_entry) { if ((error = (*callback)(&pol->ape_policy, w)) != 0) { ADDRSEL_SUNLOCK(); return (error); @@ -1047,12 +1072,13 @@ static struct in6_addrpolicy * match_addrsel_policy(struct sockaddr_in6 *key) { + INIT_VNET_INET6(curvnet); struct addrsel_policyent *pent; struct in6_addrpolicy *bestpol = NULL, *pol; int matchlen, bestmatchlen = -1; u_char *mp, *ep, *k, *p, m; - TAILQ_FOREACH(pent, &addrsel_policytab, ape_entry) { + TAILQ_FOREACH(pent, &V_addrsel_policytab, ape_entry) { matchlen = 0; pol = &pent->ape_policy; --- /u/marko/p4/head/src/sys/netinet6/in6_var.h 2007-12-27 19:33:04.000000000 +0100 +++ src/sys/netinet6/in6_var.h 2008-01-14 19:23:58.000000000 +0100 @@ -470,9 +470,11 @@ #endif #ifdef _KERNEL +#ifndef VIMAGE extern struct in6_ifaddr *in6_ifaddr; extern struct icmp6stat icmp6stat; +#endif #define in6_ifstat_inc(ifp, tag) \ do { \ if (ifp) \ --- /u/marko/p4/head/src/sys/netinet6/ip6_forward.c 2007-12-27 19:33:04.000000000 +0100 +++ src/sys/netinet6/ip6_forward.c 2008-01-14 19:23:58.000000000 +0100 @@ -36,6 +36,7 @@ #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_ipstealth.h" +#include "opt_vimage.h" #include #include @@ -48,6 +49,7 @@ #include #include #include +#include #include #include @@ -58,6 +60,7 @@ #include #include #include +#include #include #include #include @@ -71,11 +74,14 @@ #include #include #include +#include #endif /* IPSEC */ #include +#ifndef VIMAGE struct route_in6 ip6_forward_rt; +#endif /* * Forward a packet. 
If some error occurs return the sender @@ -92,6 +98,7 @@ void ip6_forward(struct mbuf *m, int srcrt) { + INIT_VNET_INET6(curvnet); struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct sockaddr_in6 *dst = NULL; struct rtentry *rt = NULL; @@ -101,6 +108,7 @@ u_int32_t inzone, outzone; struct in6_addr src_in6, dst_in6; #ifdef IPSEC + INIT_VNET_IPSEC(curvnet); struct secpolicy *sp = NULL; int ipsecrt = 0; #endif @@ -117,7 +125,7 @@ * before forwarding packet actually. */ if (ipsec6_in_reject(m, NULL)) { - ipsec6stat.in_polvio++; + V_ipsec6stat.in_polvio++; m_freem(m); return; } @@ -132,10 +140,10 @@ if ((m->m_flags & (M_BCAST|M_MCAST)) != 0 || IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { - ip6stat.ip6s_cantforward++; + V_ip6stat.ip6s_cantforward++; /* XXX in6_ifstat_inc(rt->rt_ifp, ifs6_in_discard) */ - if (ip6_log_time + ip6_log_interval < time_second) { - ip6_log_time = time_second; + if (V_ip6_log_time + V_ip6_log_interval < time_second) { + V_ip6_log_time = time_second; log(LOG_DEBUG, "cannot forward " "from %s to %s nxt %d received on %s\n", @@ -149,7 +157,7 @@ } #ifdef IPSTEALTH - if (!ip6stealth) { + if (!V_ip6stealth) { #endif if (ip6->ip6_hlim <= IPV6_HLIMDEC) { /* XXX in6_ifstat_inc(rt->rt_ifp, ifs6_in_discard) */ @@ -179,8 +187,8 @@ sp = ipsec_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND, IP_FORWARDING, &error); if (sp == NULL) { - ipsec6stat.out_inval++; - ip6stat.ip6s_cantforward++; + V_ipsec6stat.out_inval++; + V_ip6stat.ip6s_cantforward++; if (mcopy) { #if 0 /* XXX: what icmp ? */ @@ -200,8 +208,8 @@ /* * This packet is just discarded. */ - ipsec6stat.out_polvio++; - ip6stat.ip6s_cantforward++; + V_ipsec6stat.out_polvio++; + V_ip6stat.ip6s_cantforward++; KEY_FREESP(&sp); if (mcopy) { #if 0 @@ -223,7 +231,7 @@ if (sp->req == NULL) { /* XXX should be panic ? 
*/ printf("ip6_forward: No IPsec request specified.\n"); - ip6stat.ip6s_cantforward++; + V_ip6stat.ip6s_cantforward++; KEY_FREESP(&sp); if (mcopy) { #if 0 @@ -307,7 +315,7 @@ /* don't show these error codes to the user */ break; } - ip6stat.ip6s_cantforward++; + V_ip6stat.ip6s_cantforward++; if (mcopy) { #if 0 /* XXX: what icmp ? */ @@ -352,22 +360,22 @@ goto skip_routing; #endif - dst = (struct sockaddr_in6 *)&ip6_forward_rt.ro_dst; + dst = (struct sockaddr_in6 *)&V_ip6_forward_rt.ro_dst; if (!srcrt) { /* ip6_forward_rt.ro_dst.sin6_addr is equal to ip6->ip6_dst */ - if (ip6_forward_rt.ro_rt == 0 || - (ip6_forward_rt.ro_rt->rt_flags & RTF_UP) == 0) { - if (ip6_forward_rt.ro_rt) { - RTFREE(ip6_forward_rt.ro_rt); - ip6_forward_rt.ro_rt = 0; + if (V_ip6_forward_rt.ro_rt == 0 || + (V_ip6_forward_rt.ro_rt->rt_flags & RTF_UP) == 0) { + if (V_ip6_forward_rt.ro_rt) { + RTFREE(V_ip6_forward_rt.ro_rt); + V_ip6_forward_rt.ro_rt = 0; } /* this probably fails but give it a try again */ - rtalloc((struct route *)&ip6_forward_rt); + rtalloc((struct route *)&V_ip6_forward_rt); } - if (ip6_forward_rt.ro_rt == 0) { - ip6stat.ip6s_noroute++; + if (V_ip6_forward_rt.ro_rt == 0) { + V_ip6stat.ip6s_noroute++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_noroute); if (mcopy) { icmp6_error(mcopy, ICMP6_DST_UNREACH, @@ -376,20 +384,20 @@ m_freem(m); return; } - } else if ((rt = ip6_forward_rt.ro_rt) == 0 || + } else if ((rt = V_ip6_forward_rt.ro_rt) == 0 || !IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &dst->sin6_addr)) { - if (ip6_forward_rt.ro_rt) { - RTFREE(ip6_forward_rt.ro_rt); - ip6_forward_rt.ro_rt = 0; + if (V_ip6_forward_rt.ro_rt) { + RTFREE(V_ip6_forward_rt.ro_rt); + V_ip6_forward_rt.ro_rt = 0; } bzero(dst, sizeof(*dst)); dst->sin6_len = sizeof(struct sockaddr_in6); dst->sin6_family = AF_INET6; dst->sin6_addr = ip6->ip6_dst; - rtalloc((struct route *)&ip6_forward_rt); - if (ip6_forward_rt.ro_rt == 0) { - ip6stat.ip6s_noroute++; + rtalloc((struct route *)&V_ip6_forward_rt); + if 
(V_ip6_forward_rt.ro_rt == 0) { + V_ip6stat.ip6s_noroute++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_noroute); if (mcopy) { icmp6_error(mcopy, ICMP6_DST_UNREACH, @@ -399,7 +407,7 @@ return; } } - rt = ip6_forward_rt.ro_rt; + rt = V_ip6_forward_rt.ro_rt; #ifdef IPSEC skip_routing:; #endif @@ -416,14 +424,14 @@ src_in6 = ip6->ip6_src; if (in6_setscope(&src_in6, rt->rt_ifp, &outzone)) { /* XXX: this should not happen */ - ip6stat.ip6s_cantforward++; - ip6stat.ip6s_badscope++; + V_ip6stat.ip6s_cantforward++; + V_ip6stat.ip6s_badscope++; m_freem(m); return; } if (in6_setscope(&src_in6, m->m_pkthdr.rcvif, &inzone)) { - ip6stat.ip6s_cantforward++; - ip6stat.ip6s_badscope++; + V_ip6stat.ip6s_cantforward++; + V_ip6stat.ip6s_badscope++; m_freem(m); return; } @@ -432,12 +440,12 @@ && !ipsecrt #endif ) { - ip6stat.ip6s_cantforward++; - ip6stat.ip6s_badscope++; + V_ip6stat.ip6s_cantforward++; + V_ip6stat.ip6s_badscope++; in6_ifstat_inc(rt->rt_ifp, ifs6_in_discard); - if (ip6_log_time + ip6_log_interval < time_second) { - ip6_log_time = time_second; + if (V_ip6_log_time + V_ip6_log_interval < time_second) { + V_ip6_log_time = time_second; log(LOG_DEBUG, "cannot forward " "src %s, dst %s, nxt %d, rcvif %s, outif %s\n", @@ -464,8 +472,8 @@ if (in6_setscope(&dst_in6, m->m_pkthdr.rcvif, &inzone) != 0 || in6_setscope(&dst_in6, rt->rt_ifp, &outzone) != 0 || inzone != outzone) { - ip6stat.ip6s_cantforward++; - ip6stat.ip6s_badscope++; + V_ip6stat.ip6s_cantforward++; + V_ip6stat.ip6s_badscope++; m_freem(m); return; } @@ -523,7 +531,7 @@ * Also, don't send redirect if forwarding using a route * modified by a redirect. 
*/ - if (ip6_sendredirects && rt->rt_ifp == m->m_pkthdr.rcvif && !srcrt && + if (V_ip6_sendredirects && rt->rt_ifp == m->m_pkthdr.rcvif && !srcrt && #ifdef IPSEC !ipsecrt && #endif /* IPSEC */ @@ -607,12 +615,12 @@ error = nd6_output(rt->rt_ifp, origifp, m, dst, rt); if (error) { in6_ifstat_inc(rt->rt_ifp, ifs6_out_discard); - ip6stat.ip6s_cantforward++; + V_ip6stat.ip6s_cantforward++; } else { - ip6stat.ip6s_forward++; + V_ip6stat.ip6s_forward++; in6_ifstat_inc(rt->rt_ifp, ifs6_out_forward); if (type) - ip6stat.ip6s_redirectsent++; + V_ip6stat.ip6s_redirectsent++; else { if (mcopy) goto freecopy; --- /u/marko/p4/head/src/sys/netinet6/ip6_input.c 2008-01-15 18:01:33.000000000 +0100 +++ src/sys/netinet6/ip6_input.c 2008-02-27 11:49:43.000000000 +0100 @@ -66,6 +66,7 @@ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" +#include "opt_vimage.h" #include #include @@ -80,13 +81,16 @@ #include #include #include +#include +#include #include #include #include #include #include #include +#include #include #include @@ -95,10 +99,13 @@ #include #endif /* INET */ #include +#include #include #include #include #include + +#include #include #include #include @@ -115,6 +122,7 @@ u_char ip6_protox[IPPROTO_MAX]; static struct ifqueue ip6intrq; +#ifndef VIMAGE static int ip6qmaxlen = IFQ_MAXLEN; struct in6_ifaddr *in6_ifaddr; @@ -125,10 +133,13 @@ int ip6_sourcecheck_interval; /* XXX */ int ip6_ours_check_algorithm; +#endif /* !VIMAGE */ struct pfil_head inet6_pfil_hook; +#ifndef VIMAGE struct ip6stat ip6stat; +#endif static void ip6_init2(void *); static struct ip6aux *ip6_setdstifaddr(struct mbuf *, struct in6_ifaddr *); @@ -137,6 +148,19 @@ static struct mbuf *ip6_pullexthdr(struct mbuf *, size_t, int); #endif +#ifdef VIMAGE +static void vnet_inet6_register(void); + +VNET_MOD_DECLARE(INET6, inet6, NULL, NULL, INET, NULL) + +static void vnet_inet6_register() +{ + vnet_mod_register(&vnet_inet6_modinfo); +} + +SYSINIT(inet6, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST, 
vnet_inet6_register, 0); +#endif /* VIMAGE */ + /* * IP6 initialization: fill in IP6 protocol switch table. * All protocols not implemented in kernel go to raw IP6 protocol handler. @@ -144,9 +168,36 @@ void ip6_init(void) { + INIT_VNET_INET6(curvnet); struct ip6protosw *pr; int i; + V_ip6_prefer_tempaddr = 0; + + V_ip6qmaxlen = IFQ_MAXLEN; + V_ip6_forward_srcrt = 0; /* XXX */ + V_ip6_sourcecheck = 0; /* XXX */ + V_ip6_sourcecheck_interval = 0; /* XXX */ + + V_ip6_ours_check_algorithm = 0; + +#ifdef IP6_AUTO_LINKLOCAL + V_ip6_auto_linklocal = IP6_AUTO_LINKLOCAL; +#else + V_ip6_auto_linklocal = 1; /* enable by default */ +#endif + + scope6_init(); + addrsel_policy_init(); + nd6_init(); + frag6_init(); + +#ifdef VIMAGE + /* Skip global initialization stuff for non-default instances. */ + if (!IS_DEFAULT_VNET(curvnet)) + return; +#endif + #ifdef DIAGNOSTIC if (sizeof(struct protosw) != sizeof(struct ip6protosw)) panic("sizeof(protosw) != sizeof(ip6protosw)"); @@ -178,41 +229,53 @@ printf("%s: WARNING: unable to register pfil hook, " "error %d\n", __func__, i); - ip6intrq.ifq_maxlen = ip6qmaxlen; + ip6intrq.ifq_maxlen = V_ip6qmaxlen; mtx_init(&ip6intrq.ifq_mtx, "ip6_inq", NULL, MTX_DEF); netisr_register(NETISR_IPV6, ip6_input, &ip6intrq, 0); - scope6_init(); - addrsel_policy_init(); - nd6_init(); - frag6_init(); - ip6_desync_factor = arc4random() % MAX_TEMP_DESYNC_FACTOR; + V_ip6_desync_factor = arc4random() % MAX_TEMP_DESYNC_FACTOR; } +#ifdef VIMAGE +void +ip6_destroy() +{ + INIT_VNET_INET6(curvnet); + + nd6_destroy(); + callout_drain(&V_in6_tmpaddrtimer_ch); +} +#endif + static void ip6_init2(void *dummy) { + INIT_VNET_INET6(curvnet); /* nd6_timer_init */ - callout_init(&nd6_timer_ch, 0); - callout_reset(&nd6_timer_ch, hz, nd6_timer, NULL); + callout_init(&V_nd6_timer_ch, 0); + callout_reset(&V_nd6_timer_ch, hz, nd6_timer, curvnet); /* timer for regeneranation of temporary addresses randomize ID */ - callout_init(&in6_tmpaddrtimer_ch, 0); - 
callout_reset(&in6_tmpaddrtimer_ch, - (ip6_temp_preferred_lifetime - ip6_desync_factor - - ip6_temp_regen_advance) * hz, - in6_tmpaddrtimer, NULL); + callout_init(&V_in6_tmpaddrtimer_ch, 0); + callout_reset(&V_in6_tmpaddrtimer_ch, + (V_ip6_temp_preferred_lifetime - V_ip6_desync_factor - + V_ip6_temp_regen_advance) * hz, + in6_tmpaddrtimer, curvnet); } /* cheat */ /* This must be after route_init(), which is now SI_ORDER_THIRD */ SYSINIT(netinet6init2, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ip6_init2, NULL); +#ifndef VIMAGE extern struct route_in6 ip6_forward_rt; +#endif void ip6_input(struct mbuf *m) { + INIT_VNET_NET(curvnet); + INIT_VNET_INET6(curvnet); struct ip6_hdr *ip6; int off = sizeof(struct ip6_hdr), nest; u_int32_t plen; @@ -246,20 +309,20 @@ */ if (m->m_flags & M_EXT) { if (m->m_next) - ip6stat.ip6s_mext2m++; + V_ip6stat.ip6s_mext2m++; else - ip6stat.ip6s_mext1++; + V_ip6stat.ip6s_mext1++; } else { -#define M2MMAX (sizeof(ip6stat.ip6s_m2m)/sizeof(ip6stat.ip6s_m2m[0])) +#define M2MMAX (sizeof(V_ip6stat.ip6s_m2m)/sizeof(V_ip6stat.ip6s_m2m[0])) if (m->m_next) { if (m->m_flags & M_LOOP) { - ip6stat.ip6s_m2m[loif[0].if_index]++; /* XXX */ + V_ip6stat.ip6s_m2m[V_loif->if_index]++; } else if (m->m_pkthdr.rcvif->if_index < M2MMAX) - ip6stat.ip6s_m2m[m->m_pkthdr.rcvif->if_index]++; + V_ip6stat.ip6s_m2m[m->m_pkthdr.rcvif->if_index]++; else - ip6stat.ip6s_m2m[0]++; + V_ip6stat.ip6s_m2m[0]++; } else - ip6stat.ip6s_m1++; + V_ip6stat.ip6s_m1++; #undef M2MMAX } @@ -270,7 +333,7 @@ } in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_receive); - ip6stat.ip6s_total++; + V_ip6stat.ip6s_total++; #ifndef PULLDOWN_TEST /* @@ -308,7 +371,7 @@ struct ifnet *inifp; inifp = m->m_pkthdr.rcvif; if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) { - ip6stat.ip6s_toosmall++; + V_ip6stat.ip6s_toosmall++; in6_ifstat_inc(inifp, ifs6_in_hdrerr); return; } @@ -317,12 +380,12 @@ ip6 = mtod(m, struct ip6_hdr *); if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { - ip6stat.ip6s_badvers++; 
+ V_ip6stat.ip6s_badvers++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_hdrerr); goto bad; } - ip6stat.ip6s_nxthist[ip6->ip6_nxt]++; + V_ip6stat.ip6s_nxthist[ip6->ip6_nxt]++; /* * Check against address spoofing/corruption. @@ -332,7 +395,7 @@ /* * XXX: "badscope" is not very suitable for a multicast source. */ - ip6stat.ip6s_badscope++; + V_ip6stat.ip6s_badscope++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } @@ -344,7 +407,7 @@ * because ip6_mloopback() passes the "actual" interface * as the outgoing/incoming interface. */ - ip6stat.ip6s_badscope++; + V_ip6stat.ip6s_badscope++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } @@ -369,7 +432,7 @@ */ if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) || IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) { - ip6stat.ip6s_badscope++; + V_ip6stat.ip6s_badscope++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } @@ -383,7 +446,7 @@ */ if (IN6_IS_ADDR_V4COMPAT(&ip6->ip6_src) || IN6_IS_ADDR_V4COMPAT(&ip6->ip6_dst)) { - ip6stat.ip6s_badscope++; + V_ip6stat.ip6s_badscope++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } @@ -421,12 +484,12 @@ * is not loopback. 
*/ if (in6_clearscope(&ip6->ip6_src) || in6_clearscope(&ip6->ip6_dst)) { - ip6stat.ip6s_badscope++; /* XXX */ + V_ip6stat.ip6s_badscope++; /* XXX */ goto bad; } if (in6_setscope(&ip6->ip6_src, m->m_pkthdr.rcvif, NULL) || in6_setscope(&ip6->ip6_dst, m->m_pkthdr.rcvif, NULL)) { - ip6stat.ip6s_badscope++; + V_ip6stat.ip6s_badscope++; goto bad; } @@ -445,8 +508,8 @@ if (in6m) ours = 1; else if (!ip6_mrouter) { - ip6stat.ip6s_notmember++; - ip6stat.ip6s_cantforward++; + V_ip6stat.ip6s_notmember++; + V_ip6stat.ip6s_cantforward++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); goto bad; } @@ -457,28 +520,28 @@ /* * Unicast check */ - if (ip6_forward_rt.ro_rt != NULL && - (ip6_forward_rt.ro_rt->rt_flags & RTF_UP) != 0 && + if (V_ip6_forward_rt.ro_rt != NULL && + (V_ip6_forward_rt.ro_rt->rt_flags & RTF_UP) != 0 && IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, - &((struct sockaddr_in6 *)(&ip6_forward_rt.ro_dst))->sin6_addr)) - ip6stat.ip6s_forward_cachehit++; + &((struct sockaddr_in6 *)(&V_ip6_forward_rt.ro_dst))->sin6_addr)) + V_ip6stat.ip6s_forward_cachehit++; else { struct sockaddr_in6 *dst6; - if (ip6_forward_rt.ro_rt) { + if (V_ip6_forward_rt.ro_rt) { /* route is down or destination is different */ - ip6stat.ip6s_forward_cachemiss++; - RTFREE(ip6_forward_rt.ro_rt); - ip6_forward_rt.ro_rt = 0; + V_ip6stat.ip6s_forward_cachemiss++; + RTFREE(V_ip6_forward_rt.ro_rt); + V_ip6_forward_rt.ro_rt = 0; } - bzero(&ip6_forward_rt.ro_dst, sizeof(struct sockaddr_in6)); - dst6 = (struct sockaddr_in6 *)&ip6_forward_rt.ro_dst; + bzero(&V_ip6_forward_rt.ro_dst, sizeof(struct sockaddr_in6)); + dst6 = (struct sockaddr_in6 *)&V_ip6_forward_rt.ro_dst; dst6->sin6_len = sizeof(struct sockaddr_in6); dst6->sin6_family = AF_INET6; dst6->sin6_addr = ip6->ip6_dst; - rtalloc((struct route *)&ip6_forward_rt); + rtalloc((struct route *)&V_ip6_forward_rt); } #define rt6_key(r) ((struct sockaddr_in6 *)((r)->rt_nodes->rn_key)) @@ -503,14 +566,14 @@ * while it would be less efficient. 
Or, should we rather install a * reject route for such a case? */ - if (ip6_forward_rt.ro_rt && - (ip6_forward_rt.ro_rt->rt_flags & + if (V_ip6_forward_rt.ro_rt && + (V_ip6_forward_rt.ro_rt->rt_flags & (RTF_HOST|RTF_GATEWAY)) == RTF_HOST && #ifdef RTF_WASCLONED - !(ip6_forward_rt.ro_rt->rt_flags & RTF_WASCLONED) && + !(V_ip6_forward_rt.ro_rt->rt_flags & RTF_WASCLONED) && #endif #ifdef RTF_CLONED - !(ip6_forward_rt.ro_rt->rt_flags & RTF_CLONED) && + !(V_ip6_forward_rt.ro_rt->rt_flags & RTF_CLONED) && #endif #if 0 /* @@ -519,11 +582,11 @@ * already done through looking up the routing table. */ IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, - &rt6_key(ip6_forward_rt.ro_rt)->sin6_addr) + &rt6_key(V_ip6_forward_rt.ro_rt)->sin6_addr) #endif - ip6_forward_rt.ro_rt->rt_ifp->if_type == IFT_LOOP) { + V_ip6_forward_rt.ro_rt->rt_ifp->if_type == IFT_LOOP) { struct in6_ifaddr *ia6 = - (struct in6_ifaddr *)ip6_forward_rt.ro_rt->rt_ifa; + (struct in6_ifaddr *)V_ip6_forward_rt.ro_rt->rt_ifa; /* * record address information into m_tag. @@ -558,12 +621,12 @@ /* * FAITH (Firewall Aided Internet Translator) */ - if (ip6_keepfaith) { - if (ip6_forward_rt.ro_rt && ip6_forward_rt.ro_rt->rt_ifp - && ip6_forward_rt.ro_rt->rt_ifp->if_type == IFT_FAITH) { + if (V_ip6_keepfaith) { + if (V_ip6_forward_rt.ro_rt && V_ip6_forward_rt.ro_rt->rt_ifp + && V_ip6_forward_rt.ro_rt->rt_ifp->if_type == IFT_FAITH) { /* XXX do we need more sanity checks? */ ours = 1; - deliverifp = ip6_forward_rt.ro_rt->rt_ifp; /* faith */ + deliverifp = V_ip6_forward_rt.ro_rt->rt_ifp; /* faith */ goto hbhcheck; } } @@ -572,8 +635,8 @@ * Now there is no reason to process the packet if it's not our own * and we're not a router. */ - if (!ip6_forwarding) { - ip6stat.ip6s_cantforward++; + if (!V_ip6_forwarding) { + V_ip6stat.ip6s_cantforward++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); goto bad; } @@ -630,7 +693,7 @@ * contained, ip6_hopopts_input() must set a valid * (non-zero) payload length to the variable plen. 
*/ - ip6stat.ip6s_badoptions++; + V_ip6stat.ip6s_badoptions++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_hdrerr); icmp6_error(m, ICMP6_PARAM_PROB, @@ -645,7 +708,7 @@ IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr), sizeof(struct ip6_hbh)); if (hbh == NULL) { - ip6stat.ip6s_tooshort++; + V_ip6stat.ip6s_tooshort++; return; } #endif @@ -658,7 +721,7 @@ * case we should pass the packet to the multicast routing * daemon. */ - if (rtalert != ~0 && ip6_forwarding) { + if (rtalert != ~0 && V_ip6_forwarding) { switch (rtalert) { case IP6OPT_RTALERT_MLD: ours = 1; @@ -681,7 +744,7 @@ * Drop packet if shorter than we expect. */ if (m->m_pkthdr.len - sizeof(struct ip6_hdr) < plen) { - ip6stat.ip6s_tooshort++; + V_ip6stat.ip6s_tooshort++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated); goto bad; } @@ -707,7 +770,7 @@ */ if (ip6_mrouter && ip6_mforward && ip6_mforward(ip6, m->m_pkthdr.rcvif, m)) { - ip6stat.ip6s_cantforward++; + V_ip6stat.ip6s_cantforward++; m_freem(m); return; } @@ -733,7 +796,7 @@ */ if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) || IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) { - ip6stat.ip6s_badscope++; + V_ip6stat.ip6s_badscope++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } @@ -741,13 +804,13 @@ /* * Tell launch routine the next header */ - ip6stat.ip6s_delivered++; + V_ip6stat.ip6s_delivered++; in6_ifstat_inc(deliverifp, ifs6_in_deliver); nest = 0; while (nxt != IPPROTO_DONE) { - if (ip6_hdrnestlimit && (++nest > ip6_hdrnestlimit)) { - ip6stat.ip6s_toomanyhdr++; + if (V_ip6_hdrnestlimit && (++nest > V_ip6_hdrnestlimit)) { + V_ip6stat.ip6s_toomanyhdr++; goto bad; } @@ -756,7 +819,7 @@ * more sanity checks in header chain processing. 
*/ if (m->m_pkthdr.len < off) { - ip6stat.ip6s_tooshort++; + V_ip6stat.ip6s_tooshort++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated); goto bad; } @@ -814,6 +877,7 @@ ip6_hopopts_input(u_int32_t *plenp, u_int32_t *rtalertp, struct mbuf **mp, int *offp) { + INIT_VNET_INET6(curvnet); struct mbuf *m = *mp; int off = *offp, hbhlen; struct ip6_hbh *hbh; @@ -831,14 +895,14 @@ IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr), sizeof(struct ip6_hbh)); if (hbh == NULL) { - ip6stat.ip6s_tooshort++; + V_ip6stat.ip6s_tooshort++; return -1; } hbhlen = (hbh->ip6h_len + 1) << 3; IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr), hbhlen); if (hbh == NULL) { - ip6stat.ip6s_tooshort++; + V_ip6stat.ip6s_tooshort++; return -1; } #endif @@ -869,6 +933,7 @@ ip6_process_hopopts(struct mbuf *m, u_int8_t *opthead, int hbhlen, u_int32_t *rtalertp, u_int32_t *plenp) { + INIT_VNET_INET6(curvnet); struct ip6_hdr *ip6; int optlen = 0; u_int8_t *opt = opthead; @@ -883,7 +948,7 @@ break; case IP6OPT_PADN: if (hbhlen < IP6OPT_MINLEN) { - ip6stat.ip6s_toosmall++; + V_ip6stat.ip6s_toosmall++; goto bad; } optlen = *(opt + 1) + 2; @@ -891,7 +956,7 @@ case IP6OPT_ROUTER_ALERT: /* XXX may need check for alignment */ if (hbhlen < IP6OPT_RTALERT_LEN) { - ip6stat.ip6s_toosmall++; + V_ip6stat.ip6s_toosmall++; goto bad; } if (*(opt + 1) != IP6OPT_RTALERT_LEN - 2) { @@ -908,7 +973,7 @@ case IP6OPT_JUMBO: /* XXX may need check for alignment */ if (hbhlen < IP6OPT_JUMBO_LEN) { - ip6stat.ip6s_toosmall++; + V_ip6stat.ip6s_toosmall++; goto bad; } if (*(opt + 1) != IP6OPT_JUMBO_LEN - 2) { @@ -926,7 +991,7 @@ */ ip6 = mtod(m, struct ip6_hdr *); if (ip6->ip6_plen) { - ip6stat.ip6s_badoptions++; + V_ip6stat.ip6s_badoptions++; icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt - opthead); @@ -950,7 +1015,7 @@ * there's no explicit mention in specification. 
*/ if (*plenp != 0) { - ip6stat.ip6s_badoptions++; + V_ip6stat.ip6s_badoptions++; icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 2 - opthead); @@ -962,7 +1027,7 @@ * jumbo payload length must be larger than 65535. */ if (jumboplen <= IPV6_MAXPACKET) { - ip6stat.ip6s_badoptions++; + V_ip6stat.ip6s_badoptions++; icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 2 - opthead); @@ -973,7 +1038,7 @@ break; default: /* unknown option */ if (hbhlen < IP6OPT_MINLEN) { - ip6stat.ip6s_toosmall++; + V_ip6stat.ip6s_toosmall++; goto bad; } optlen = ip6_unknown_opt(opt, m, @@ -1001,6 +1066,7 @@ int ip6_unknown_opt(u_int8_t *optp, struct mbuf *m, int off) { + INIT_VNET_INET6(curvnet); struct ip6_hdr *ip6; switch (IP6OPT_TYPE(*optp)) { @@ -1010,11 +1076,11 @@ m_freem(m); return (-1); case IP6OPT_TYPE_FORCEICMP: /* send ICMP even if multicasted */ - ip6stat.ip6s_badoptions++; + V_ip6stat.ip6s_badoptions++; icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_OPTION, off); return (-1); case IP6OPT_TYPE_ICMP: /* send ICMP if not multicasted */ - ip6stat.ip6s_badoptions++; + V_ip6stat.ip6s_badoptions++; ip6 = mtod(m, struct ip6_hdr *); if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || (m->m_flags & (M_BCAST|M_MCAST))) @@ -1128,14 +1194,14 @@ ext = ip6_pullexthdr(m, sizeof(struct ip6_hdr), ip6->ip6_nxt); if (ext == NULL) { - ip6stat.ip6s_tooshort++; + V_ip6stat.ip6s_tooshort++; return; } hbh = mtod(ext, struct ip6_hbh *); hbhlen = (hbh->ip6h_len + 1) << 3; if (hbhlen != ext->m_len) { m_freem(ext); - ip6stat.ip6s_tooshort++; + V_ip6stat.ip6s_tooshort++; return; } #endif @@ -1202,7 +1268,7 @@ #else ext = ip6_pullexthdr(m, off, nxt); if (ext == NULL) { - ip6stat.ip6s_tooshort++; + V_ip6stat.ip6s_tooshort++; return; } ip6e = mtod(ext, struct ip6_ext *); @@ -1212,7 +1278,7 @@ elen = (ip6e->ip6e_len + 1) << 3; if (elen != ext->m_len) { m_freem(ext); - ip6stat.ip6s_tooshort++; + V_ip6stat.ip6s_tooshort++; return; } #endif --- 
/u/marko/p4/head/src/sys/netinet6/ip6_ipsec.c 2007-12-27 19:33:04.000000000 +0100 +++ src/sys/netinet6/ip6_ipsec.c 2008-01-14 19:23:58.000000000 +0100 @@ -31,6 +31,7 @@ __FBSDID("$FreeBSD: src/sys/netinet6/ip6_ipsec.c,v 1.7 2007/12/10 16:03:38 obrien Exp $"); #include "opt_ipsec.h" +#include "opt_vimage.h" #include #include @@ -42,6 +43,7 @@ #include #include #include +#include #include #include @@ -61,6 +63,7 @@ #include #include #include +#include #ifdef IPSEC_DEBUG #include #else @@ -68,6 +71,7 @@ #endif #endif /*IPSEC*/ +#include #include extern struct protosw inet6sw[]; @@ -100,6 +104,8 @@ ip6_ipsec_fwd(struct mbuf *m) { #ifdef IPSEC + INIT_VNET_INET6(curvnet); + INIT_VNET_IPSEC(curvnet); struct m_tag *mtag; struct tdb_ident *tdbi; struct secpolicy *sp; @@ -127,7 +133,7 @@ KEY_FREESP(&sp); splx(s); if (error) { - ipstat.ips_cantforward++; + V_ip6stat.ip6s_cantforward++; return 1; } #endif /* IPSEC */ @@ -145,6 +151,7 @@ ip6_ipsec_input(struct mbuf *m, int nxt) { #ifdef IPSEC + INIT_VNET_IPSEC(curvnet); struct m_tag *mtag; struct tdb_ident *tdbi; struct secpolicy *sp; --- /u/marko/p4/head/src/sys/netinet6/ip6_mroute.c 2008-01-15 18:01:33.000000000 +0100 +++ src/sys/netinet6/ip6_mroute.c 2008-02-27 11:49:45.000000000 +0100 @@ -83,6 +83,7 @@ #include "opt_inet.h" #include "opt_inet6.h" +#include "opt_vimage.h" #include #include @@ -101,7 +102,9 @@ #include #include #include +#include +#include #include #include #include @@ -112,6 +115,7 @@ #include #include +#include #include #include #include @@ -148,7 +152,9 @@ .pr_usrreqs = &rip6_usrreqs }; +#ifndef VIMAGE static int ip6_mrouter_ver = 0; +#endif /* !VIMAGE */ SYSCTL_DECL(_net_inet6); SYSCTL_DECL(_net_inet6_ip6); @@ -176,7 +182,9 @@ "Multicast Interfaces (struct mif[MAXMIFS], netinet6/ip6_mroute.h)"); #ifdef MRT6DEBUG -static u_int mrt6debug = 0; /* debug level */ +#ifndef VIMAGE +static u_int mrt6debug; /* debug level */ +#endif /* !VIMAGE */ #define DEBUG_MFC 0x02 #define DEBUG_FORWARD 0x04 #define 
DEBUG_EXPIRE 0x08 @@ -221,7 +229,9 @@ &pim6stat, pim6stat, "PIM Statistics (struct pim6stat, netinet6/pim_var.h)"); +#ifndef VIMAGE static int pim6; +#endif /* * Hash function for a source, group entry @@ -375,6 +385,7 @@ int X_ip6_mrouter_get(struct socket *so, struct sockopt *sopt) { + INIT_VNET_INET6(curvnet); int error = 0; if (so != ip6_mrouter) @@ -382,7 +393,7 @@ switch (sopt->sopt_name) { case MRT6_PIM: - error = sooptcopyout(sopt, &pim6, sizeof(pim6)); + error = sooptcopyout(sopt, &V_pim6, sizeof(V_pim6)); break; } return (error); @@ -451,10 +462,11 @@ static int set_pim6(int *i) { + INIT_VNET_INET6(curvnet); if ((*i != 1) && (*i != 0)) return (EINVAL); - pim6 = *i; + V_pim6 = *i; return (0); } @@ -465,8 +477,14 @@ static int ip6_mrouter_init(struct socket *so, int v, int cmd) { + INIT_VNET_INET6(curvnet); + + V_ip6_mrouter_ver = 0; + #ifdef MRT6DEBUG - if (mrt6debug) + V_mrt6debug = 0; + + if (V_mrt6debug) log(LOG_DEBUG, "ip6_mrouter_init: so_type = %d, pr_protocol = %d\n", so->so_type, so->so_proto->pr_protocol); @@ -483,19 +501,19 @@ return (EADDRINUSE); ip6_mrouter = so; - ip6_mrouter_ver = cmd; + V_ip6_mrouter_ver = cmd; bzero((caddr_t)mf6ctable, sizeof(mf6ctable)); bzero((caddr_t)n6expire, sizeof(n6expire)); - pim6 = 0;/* used for stubbing out/in pim stuff */ + V_pim6 = 0;/* used for stubbing out/in pim stuff */ callout_init(&expire_upcalls_ch, 0); callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, NULL); #ifdef MRT6DEBUG - if (mrt6debug) + if (V_mrt6debug) log(LOG_DEBUG, "ip6_mrouter_init\n"); #endif @@ -508,6 +526,7 @@ int X_ip6_mrouter_done(void) { + INIT_VNET_INET6(curvnet); mifi_t mifi; int i; struct mf6c *rt; @@ -542,7 +561,7 @@ bzero((caddr_t)mif6table, sizeof(mif6table)); nummifs = 0; - pim6 = 0; /* used to stub out/in pim specific code */ + V_pim6 = 0; /* used to stub out/in pim specific code */ callout_stop(&expire_upcalls_ch); @@ -580,12 +599,12 @@ } ip6_mrouter = NULL; - ip6_mrouter_ver = 0; + V_ip6_mrouter_ver = 0; 
splx(s); #ifdef MRT6DEBUG - if (mrt6debug) + if (V_mrt6debug) log(LOG_DEBUG, "ip6_mrouter_done\n"); #endif @@ -600,6 +619,7 @@ static int add_m6if(struct mif6ctl *mifcp) { + INIT_VNET_NET(curvnet); struct mif6 *mifp; struct ifnet *ifp; int error, s; @@ -609,7 +629,7 @@ mifp = mif6table + mifcp->mif6c_mifi; if (mifp->m6_ifp) return (EADDRINUSE); /* XXX: is it appropriate? */ - if (mifcp->mif6c_pifi == 0 || mifcp->mif6c_pifi > if_index) + if (mifcp->mif6c_pifi == 0 || mifcp->mif6c_pifi > V_if_index) return (ENXIO); ifp = ifnet_byindex(mifcp->mif6c_pifi); @@ -661,7 +681,7 @@ nummifs = mifcp->mif6c_mifi + 1; #ifdef MRT6DEBUG - if (mrt6debug) + if (V_mrt6debug) log(LOG_DEBUG, "add_mif #%d, phyint %s\n", mifcp->mif6c_mifi, @@ -718,7 +738,7 @@ splx(s); #ifdef MRT6DEBUG - if (mrt6debug) + if (V_mrt6debug) log(LOG_DEBUG, "del_m6if %d, nummifs %d\n", *mifip, nummifs); #endif @@ -744,7 +764,7 @@ /* If an entry already exists, just update the fields */ if (rt) { #ifdef MRT6DEBUG - if (mrt6debug & DEBUG_MFC) { + if (V_mrt6debug & DEBUG_MFC) { log(LOG_DEBUG, "add_m6fc no upcall h %d o %s g %s p %x\n", ip6_sprintf(ip6bufo, &mfccp->mf6cc_origin.sin6_addr), @@ -784,7 +804,7 @@ mfccp->mf6cc_parent, rt->mf6c_stall); #ifdef MRT6DEBUG - if (mrt6debug & DEBUG_MFC) + if (V_mrt6debug & DEBUG_MFC) log(LOG_DEBUG, "add_m6fc o %s g %s p %x dbg %x\n", ip6_sprintf(ip6bufo, @@ -826,7 +846,7 @@ */ if (nstl == 0) { #ifdef MRT6DEBUG - if (mrt6debug & DEBUG_MFC) + if (V_mrt6debug & DEBUG_MFC) log(LOG_DEBUG, "add_mfc no upcall h %d o %s g %s p %x\n", hash, @@ -930,7 +950,7 @@ hash = MF6CHASH(origin.sin6_addr, mcastgrp.sin6_addr); #ifdef MRT6DEBUG - if (mrt6debug & DEBUG_MFC) { + if (V_mrt6debug & DEBUG_MFC) { char ip6bufo[INET6_ADDRSTRLEN], ip6bufg[INET6_ADDRSTRLEN]; log(LOG_DEBUG,"del_m6fc orig %s mcastgrp %s\n", ip6_sprintf(ip6bufo, &origin.sin6_addr), @@ -1001,6 +1021,7 @@ int X_ip6_mforward(struct ip6_hdr *ip6, struct ifnet *ifp, struct mbuf *m) { + INIT_VNET_INET6(curvnet); struct mf6c *rt; 
struct mif6 *mifp; struct mbuf *mm; @@ -1009,7 +1030,7 @@ char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; #ifdef MRT6DEBUG - if (mrt6debug & DEBUG_FORWARD) + if (V_mrt6debug & DEBUG_FORWARD) log(LOG_DEBUG, "ip6_mforward: src %s, dst %s, ifindex %d\n", ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), @@ -1033,9 +1054,9 @@ * (although such packets must normally set 1 to the hop limit field). */ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { - ip6stat.ip6s_cantforward++; - if (ip6_log_time + ip6_log_interval < time_second) { - ip6_log_time = time_second; + V_ip6stat.ip6s_cantforward++; + if (V_ip6_log_time + V_ip6_log_interval < time_second) { + V_ip6_log_time = time_second; log(LOG_DEBUG, "cannot forward " "from %s to %s nxt %d received on %s\n", @@ -1076,7 +1097,7 @@ mrt6stat.mrt6s_no_route++; #ifdef MRT6DEBUG - if (mrt6debug & (DEBUG_FORWARD | DEBUG_MFC)) + if (V_mrt6debug & (DEBUG_FORWARD | DEBUG_MFC)) log(LOG_DEBUG, "ip6_mforward: no rte s %s g %s\n", ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst)); @@ -1155,7 +1176,7 @@ #ifdef MRT6_OINIT oim = NULL; #endif - switch (ip6_mrouter_ver) { + switch (V_ip6_mrouter_ver) { #ifdef MRT6_OINIT case MRT6_OINIT: oim = mtod(mm, struct omrt6msg *); @@ -1177,7 +1198,7 @@ } #ifdef MRT6DEBUG - if (mrt6debug & DEBUG_FORWARD) + if (V_mrt6debug & DEBUG_FORWARD) log(LOG_DEBUG, "getting the iif info in the kernel\n"); #endif @@ -1187,7 +1208,7 @@ mifp++, mifi++) ; - switch (ip6_mrouter_ver) { + switch (V_ip6_mrouter_ver) { #ifdef MRT6_OINIT case MRT6_OINIT: oim->im6_mif = mifi; @@ -1287,7 +1308,7 @@ mfc->mf6c_expire != 0 && --mfc->mf6c_expire == 0) { #ifdef MRT6DEBUG - if (mrt6debug & DEBUG_EXPIRE) { + if (V_mrt6debug & DEBUG_EXPIRE) { char ip6bufo[INET6_ADDRSTRLEN]; char ip6bufg[INET6_ADDRSTRLEN]; log(LOG_DEBUG, "expire_upcalls: expiring (%s %s)\n", @@ -1326,6 +1347,7 @@ static int ip6_mdq(struct mbuf *m, struct ifnet *ifp, struct mf6c *rt) { + INIT_VNET_INET6(curvnet); 
struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); mifi_t mifi, iif; struct mif6 *mifp; @@ -1355,7 +1377,7 @@ if ((mifi >= nummifs) || (mif6table[mifi].m6_ifp != ifp)) { /* came in the wrong interface */ #ifdef MRT6DEBUG - if (mrt6debug & DEBUG_FORWARD) + if (V_mrt6debug & DEBUG_FORWARD) log(LOG_DEBUG, "wrong if: ifid %d mifi %d mififid %x\n", ifp->if_index, mifi, @@ -1370,7 +1392,7 @@ */ /* have to make sure this is a valid mif */ if (mifi < nummifs && mif6table[mifi].m6_ifp) - if (pim6 && (m->m_flags & M_LOOP) == 0) { + if (V_pim6 && (m->m_flags & M_LOOP) == 0) { /* * Check the M_LOOP flag to avoid an * unnecessary PIM assert. @@ -1397,7 +1419,7 @@ oim = NULL; #endif im = NULL; - switch (ip6_mrouter_ver) { + switch (V_ip6_mrouter_ver) { #ifdef MRT6_OINIT case MRT6_OINIT: oim = mtod(mm, struct omrt6msg *); @@ -1421,7 +1443,7 @@ mifp++, iif++) ; - switch (ip6_mrouter_ver) { + switch (V_ip6_mrouter_ver) { #ifdef MRT6_OINIT case MRT6_OINIT: oim->im6_mif = iif; @@ -1438,7 +1460,7 @@ if (socket_send(ip6_mrouter, mm, &sin6) < 0) { #ifdef MRT6DEBUG - if (mrt6debug) + if (V_mrt6debug) log(LOG_WARNING, "mdq, ip6_mrouter socket queue full\n"); #endif ++mrt6stat.mrt6s_upq_sockfull; @@ -1468,7 +1490,7 @@ dst0 = ip6->ip6_dst; if ((error = in6_setscope(&src0, ifp, &iszone)) != 0 || (error = in6_setscope(&dst0, ifp, &idzone)) != 0) { - ip6stat.ip6s_badscope++; + V_ip6stat.ip6s_badscope++; return (error); } for (mifp = mif6table, mifi = 0; mifi < nummifs; mifp++, mifi++) { @@ -1488,7 +1510,7 @@ &odzone) || iszone != oszone || idzone != odzone) { - ip6stat.ip6s_badscope++; + V_ip6stat.ip6s_badscope++; continue; } } @@ -1504,6 +1526,7 @@ static void phyint_send(struct ip6_hdr *ip6, struct mif6 *mifp, struct mbuf *m) { + INIT_VNET_INET6(curvnet); struct mbuf *mb_copy; struct ifnet *ifp = mifp->m6_ifp; int error = 0; @@ -1547,7 +1570,7 @@ IPV6_FORWARDING, &im6o, NULL, NULL); #ifdef MRT6DEBUG - if (mrt6debug & DEBUG_XMIT) + if (V_mrt6debug & DEBUG_XMIT) log(LOG_DEBUG, "phyint_send on 
mif %d err %d\n", mifp - mif6table, error); #endif @@ -1583,7 +1606,7 @@ error = (*ifp->if_output)(ifp, mb_copy, (struct sockaddr *)&ro.ro_dst, NULL); #ifdef MRT6DEBUG - if (mrt6debug & DEBUG_XMIT) + if (V_mrt6debug & DEBUG_XMIT) log(LOG_DEBUG, "phyint_send on mif %d err %d\n", mifp - mif6table, error); #endif @@ -1593,11 +1616,11 @@ * various router may notify pMTU in multicast, which can be * a DDoS to a router */ - if (ip6_mcast_pmtu) + if (V_ip6_mcast_pmtu) icmp6_error(mb_copy, ICMP6_PACKET_TOO_BIG, 0, linkmtu); else { #ifdef MRT6DEBUG - if (mrt6debug & DEBUG_XMIT) { + if (V_mrt6debug & DEBUG_XMIT) { char ip6bufs[INET6_ADDRSTRLEN]; char ip6bufd[INET6_ADDRSTRLEN]; log(LOG_DEBUG, @@ -1625,7 +1648,7 @@ struct mrt6msg *im6; #ifdef MRT6DEBUG - if (mrt6debug) { + if (V_mrt6debug) { char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; log(LOG_DEBUG, "** IPv6 register_send **\n src %s dst %s\n", ip6_sprintf(ip6bufs, &ip6->ip6_src), @@ -1671,7 +1694,7 @@ if (socket_send(ip6_mrouter, mm, &sin6) < 0) { #ifdef MRT6DEBUG - if (mrt6debug) + if (V_mrt6debug) log(LOG_WARNING, "register_send: ip6_mrouter socket queue full\n"); #endif @@ -1691,6 +1714,7 @@ int pim6_input(struct mbuf **mp, int *offp, int proto) { + INIT_VNET_INET6(curvnet); struct pim *pim; /* pointer to a pim struct */ struct ip6_hdr *ip6; int pimlen; @@ -1709,7 +1733,7 @@ if (pimlen < PIM_MINLEN) { ++pim6stat.pim6s_rcv_tooshort; #ifdef MRT6DEBUG - if (mrt6debug & DEBUG_PIM) + if (V_mrt6debug & DEBUG_PIM) log(LOG_DEBUG,"pim6_input: PIM packet too short\n"); #endif m_freem(m); @@ -1762,7 +1786,7 @@ if (in6_cksum(m, IPPROTO_PIM, off, cksumlen)) { ++pim6stat.pim6s_rcv_badsum; #ifdef MRT6DEBUG - if (mrt6debug & DEBUG_PIM) + if (V_mrt6debug & DEBUG_PIM) log(LOG_DEBUG, "pim6_input: invalid checksum\n"); #endif @@ -1804,7 +1828,7 @@ if ((reg_mif_num >= nummifs) || (reg_mif_num == (mifi_t) -1)) { #ifdef MRT6DEBUG - if (mrt6debug & DEBUG_PIM) + if (V_mrt6debug & DEBUG_PIM) log(LOG_DEBUG, "pim6_input: register mif 
not set: %d\n", reg_mif_num); @@ -1836,7 +1860,7 @@ eip6 = (struct ip6_hdr *) (reghdr + 1); #ifdef MRT6DEBUG - if (mrt6debug & DEBUG_PIM) + if (V_mrt6debug & DEBUG_PIM) log(LOG_DEBUG, "pim6_input[register], eip6: %s -> %s, " "eip6 plen %d\n", @@ -1861,7 +1885,7 @@ if (!IN6_IS_ADDR_MULTICAST(&eip6->ip6_dst)) { ++pim6stat.pim6s_rcv_badregisters; #ifdef MRT6DEBUG - if (mrt6debug & DEBUG_PIM) + if (V_mrt6debug & DEBUG_PIM) log(LOG_DEBUG, "pim6_input: inner packet of register " "is not multicast %s\n", @@ -1890,7 +1914,7 @@ */ m_adj(m, off + PIM_MINLEN); #ifdef MRT6DEBUG - if (mrt6debug & DEBUG_PIM) { + if (V_mrt6debug & DEBUG_PIM) { log(LOG_DEBUG, "pim6_input: forwarding decapsulated register: " "src %s, dst %s, mif %d\n", --- /u/marko/p4/head/src/sys/netinet6/ip6_output.c 2008-02-03 08:16:02.000000000 +0100 +++ src/sys/netinet6/ip6_output.c 2008-02-27 18:02:02.000000000 +0100 @@ -66,6 +66,7 @@ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" +#include "opt_vimage.h" #include #include @@ -78,7 +79,9 @@ #include #include #include +#include +#include #include #include #include @@ -86,6 +89,7 @@ #include #include +#include #include #include #include @@ -188,6 +192,8 @@ struct route_in6 *ro, int flags, struct ip6_moptions *im6o, struct ifnet **ifpp, struct inpcb *inp) { + INIT_VNET_NET(curvnet); + INIT_VNET_INET6(curvnet); struct ip6_hdr *ip6, *mhip6; struct ifnet *ifp, *origifp; struct mbuf *m = m0; @@ -452,7 +458,7 @@ sa.sin6_len = sizeof(sa); sa.sin6_addr = addr[0]; if ((error = sa6_embedscope(&sa, - ip6_use_defzone)) != 0) { + V_ip6_use_defzone)) != 0) { goto bad; } ip6->ip6_dst = sa.sin6_addr; @@ -472,16 +478,16 @@ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src) && (flags & IPV6_UNSPECSRC) == 0) { error = EOPNOTSUPP; - ip6stat.ip6s_badscope++; + V_ip6stat.ip6s_badscope++; goto bad; } if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) { error = EOPNOTSUPP; - ip6stat.ip6s_badscope++; + V_ip6stat.ip6s_badscope++; goto bad; } - ip6stat.ip6s_localout++; + 
V_ip6stat.ip6s_localout++; /* * Route packet. @@ -519,7 +525,7 @@ if (im6o != NULL) ip6->ip6_hlim = im6o->im6o_multicast_hlim; else - ip6->ip6_hlim = ip6_defmcasthlim; + ip6->ip6_hlim = V_ip6_defmcasthlim; } #ifdef IPSEC @@ -596,7 +602,7 @@ &ifp, &rt, 0)) != 0) { switch (error) { case EHOSTUNREACH: - ip6stat.ip6s_noroute++; + V_ip6stat.ip6s_noroute++; break; case EADDRNOTAVAIL: default: @@ -662,7 +668,7 @@ goto routefound; badscope: - ip6stat.ip6s_badscope++; + V_ip6stat.ip6s_badscope++; in6_ifstat_inc(origifp, ifs6_out_discard); if (error == 0) error = EHOSTUNREACH; /* XXX */ @@ -695,7 +701,7 @@ * Confirm that the outgoing interface supports multicast. */ if (!(ifp->if_flags & IFF_MULTICAST)) { - ip6stat.ip6s_noroute++; + V_ip6stat.ip6s_noroute++; in6_ifstat_inc(ifp, ifs6_out_discard); error = ENETUNREACH; goto bad; @@ -845,7 +851,7 @@ /* If destination is now ourself drop to ip6_input(). */ if (in6_localaddr(&ip6->ip6_dst)) { if (m->m_pkthdr.rcvif == NULL) - m->m_pkthdr.rcvif = loif; + m->m_pkthdr.rcvif = V_loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; @@ -975,7 +981,7 @@ if (qslots <= 0 || ((u_int)qslots * (mtu - hlen) < tlen /* - hlen */)) { error = ENOBUFS; - ip6stat.ip6s_odropped++; + V_ip6stat.ip6s_odropped++; goto bad; } @@ -1009,7 +1015,7 @@ MGETHDR(m, M_DONTWAIT, MT_HEADER); if (!m) { error = ENOBUFS; - ip6stat.ip6s_odropped++; + V_ip6stat.ip6s_odropped++; goto sendorfree; } m->m_pkthdr.rcvif = NULL; @@ -1022,7 +1028,7 @@ m->m_len = sizeof(*mhip6); error = ip6_insertfraghdr(m0, m, hlen, &ip6f); if (error) { - ip6stat.ip6s_odropped++; + V_ip6stat.ip6s_odropped++; goto sendorfree; } ip6f->ip6f_offlg = htons((u_short)((off - hlen) & ~7)); @@ -1034,7 +1040,7 @@ sizeof(*ip6f) - sizeof(struct ip6_hdr))); if ((m_frgpart = m_copy(m0, off, len)) == 0) { error = ENOBUFS; - ip6stat.ip6s_odropped++; + V_ip6stat.ip6s_odropped++; goto sendorfree; } m_cat(m, m_frgpart); @@ -1043,7 +1049,7 @@ 
ip6f->ip6f_reserved = 0; ip6f->ip6f_ident = id; ip6f->ip6f_nxt = nextproto; - ip6stat.ip6s_ofragments++; + V_ip6stat.ip6s_ofragments++; in6_ifstat_inc(ifp, ifs6_out_fragcreat); } @@ -1072,7 +1078,7 @@ } if (error == 0) - ip6stat.ip6s_fragmented++; + V_ip6stat.ip6s_fragmented++; done: if (ro == &ip6route && ro->ro_rt) { /* brace necessary for RTFREE */ @@ -2406,6 +2412,8 @@ static int ip6_setmoptions(int optname, struct ip6_moptions **im6op, struct mbuf *m) { + INIT_VNET_NET(curvnet); + INIT_VNET_INET6(curvnet); int error = 0; u_int loop, ifindex; struct ipv6_mreq *mreq; @@ -2426,7 +2434,7 @@ return (ENOBUFS); *im6op = im6o; im6o->im6o_multicast_ifp = NULL; - im6o->im6o_multicast_hlim = ip6_defmcasthlim; + im6o->im6o_multicast_hlim = V_ip6_defmcasthlim; im6o->im6o_multicast_loop = IPV6_DEFAULT_MULTICAST_LOOP; LIST_INIT(&im6o->im6o_memberships); } @@ -2442,7 +2450,7 @@ break; } bcopy(mtod(m, u_int *), &ifindex, sizeof(ifindex)); - if (ifindex < 0 || if_index < ifindex) { + if (ifindex < 0 || V_if_index < ifindex) { error = ENXIO; /* XXX EINVAL? */ break; } @@ -2468,7 +2476,7 @@ if (optval < -1 || optval >= 256) error = EINVAL; else if (optval == -1) - im6o->im6o_multicast_hlim = ip6_defmcasthlim; + im6o->im6o_multicast_hlim = V_ip6_defmcasthlim; else im6o->im6o_multicast_hlim = optval; break; @@ -2547,7 +2555,7 @@ * If the interface is specified, validate it. */ if (mreq->ipv6mr_interface < 0 || - if_index < mreq->ipv6mr_interface) { + V_if_index < mreq->ipv6mr_interface) { error = ENXIO; /* XXX EINVAL? */ break; } @@ -2611,7 +2619,7 @@ * to its ifnet structure. */ if (mreq->ipv6mr_interface < 0 || - if_index < mreq->ipv6mr_interface) { + V_if_index < mreq->ipv6mr_interface) { error = ENXIO; /* XXX EINVAL? 
*/ break; } @@ -2652,7 +2660,7 @@ sa6_mc.sin6_family = AF_INET6; sa6_mc.sin6_len = sizeof(sa6_mc); sa6_mc.sin6_addr = mreq->ipv6mr_multiaddr; - error = sa6_embedscope(&sa6_mc, ip6_use_defzone); + error = sa6_embedscope(&sa6_mc, V_ip6_use_defzone); if (error != 0) break; mreq->ipv6mr_multiaddr = sa6_mc.sin6_addr; @@ -2691,7 +2699,7 @@ * If all options have default values, no need to keep the mbuf. */ if (im6o->im6o_multicast_ifp == NULL && - im6o->im6o_multicast_hlim == ip6_defmcasthlim && + im6o->im6o_multicast_hlim == V_ip6_defmcasthlim && im6o->im6o_multicast_loop == IPV6_DEFAULT_MULTICAST_LOOP && im6o->im6o_memberships.lh_first == NULL) { free(*im6op, M_IP6MOPTS); @@ -2707,6 +2715,7 @@ static int ip6_getmoptions(int optname, struct ip6_moptions *im6o, struct mbuf **mp) { + INIT_VNET_INET6(curvnet); u_int *hlim, *loop, *ifindex; *mp = m_get(M_TRYWAIT, MT_HEADER); /* XXX */ @@ -2726,7 +2735,7 @@ hlim = mtod(*mp, u_int *); (*mp)->m_len = sizeof(u_int); if (im6o == NULL) - *hlim = ip6_defmcasthlim; + *hlim = V_ip6_defmcasthlim; else *hlim = im6o->im6o_multicast_hlim; return (0); @@ -2735,7 +2744,7 @@ loop = mtod(*mp, u_int *); (*mp)->m_len = sizeof(u_int); if (im6o == NULL) - *loop = ip6_defmcasthlim; + *loop = V_ip6_defmcasthlim; else *loop = im6o->im6o_multicast_loop; return (0); @@ -2836,6 +2845,8 @@ ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt, struct ucred *cred, int sticky, int cmsg, int uproto) { + INIT_VNET_NET(curvnet); + INIT_VNET_INET6(curvnet); int minmtupolicy, preftemp; int error; @@ -2911,7 +2922,7 @@ } /* validate the interface index if specified. 
*/ - if (pktinfo->ipi6_ifindex > if_index || + if (pktinfo->ipi6_ifindex > V_if_index || pktinfo->ipi6_ifindex < 0) { return (ENXIO); } @@ -3008,7 +3019,7 @@ IN6_IS_ADDR_MULTICAST(&sa6->sin6_addr)) { return (EINVAL); } - if ((error = sa6_embedscope(sa6, ip6_use_defzone)) + if ((error = sa6_embedscope(sa6, V_ip6_use_defzone)) != 0) { return (error); } --- /u/marko/p4/head/src/sys/netinet6/ip6_var.h 2008-01-28 23:53:57.000000000 +0100 +++ src/sys/netinet6/ip6_var.h 2008-02-27 11:49:47.000000000 +0100 @@ -278,6 +278,7 @@ #define IP6_HDR_ALIGNED_P(ip) ((((intptr_t) (ip)) & 3) == 0) #endif +#ifndef VIMAGE extern struct ip6stat ip6stat; /* statistics */ extern int ip6_defhlim; /* default hop limit */ extern int ip6_defmcasthlim; /* default multicast hop limit */ @@ -289,8 +290,10 @@ * walk list every 5 sec. */ extern int ip6_mcast_pmtu; /* enable pMTU discovery for multicast? */ extern int ip6_v6only; +#endif extern struct socket *ip6_mrouter; /* multicast routing daemon */ +#ifndef VIMAGE extern int ip6_sendredirects; /* send IP redirects when forwarding? */ extern int ip6_maxfragpackets; /* Maximum packets in reassembly queue */ extern int ip6_maxfrags; /* Maximum fragments in reassembly queue */ @@ -304,6 +307,7 @@ extern int ip6_dad_count; /* DupAddrDetectionTransmits */ extern int ip6_auto_flowlabel; +#endif extern int ip6_auto_linklocal; extern int ip6_anonportmin; /* minimum ephemeral port */ @@ -312,8 +316,10 @@ extern int ip6_lowportmax; /* maximum reserved port */ extern int ip6_use_tempaddr; /* whether to use temporary addresses. 
*/ +#ifndef VIMAGE extern int ip6_prefer_tempaddr; /* whether to prefer temporary addresses in the source address selection */ +#endif extern int ip6_use_defzone; /* whether to use the default scope zone when unspecified */ @@ -332,6 +338,9 @@ struct in6_ifaddr; void ip6_init __P((void)); +#ifdef VIMAGE +void ip6_destroy __P((void)); +#endif void ip6_input __P((struct mbuf *)); struct in6_ifaddr *ip6_getdstifaddr __P((struct mbuf *)); void ip6_freepcbopts __P((struct ip6_pktopts *)); --- /u/marko/p4/head/src/sys/netinet6/ip6protosw.h 2007-12-27 19:33:05.000000000 +0100 +++ src/sys/netinet6/ip6protosw.h 2008-01-14 19:23:59.000000000 +0100 @@ -134,6 +134,8 @@ /* utility hooks */ void (*pr_init) /* initialization hook */ __P((void)); + void (*pr_destroy) /* cleanup hook */ + __P((void)); void (*pr_fasttimo) /* fast timeout (200ms) */ __P((void)); --- /u/marko/p4/head/src/sys/netinet6/mld6.c 2008-01-15 18:01:34.000000000 +0100 +++ src/sys/netinet6/mld6.c 2008-02-27 11:49:49.000000000 +0100 @@ -69,6 +69,7 @@ #include "opt_inet.h" #include "opt_inet6.h" +#include "opt_vimage.h" #include #include @@ -79,11 +80,14 @@ #include #include #include +#include +#include #include #include #include +#include #include #include #include @@ -103,7 +107,9 @@ */ #define MLD_UNSOLICITED_REPORT_INTERVAL 10 +#ifndef VIMAGE static struct ip6_pktopts ip6_opts; +#endif static void mld6_sendpkt(struct in6_multi *, int, const struct in6_addr *); static void mld_starttimer(struct in6_multi *); @@ -114,6 +120,7 @@ void mld6_init(void) { + INIT_VNET_INET6(curvnet); static u_int8_t hbh_buf[8]; struct ip6_hbh *hbh = (struct ip6_hbh *)hbh_buf; u_int16_t rtalert_code = htons((u_int16_t)IP6OPT_RTALERT_MLD); @@ -128,8 +135,8 @@ hbh_buf[5] = IP6OPT_RTALERT_LEN - 2; bcopy((caddr_t)&rtalert_code, &hbh_buf[6], sizeof(u_int16_t)); - ip6_initpktopts(&ip6_opts); - ip6_opts.ip6po_hbh = hbh; + ip6_initpktopts(&V_ip6_opts); + V_ip6_opts.ip6po_hbh = hbh; } static void @@ -170,6 +177,7 @@ 
callout_stop(in6m->in6m_timer_ch); + CURVNET_SET(in6m->in6m_ifp->if_vnet); switch (in6m->in6m_state) { case MLD_REPORTPENDING: mld6_start_listening(in6m); @@ -178,6 +186,7 @@ mld6_sendpkt(in6m, MLD_LISTENER_REPORT, NULL); break; } + CURVNET_RESTORE(); splx(s); } @@ -267,6 +276,7 @@ void mld6_input(struct mbuf *m, int off) { + INIT_VNET_INET6(curvnet); struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct mld_hdr *mldh; struct ifnet *ifp = m->m_pkthdr.rcvif; @@ -282,7 +292,7 @@ #else IP6_EXTHDR_GET(mldh, struct mld_hdr *, m, off, sizeof(*mldh)); if (mldh == NULL) { - icmp6stat.icp6s_tooshort++; + V_icmp6stat.icp6s_tooshort++; return; } #endif @@ -437,6 +447,7 @@ static void mld6_sendpkt(struct in6_multi *in6m, int type, const struct in6_addr *dst) { + INIT_VNET_INET6(curvnet); struct mbuf *mh, *md; struct mld_hdr *mldh; struct ip6_hdr *ip6; @@ -510,9 +521,9 @@ im6o.im6o_multicast_loop = (ip6_mrouter != NULL); /* increment output statictics */ - icmp6stat.icp6s_outhist[type]++; + V_icmp6stat.icp6s_outhist[type]++; - ip6_output(mh, &ip6_opts, NULL, 0, &im6o, &outif, NULL); + ip6_output(mh, &V_ip6_opts, NULL, 0, &im6o, &outif, NULL); if (outif) { icmp6_ifstat_inc(outif, ifs6_out_msg); switch (type) { --- /u/marko/p4/head/src/sys/netinet6/nd6.c 2008-01-15 18:01:34.000000000 +0100 +++ src/sys/netinet6/nd6.c 2008-02-27 11:49:50.000000000 +0100 @@ -35,6 +35,7 @@ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_mac.h" +#include "opt_vimage.h" #include #include @@ -50,7 +51,9 @@ #include #include #include +#include +#include #include #include #include @@ -61,6 +64,7 @@ #include #include +#include #include #include #include @@ -79,18 +83,19 @@ #define SDL(s) ((struct sockaddr_dl *)s) /* timer values */ -int nd6_prune = 1; /* walk list every 1 seconds */ -int nd6_delay = 5; /* delay first probe time 5 second */ -int nd6_umaxtries = 3; /* maximum unicast query */ -int nd6_mmaxtries = 3; /* maximum multicast query */ -int nd6_useloopback = 1; /* use loopback 
interface for local traffic */ -int nd6_gctimer = (60 * 60 * 24); /* 1 day: garbage collection timer */ +#ifndef VIMAGE +int nd6_prune; /* walk list every 1 seconds */ +int nd6_delay; /* delay first probe time 5 second */ +int nd6_umaxtries; /* maximum unicast query */ +int nd6_mmaxtries; /* maximum multicast query */ +int nd6_useloopback; /* use loopback interface for local traffic */ +int nd6_gctimer; /* 1 day: garbage collection timer */ /* preventing too many loops in ND option parsing */ -int nd6_maxndopt = 10; /* max # of ND options allowed */ +int nd6_maxndopt; /* max # of ND options allowed */ -int nd6_maxnudhint = 0; /* max # of subsequent upper layer hints */ -int nd6_maxqueuelen = 1; /* max # of packets cached in unresolved ND entries */ +int nd6_maxnudhint; /* max # of subsequent upper layer hints */ +int nd6_maxqueuelen; /* max # of packets cached in unresolved ND entries */ #ifdef ND6_DEBUG int nd6_debug = 1; @@ -101,11 +106,15 @@ /* for debugging? */ static int nd6_inuse, nd6_allocated; -struct llinfo_nd6 llinfo_nd6 = {&llinfo_nd6, &llinfo_nd6}; +struct llinfo_nd6 llinfo_nd6; struct nd_drhead nd_defrouter; -struct nd_prhead nd_prefix = { 0 }; +struct nd_prhead nd_prefix; -int nd6_recalc_reachtm_interval = ND6_RECALC_REACHTM_INTERVAL; +int nd6_recalc_reachtm_interval; + +extern int dad_ignore_ns; /* ignore NS in DAD - specwise incorrect*/ +extern int dad_maxtry; /* max # of *tries* to transmit DAD packet */ +#endif /* !VIMAGE */ static struct sockaddr_in6 all1_sa; static int nd6_is_new_addr_neighbor __P((struct sockaddr_in6 *, @@ -117,20 +126,59 @@ static void nd6_llinfo_timer(void *); static void clear_llinfo_pqueue(struct llinfo_nd6 *); +#ifndef VIMAGE struct callout nd6_slowtimo_ch; struct callout nd6_timer_ch; extern struct callout in6_tmpaddrtimer_ch; +#endif void nd6_init(void) { - static int nd6_init_done = 0; + INIT_VNET_INET6(curvnet); int i; - if (nd6_init_done) { - log(LOG_NOTICE, "nd6_init called more than once(ignored)\n"); - return; - } 
+ V_nd6_prune = 1; /* walk list every 1 seconds */ + V_nd6_delay = 5; /* delay first probe time 5 second */ + V_nd6_umaxtries = 3; /* maximum unicast query */ + V_nd6_mmaxtries = 3; /* maximum multicast query */ + V_nd6_useloopback = 1; /* use loopback interface for local traffic */ + V_nd6_gctimer = (60 * 60 * 24); /* 1 day: garbage collection timer */ + + /* preventing too many loops in ND option parsing */ + V_nd6_maxndopt = 10; /* max # of ND options allowed */ + + V_nd6_maxnudhint = 0; /* max # of subsequent upper layer hints */ + V_nd6_maxqueuelen = 1; /* max # of packets cached in unresolved ND entries */ + +#ifdef ND6_DEBUG + V_nd6_debug = 1; +#else + V_nd6_debug = 0; +#endif + + V_nd6_recalc_reachtm_interval = ND6_RECALC_REACHTM_INTERVAL; + +#ifdef INET6 + V_dad_ignore_ns = 0; /* ignore NS in DAD - specwise incorrect*/ +#endif + V_dad_maxtry = 15; /* max # of *tries* to transmit DAD packet */ + + V_ip6_use_tempaddr = 0; + + V_ip6_desync_factor = 0; + V_ip6_temp_preferred_lifetime = DEF_TEMP_PREFERRED_LIFETIME; + V_ip6_temp_valid_lifetime = DEF_TEMP_VALID_LIFETIME; + /* + * shorter lifetimes for debugging purposes. 
+ V_ip6_temp_preferred_lifetime = 800; + V_ip6_temp_valid_lifetime = 1800; + */ + + V_ip6_temp_regen_advance = TEMPADDR_REGEN_ADVANCE; + + V_llinfo_nd6.ln_next = V_llinfo_nd6.ln_prev = &V_llinfo_nd6; + LIST_INIT(&V_nd_prefix); all1_sa.sin6_family = AF_INET6; all1_sa.sin6_len = sizeof(struct sockaddr_in6); @@ -138,16 +186,25 @@ all1_sa.sin6_addr.s6_addr[i] = 0xff; /* initialization of the default router list */ - TAILQ_INIT(&nd_defrouter); - - nd6_init_done = 1; + TAILQ_INIT(&V_nd_defrouter); /* start timer */ - callout_init(&nd6_slowtimo_ch, 0); - callout_reset(&nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL * hz, - nd6_slowtimo, NULL); + callout_init(&V_nd6_slowtimo_ch, 0); + callout_reset(&V_nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL * hz, + nd6_slowtimo, curvnet); } +#ifdef VIMAGE +void +nd6_destroy() +{ + INIT_VNET_INET6(curvnet); + + callout_drain(&V_nd6_slowtimo_ch); + callout_drain(&V_nd6_timer_ch); +} +#endif + struct nd_ifinfo * nd6_ifattach(struct ifnet *ifp) { @@ -197,6 +254,7 @@ void nd6_setmtu0(struct ifnet *ifp, struct nd_ifinfo *ndi) { + INIT_VNET_INET6(ifp->if_vnet); u_int32_t omaxmtu; omaxmtu = ndi->maxmtu; @@ -228,10 +286,9 @@ if_name(ifp), (unsigned long)ndi->maxmtu); } - if (ndi->maxmtu > in6_maxmtu) + if (ndi->maxmtu > V_in6_maxmtu) in6_setmaxmtu(); /* check all interfaces just in case */ -#undef MIN } void @@ -306,6 +363,7 @@ int nd6_options(union nd_opts *ndopts) { + INIT_VNET_INET6(curvnet); struct nd_opt_hdr *nd_opt; int i = 0; @@ -323,7 +381,7 @@ * Message validation requires that all included * options have a length that is greater than zero. 
*/ - icmp6stat.icp6s_nd_badopt++; + V_icmp6stat.icp6s_nd_badopt++; bzero(ndopts, sizeof(*ndopts)); return -1; } @@ -366,8 +424,8 @@ skip1: i++; - if (i > nd6_maxndopt) { - icmp6stat.icp6s_nd_toomanyopt++; + if (i > V_nd6_maxndopt) { + V_icmp6stat.icp6s_nd_toomanyopt++; nd6log((LOG_INFO, "too many loop in nd opt\n")); break; } @@ -414,7 +472,7 @@ ln = (struct llinfo_nd6 *)arg; - if (ln->ln_ntick > 0) { + if (ln->ln_ntick > 0) { if (ln->ln_ntick > INT_MAX) { ln->ln_ntick -= INT_MAX; nd6_llinfo_settimer(ln, INT_MAX); @@ -431,6 +489,9 @@ panic("ln->ln_rt->rt_ifp == NULL"); ndi = ND_IFINFO(ifp); + CURVNET_SET(ifp->if_vnet); + INIT_VNET_INET6(curvnet); + /* sanity check */ if (rt->rt_llinfo && (struct llinfo_nd6 *)rt->rt_llinfo != ln) panic("rt_llinfo(%p) is not equal to ln(%p)", @@ -442,7 +503,7 @@ switch (ln->ln_state) { case ND6_LLINFO_INCOMPLETE: - if (ln->ln_asked < nd6_mmaxtries) { + if (ln->ln_asked < V_nd6_mmaxtries) { ln->ln_asked++; nd6_llinfo_settimer(ln, (long)ndi->retrans * hz / 1000); nd6_ns_output(ifp, NULL, dst, ln, 0); @@ -471,7 +532,7 @@ case ND6_LLINFO_REACHABLE: if (!ND6_LLINFO_PERMANENT(ln)) { ln->ln_state = ND6_LLINFO_STALE; - nd6_llinfo_settimer(ln, (long)nd6_gctimer * hz); + nd6_llinfo_settimer(ln, (long)V_nd6_gctimer * hz); } break; @@ -493,11 +554,11 @@ nd6_ns_output(ifp, dst, dst, ln, 0); } else { ln->ln_state = ND6_LLINFO_STALE; /* XXX */ - nd6_llinfo_settimer(ln, (long)nd6_gctimer * hz); + nd6_llinfo_settimer(ln, (long)V_nd6_gctimer * hz); } break; case ND6_LLINFO_PROBE: - if (ln->ln_asked < nd6_umaxtries) { + if (ln->ln_asked < V_nd6_umaxtries) { ln->ln_asked++; nd6_llinfo_settimer(ln, (long)ndi->retrans * hz / 1000); nd6_ns_output(ifp, dst, dst, ln, 0); @@ -521,6 +582,7 @@ } break; } + CURVNET_RESTORE(); } @@ -528,20 +590,22 @@ * ND6 timer routine to expire default route list and prefix list */ void -nd6_timer(void *ignored_arg) +nd6_timer(void *arg) { + CURVNET_SET_QUIET((struct vnet *) arg); + INIT_VNET_INET6((struct vnet *) arg); int s; 
struct nd_defrouter *dr; struct nd_prefix *pr; struct in6_ifaddr *ia6, *nia6; struct in6_addrlifetime *lt6; - callout_reset(&nd6_timer_ch, nd6_prune * hz, - nd6_timer, NULL); + callout_reset(&V_nd6_timer_ch, V_nd6_prune * hz, + nd6_timer, arg); /* expire default router list */ s = splnet(); - dr = TAILQ_FIRST(&nd_defrouter); + dr = TAILQ_FIRST(&V_nd_defrouter); while (dr) { if (dr->expire && dr->expire < time_second) { struct nd_defrouter *t; @@ -560,7 +624,7 @@ * rather separate address lifetimes and prefix lifetimes. */ addrloop: - for (ia6 = in6_ifaddr; ia6; ia6 = nia6) { + for (ia6 = V_in6_ifaddr; ia6; ia6 = nia6) { nia6 = ia6->ia_next; /* check address lifetime */ lt6 = &ia6->ia6_lifetime; @@ -577,7 +641,7 @@ * address. Otherwise, we'd see an infinite loop of * regeneration. */ - if (ip6_use_tempaddr && + if (V_ip6_use_tempaddr && (ia6->ia6_flags & IN6_IFF_TEMPORARY) != 0) { if (regen_tmpaddr(ia6) == 0) regen = 1; @@ -596,7 +660,7 @@ * If a temporary address has just become deprecated, * regenerate a new one if possible. */ - if (ip6_use_tempaddr && + if (V_ip6_use_tempaddr && (ia6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && (oldflags & IN6_IFF_DEPRECATED) == 0) { @@ -626,7 +690,7 @@ } /* expire prefix list */ - pr = nd_prefix.lh_first; + pr = V_nd_prefix.lh_first; while (pr) { /* * check prefix lifetime. @@ -649,6 +713,7 @@ pr = pr->ndpr_next; } splx(s); + CURVNET_RESTORE(); } /* @@ -723,6 +788,7 @@ void nd6_purge(struct ifnet *ifp) { + INIT_VNET_INET6(ifp->if_vnet); struct llinfo_nd6 *ln, *nln; struct nd_defrouter *dr, *ndr; struct nd_prefix *pr, *npr; @@ -733,7 +799,7 @@ * in the routing table, in order to keep additional side effects as * small as possible. 
*/ - for (dr = TAILQ_FIRST(&nd_defrouter); dr; dr = ndr) { + for (dr = TAILQ_FIRST(&V_nd_defrouter); dr; dr = ndr) { ndr = TAILQ_NEXT(dr, dr_entry); if (dr->installed) continue; @@ -742,7 +808,7 @@ defrtrlist_del(dr); } - for (dr = TAILQ_FIRST(&nd_defrouter); dr; dr = ndr) { + for (dr = TAILQ_FIRST(&V_nd_defrouter); dr; dr = ndr) { ndr = TAILQ_NEXT(dr, dr_entry); if (!dr->installed) continue; @@ -752,7 +818,7 @@ } /* Nuke prefix list entries toward ifp */ - for (pr = nd_prefix.lh_first; pr; pr = npr) { + for (pr = V_nd_prefix.lh_first; pr; pr = npr) { npr = pr->ndpr_next; if (pr->ndpr_ifp == ifp) { /* @@ -776,10 +842,10 @@ } /* cancel default outgoing interface setting */ - if (nd6_defifindex == ifp->if_index) + if (V_nd6_defifindex == ifp->if_index) nd6_setdefaultiface(0); - if (!ip6_forwarding && ip6_accept_rtadv) { /* XXX: too restrictive? */ + if (!V_ip6_forwarding && V_ip6_accept_rtadv) { /* XXX: too restrictive? */ /* refresh default router list */ defrouter_select(); } @@ -790,8 +856,8 @@ * due to KAME goto ours hack. See RTM_RESOLVE case in * nd6_rtrequest(), and ip6_input(). */ - ln = llinfo_nd6.ln_next; - while (ln && ln != &llinfo_nd6) { + ln = V_llinfo_nd6.ln_next; + while (ln && ln != &V_llinfo_nd6) { struct rtentry *rt; struct sockaddr_dl *sdl; @@ -810,6 +876,7 @@ struct rtentry * nd6_lookup(struct in6_addr *addr6, int create, struct ifnet *ifp) { + INIT_VNET_INET6(curvnet); struct rtentry *rt; struct sockaddr_in6 sin6; char ip6buf[INET6_ADDRSTRLEN]; @@ -915,6 +982,7 @@ static int nd6_is_new_addr_neighbor(struct sockaddr_in6 *addr, struct ifnet *ifp) { + INIT_VNET_INET6(ifp->if_vnet); struct nd_prefix *pr; struct ifaddr *dstaddr; @@ -947,7 +1015,7 @@ * If the address matches one of our on-link prefixes, it should be a * neighbor. 
*/ - for (pr = nd_prefix.lh_first; pr; pr = pr->ndpr_next) { + for (pr = V_nd_prefix.lh_first; pr; pr = pr->ndpr_next) { if (pr->ndpr_ifp != ifp) continue; @@ -973,8 +1041,8 @@ * XXX: we restrict the condition to hosts, because routers usually do * not have the "default router list". */ - if (!ip6_forwarding && TAILQ_FIRST(&nd_defrouter) == NULL && - nd6_defifindex == ifp->if_index) { + if (!V_ip6_forwarding && TAILQ_FIRST(&V_nd_defrouter) == NULL && + V_nd6_defifindex == ifp->if_index) { return (1); } @@ -1012,6 +1080,7 @@ static struct llinfo_nd6 * nd6_free(struct rtentry *rt, int gc) { + INIT_VNET_INET6(curvnet); struct llinfo_nd6 *ln = (struct llinfo_nd6 *)rt->rt_llinfo, *next; struct in6_addr in6 = ((struct sockaddr_in6 *)rt_key(rt))->sin6_addr; struct nd_defrouter *dr; @@ -1024,7 +1093,7 @@ /* cancel timer */ nd6_llinfo_settimer(ln, -1); - if (!ip6_forwarding) { + if (!V_ip6_forwarding) { int s; s = splnet(); dr = defrouter_lookup(&((struct sockaddr_in6 *)rt_key(rt))->sin6_addr, @@ -1048,7 +1117,7 @@ nd6_llinfo_settimer(ln, (dr->expire - time_second) * hz); else - nd6_llinfo_settimer(ln, (long)nd6_gctimer * hz); + nd6_llinfo_settimer(ln, (long)V_nd6_gctimer * hz); splx(s); return (ln->ln_next); } @@ -1121,6 +1190,7 @@ void nd6_nud_hint(struct rtentry *rt, struct in6_addr *dst6, int force) { + INIT_VNET_INET6(curvnet); struct llinfo_nd6 *ln; /* @@ -1152,7 +1222,7 @@ */ if (!force) { ln->ln_byhint++; - if (ln->ln_byhint > nd6_maxnudhint) + if (ln->ln_byhint > V_nd6_maxnudhint) return; } @@ -1174,6 +1244,8 @@ static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK}; struct ifnet *ifp = rt->rt_ifp; struct ifaddr *ifa; + INIT_VNET_NET(ifp->if_vnet); + INIT_VNET_INET6(ifp->if_vnet); RT_LOCK_ASSERT(rt); @@ -1287,8 +1359,8 @@ log(LOG_DEBUG, "nd6_rtrequest: malloc failed\n"); break; } - nd6_inuse++; - nd6_allocated++; + V_nd6_inuse++; + V_nd6_allocated++; bzero(ln, sizeof(*ln)); RT_ADDREF(rt); ln->ln_rt = rt; @@ -1312,9 +1384,9 @@ nd6_llinfo_settimer(ln, 0); } 
rt->rt_flags |= RTF_LLINFO; - ln->ln_next = llinfo_nd6.ln_next; - llinfo_nd6.ln_next = ln; - ln->ln_prev = &llinfo_nd6; + ln->ln_next = V_llinfo_nd6.ln_next; + V_llinfo_nd6.ln_next = ln; + ln->ln_prev = &V_llinfo_nd6; ln->ln_next->ln_prev = ln; /* @@ -1332,8 +1404,8 @@ bcopy(macp, LLADDR(SDL(gate)), ifp->if_addrlen); SDL(gate)->sdl_alen = ifp->if_addrlen; } - if (nd6_useloopback) { - rt->rt_ifp = &loif[0]; /* XXX */ + if (V_nd6_useloopback) { + rt->rt_ifp = V_loif; /* XXX */ /* * Make sure rt_ifa be equal to the ifaddr * corresponding to the address. @@ -1398,7 +1470,7 @@ } else ; /* XXX: should not happen. bark here? */ } - nd6_inuse--; + V_nd6_inuse--; ln->ln_next->ln_prev = ln->ln_prev; ln->ln_prev->ln_next = ln->ln_next; ln->ln_prev = NULL; @@ -1414,6 +1486,7 @@ int nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) { + INIT_VNET_INET6(ifp->if_vnet); struct in6_drlist *drl = (struct in6_drlist *)data; struct in6_oprlist *oprl = (struct in6_oprlist *)data; struct in6_ndireq *ndi = (struct in6_ndireq *)data; @@ -1432,7 +1505,7 @@ */ bzero(drl, sizeof(*drl)); s = splnet(); - dr = TAILQ_FIRST(&nd_defrouter); + dr = TAILQ_FIRST(&V_nd_defrouter); while (dr && i < DRLSTSIZ) { drl->defrouter[i].rtaddr = dr->rtaddr; in6_clearscope(&drl->defrouter[i].rtaddr); @@ -1461,7 +1534,7 @@ */ bzero(oprl, sizeof(*oprl)); s = splnet(); - pr = nd_prefix.lh_first; + pr = V_nd_prefix.lh_first; while (pr && i < PRLSTSIZ) { struct nd_pfxrouter *pfr; int j; @@ -1570,7 +1643,7 @@ struct nd_prefix *pr, *next; s = splnet(); - for (pr = nd_prefix.lh_first; pr; pr = next) { + for (pr = V_nd_prefix.lh_first; pr; pr = next) { struct in6_ifaddr *ia, *ia_next; next = pr->ndpr_next; @@ -1579,7 +1652,7 @@ continue; /* XXX */ /* do we really have to remove addresses as well? */ - for (ia = in6_ifaddr; ia; ia = ia_next) { + for (ia = V_in6_ifaddr; ia; ia = ia_next) { /* ia might be removed. keep the next ptr. 
*/ ia_next = ia->ia_next; @@ -1601,7 +1674,7 @@ s = splnet(); defrouter_reset(); - for (dr = TAILQ_FIRST(&nd_defrouter); dr; dr = next) { + for (dr = TAILQ_FIRST(&V_nd_defrouter); dr; dr = next) { next = TAILQ_NEXT(dr, dr_entry); defrtrlist_del(dr); } @@ -1633,7 +1706,7 @@ break; } case SIOCGDEFIFACE_IN6: /* XXX: should be implemented as a sysctl? */ - ndif->ifindex = nd6_defifindex; + ndif->ifindex = V_nd6_defifindex; break; case SIOCSDEFIFACE_IN6: /* XXX: should be implemented as a sysctl? */ return (nd6_setdefaultiface(ndif->ifindex)); @@ -1652,6 +1725,7 @@ nd6_cache_lladdr(struct ifnet *ifp, struct in6_addr *from, char *lladdr, int lladdrlen, int type, int code) { + INIT_VNET_INET6(curvnet); struct rtentry *rt = NULL; struct llinfo_nd6 *ln = NULL; int is_newentry; @@ -1764,7 +1838,7 @@ * we must set the timer now, although it is actually * meaningless. */ - nd6_llinfo_settimer(ln, (long)nd6_gctimer * hz); + nd6_llinfo_settimer(ln, (long)V_nd6_gctimer * hz); if (ln->ln_hold) { struct mbuf *m_hold, *m_hold_next; @@ -1875,22 +1949,26 @@ * for those are not autoconfigured hosts, we explicitly avoid such * cases for safety. 
*/ - if (do_update && ln->ln_router && !ip6_forwarding && ip6_accept_rtadv) + if (do_update && ln->ln_router && !V_ip6_forwarding && V_ip6_accept_rtadv) defrouter_select(); return rt; } static void -nd6_slowtimo(void *ignored_arg) +nd6_slowtimo(void *arg) { + CURVNET_SET((struct vnet *) arg); + INIT_VNET_NET((struct vnet *) arg); + INIT_VNET_INET6((struct vnet *) arg); struct nd_ifinfo *nd6if; struct ifnet *ifp; - callout_reset(&nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL * hz, - nd6_slowtimo, NULL); + callout_reset(&V_nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL * hz, + nd6_slowtimo, arg); IFNET_RLOCK(); - for (ifp = TAILQ_FIRST(&ifnet); ifp; ifp = TAILQ_NEXT(ifp, if_list)) { + for (ifp = TAILQ_FIRST(&V_ifnet); ifp; + ifp = TAILQ_NEXT(ifp, if_list)) { nd6if = ND_IFINFO(ifp); if (nd6if->basereachable && /* already initialized */ (nd6if->recalctm -= ND6_SLOWTIMER_INTERVAL) <= 0) { @@ -1900,11 +1978,12 @@ * value gets recomputed at least once every few hours. * (RFC 2461, 6.3.4) */ - nd6if->recalctm = nd6_recalc_reachtm_interval; + nd6if->recalctm = V_nd6_recalc_reachtm_interval; nd6if->reachable = ND_COMPUTE_RTIME(nd6if->basereachable); } } IFNET_RUNLOCK(); + CURVNET_RESTORE(); } #define senderr(e) { error = (e); goto bad;} @@ -1912,6 +1991,7 @@ nd6_output(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m0, struct sockaddr_in6 *dst, struct rtentry *rt0) { + INIT_VNET_INET6(curvnet); struct mbuf *m = m0; struct rtentry *rt = rt0; struct sockaddr_in6 *gw6 = NULL; @@ -2041,7 +2121,7 @@ if ((ifp->if_flags & IFF_POINTOPOINT) != 0 && ln->ln_state < ND6_LLINFO_REACHABLE) { ln->ln_state = ND6_LLINFO_STALE; - nd6_llinfo_settimer(ln, (long)nd6_gctimer * hz); + nd6_llinfo_settimer(ln, (long)V_nd6_gctimer * hz); } /* @@ -2054,7 +2134,7 @@ if (ln->ln_state == ND6_LLINFO_STALE) { ln->ln_asked = 0; ln->ln_state = ND6_LLINFO_DELAY; - nd6_llinfo_settimer(ln, (long)nd6_delay * hz); + nd6_llinfo_settimer(ln, (long)V_nd6_delay * hz); } /* @@ -2086,7 +2166,7 @@ break; } } - while (i >= 
nd6_maxqueuelen) { + while (i >= V_nd6_maxqueuelen) { m_hold = ln->ln_hold; ln->ln_hold = ln->ln_hold->m_nextpkt; m_freem(m_hold); @@ -2262,12 +2342,13 @@ CTLFLAG_RD, nd6_sysctl_drlist, ""); SYSCTL_NODE(_net_inet6_icmp6, ICMPV6CTL_ND6_PRLIST, nd6_prlist, CTLFLAG_RD, nd6_sysctl_prlist, ""); -SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_MAXQLEN, nd6_maxqueuelen, - CTLFLAG_RW, &nd6_maxqueuelen, 1, ""); +SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_ND6_MAXQLEN, nd6_maxqueuelen, + CTLFLAG_RW, nd6_maxqueuelen, 1, ""); static int nd6_sysctl_drlist(SYSCTL_HANDLER_ARGS) { + INIT_VNET_INET6(curvnet); int error; char buf[1024] __aligned(4); struct in6_defrouter *d, *de; @@ -2277,7 +2358,7 @@ return EPERM; error = 0; - for (dr = TAILQ_FIRST(&nd_defrouter); dr; + for (dr = TAILQ_FIRST(&V_nd_defrouter); dr; dr = TAILQ_NEXT(dr, dr_entry)) { d = (struct in6_defrouter *)buf; de = (struct in6_defrouter *)(buf + sizeof(buf)); @@ -2308,6 +2389,7 @@ static int nd6_sysctl_prlist(SYSCTL_HANDLER_ARGS) { + INIT_VNET_INET6(curvnet); int error; char buf[1024] __aligned(4); struct in6_prefix *p, *pe; @@ -2318,7 +2400,7 @@ return EPERM; error = 0; - for (pr = nd_prefix.lh_first; pr; pr = pr->ndpr_next) { + for (pr = V_nd_prefix.lh_first; pr; pr = pr->ndpr_next) { u_short advrtrs; size_t advance; struct sockaddr_in6 *sin6, *s6; --- /u/marko/p4/head/src/sys/netinet6/nd6.h 2007-12-27 19:33:07.000000000 +0100 +++ src/sys/netinet6/nd6.h 2008-01-14 19:23:59.000000000 +0100 @@ -328,6 +328,7 @@ LIST_HEAD(nd_prhead, nd_prefix); /* nd6.c */ +#ifndef VIMAGE extern int nd6_prune; extern int nd6_delay; extern int nd6_umaxtries; @@ -339,17 +340,22 @@ extern struct nd_drhead nd_defrouter; extern struct nd_prhead nd_prefix; extern int nd6_debug; +#endif -#define nd6log(x) do { if (nd6_debug) log x; } while (/*CONSTCOND*/ 0) +#define nd6log(x) do { if (V_nd6_debug) log x; } while (/*CONSTCOND*/ 0) +#ifndef VIMAGE extern struct callout nd6_timer_ch; +#endif /* nd6_rtr.c */ +#ifndef VIMAGE extern 
int nd6_defifindex; extern int ip6_desync_factor; /* seconds */ extern u_int32_t ip6_temp_preferred_lifetime; /* seconds */ extern u_int32_t ip6_temp_valid_lifetime; /* seconds */ extern int ip6_temp_regen_advance; /* seconds */ +#endif union nd_opts { struct nd_opt_hdr *nd_opt_array[8]; /* max = target address list */ @@ -379,6 +385,9 @@ /* XXX: need nd6_var.h?? */ /* nd6.c */ void nd6_init __P((void)); +#ifdef VIMAGE +void nd6_destroy __P((void)); +#endif struct nd_ifinfo *nd6_ifattach __P((struct ifnet *)); void nd6_ifdetach __P((struct nd_ifinfo *)); int nd6_is_addr_neighbor __P((struct sockaddr_in6 *, struct ifnet *)); --- /u/marko/p4/head/src/sys/netinet6/nd6_nbr.c 2008-01-15 18:01:35.000000000 +0100 +++ src/sys/netinet6/nd6_nbr.c 2008-02-27 18:04:50.000000000 +0100 @@ -36,6 +36,7 @@ #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_carp.h" +#include "opt_vimage.h" #include #include @@ -49,6 +50,8 @@ #include #include #include +#include +#include #include #include @@ -58,6 +61,7 @@ #include #include +#include #include #include #include @@ -76,13 +80,15 @@ static struct dadq *nd6_dad_find(struct ifaddr *); static void nd6_dad_starttimer(struct dadq *, int); static void nd6_dad_stoptimer(struct dadq *); -static void nd6_dad_timer(struct ifaddr *); +static void nd6_dad_timer(struct dadq *); static void nd6_dad_ns_output(struct dadq *, struct ifaddr *); static void nd6_dad_ns_input(struct ifaddr *); static void nd6_dad_na_input(struct ifaddr *); -static int dad_ignore_ns = 0; /* ignore NS in DAD - specwise incorrect*/ -static int dad_maxtry = 15; /* max # of *tries* to transmit DAD packet */ +#ifndef VIMAGE +int dad_ignore_ns; /* ignore NS in DAD - specwise incorrect*/ +int dad_maxtry; /* max # of *tries* to transmit DAD packet */ +#endif /* !VIMAGE */ /* * Input a Neighbor Solicitation Message. 
@@ -93,6 +99,7 @@ void nd6_ns_input(struct mbuf *m, int off, int icmp6len) { + INIT_VNET_INET6(curvnet); struct ifnet *ifp = m->m_pkthdr.rcvif; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct nd_neighbor_solicit *nd_ns; @@ -115,7 +122,7 @@ #else IP6_EXTHDR_GET(nd_ns, struct nd_neighbor_solicit *, m, off, icmp6len); if (nd_ns == NULL) { - icmp6stat.icp6s_tooshort++; + V_icmp6stat.icp6s_tooshort++; return; } #endif @@ -301,7 +308,7 @@ goto bad; nd6_na_output(ifp, &in6_all, &taddr6, ((anycast || proxy || !tlladdr) ? 0 : ND_NA_FLAG_OVERRIDE) | - (ip6_forwarding ? ND_NA_FLAG_ROUTER : 0), + (V_ip6_forwarding ? ND_NA_FLAG_ROUTER : 0), tlladdr, (struct sockaddr *)proxydl); goto freeit; } @@ -311,7 +318,7 @@ nd6_na_output(ifp, &saddr6, &taddr6, ((anycast || proxy || !tlladdr) ? 0 : ND_NA_FLAG_OVERRIDE) | - (ip6_forwarding ? ND_NA_FLAG_ROUTER : 0) | ND_NA_FLAG_SOLICITED, + (V_ip6_forwarding ? ND_NA_FLAG_ROUTER : 0) | ND_NA_FLAG_SOLICITED, tlladdr, (struct sockaddr *)proxydl); freeit: m_freem(m); @@ -324,7 +331,7 @@ ip6_sprintf(ip6bufs, &daddr6))); nd6log((LOG_ERR, "nd6_ns_input: tgt=%s\n", ip6_sprintf(ip6bufs, &taddr6))); - icmp6stat.icp6s_badns++; + V_icmp6stat.icp6s_badns++; m_freem(m); } @@ -344,6 +351,7 @@ nd6_ns_output(struct ifnet *ifp, const struct in6_addr *daddr6, const struct in6_addr *taddr6, struct llinfo_nd6 *ln, int dad) { + INIT_VNET_INET6(ifp->if_vnet); struct mbuf *m; struct ip6_hdr *ip6; struct nd_neighbor_solicit *nd_ns; @@ -521,7 +529,7 @@ ip6_output(m, NULL, &ro, dad ? IPV6_UNSPECSRC : 0, &im6o, NULL, NULL); icmp6_ifstat_inc(ifp, ifs6_out_msg); icmp6_ifstat_inc(ifp, ifs6_out_neighborsolicit); - icmp6stat.icp6s_outhist[ND_NEIGHBOR_SOLICIT]++; + V_icmp6stat.icp6s_outhist[ND_NEIGHBOR_SOLICIT]++; if (ro.ro_rt) { /* we don't cache this route. 
*/ RTFREE(ro.ro_rt); @@ -549,6 +557,7 @@ void nd6_na_input(struct mbuf *m, int off, int icmp6len) { + INIT_VNET_INET6(curvnet); struct ifnet *ifp = m->m_pkthdr.rcvif; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct nd_neighbor_advert *nd_na; @@ -581,7 +590,7 @@ #else IP6_EXTHDR_GET(nd_na, struct nd_neighbor_advert *, m, off, icmp6len); if (nd_na == NULL) { - icmp6stat.icp6s_tooshort++; + V_icmp6stat.icp6s_tooshort++; return; } #endif @@ -686,7 +695,7 @@ } } else { ln->ln_state = ND6_LLINFO_STALE; - nd6_llinfo_settimer(ln, (long)nd6_gctimer * hz); + nd6_llinfo_settimer(ln, (long)V_nd6_gctimer * hz); } if ((ln->ln_router = is_router) != 0) { /* @@ -740,7 +749,7 @@ */ if (ln->ln_state == ND6_LLINFO_REACHABLE) { ln->ln_state = ND6_LLINFO_STALE; - nd6_llinfo_settimer(ln, (long)nd6_gctimer * hz); + nd6_llinfo_settimer(ln, (long)V_nd6_gctimer * hz); } goto freeit; } else if (is_override /* (2a) */ @@ -770,7 +779,7 @@ if (lladdr != NULL && llchange) { ln->ln_state = ND6_LLINFO_STALE; nd6_llinfo_settimer(ln, - (long)nd6_gctimer * hz); + (long)V_nd6_gctimer * hz); } } } @@ -797,7 +806,7 @@ dr = defrouter_lookup(in6, ifp); if (dr) defrtrlist_del(dr); - else if (!ip6_forwarding) { + else if (!V_ip6_forwarding) { /* * Even if the neighbor is not in the default * router list, the neighbor may be used @@ -840,7 +849,7 @@ return; bad: - icmp6stat.icp6s_badna++; + V_icmp6stat.icp6s_badna++; m_freem(m); } @@ -861,6 +870,7 @@ const struct in6_addr *taddr6, u_long flags, int tlladdr, struct sockaddr *sdl0) { + INIT_VNET_INET6(ifp->if_vnet); struct mbuf *m; struct ip6_hdr *ip6; struct nd_neighbor_advert *nd_na; @@ -1007,7 +1017,7 @@ ip6_output(m, NULL, &ro, 0, &im6o, NULL, NULL); icmp6_ifstat_inc(ifp, ifs6_out_msg); icmp6_ifstat_inc(ifp, ifs6_out_neighboradvert); - icmp6stat.icp6s_outhist[ND_NEIGHBOR_ADVERT]++; + V_icmp6stat.icp6s_outhist[ND_NEIGHBOR_ADVERT]++; if (ro.ro_rt) { /* we don't cache this route. 
*/ RTFREE(ro.ro_rt); @@ -1047,7 +1057,6 @@ } } -TAILQ_HEAD(dadq_head, dadq); struct dadq { TAILQ_ENTRY(dadq) dad_list; struct ifaddr *dad_ifa; @@ -1057,17 +1066,21 @@ int dad_ns_icount; int dad_na_icount; struct callout dad_timer_ch; + struct vnet *dad_vnet; }; -static struct dadq_head dadq; +#ifndef VIMAGE +TAILQ_HEAD(, dadq) dadq; static int dad_init = 0; +#endif static struct dadq * nd6_dad_find(struct ifaddr *ifa) { + INIT_VNET_INET6(curvnet); struct dadq *dp; - for (dp = dadq.tqh_first; dp; dp = dp->dad_list.tqe_next) { + for (dp = V_dadq.tqh_first; dp; dp = dp->dad_list.tqe_next) { if (dp->dad_ifa == ifa) return dp; } @@ -1079,7 +1092,7 @@ { callout_reset(&dp->dad_timer_ch, ticks, - (void (*)(void *))nd6_dad_timer, (void *)dp->dad_ifa); + (void (*)(void *))nd6_dad_timer, (void *)dp); } static void @@ -1095,13 +1108,14 @@ void nd6_dad_start(struct ifaddr *ifa, int delay) { + INIT_VNET_INET6(curvnet); struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; struct dadq *dp; char ip6buf[INET6_ADDRSTRLEN]; - if (!dad_init) { - TAILQ_INIT(&dadq); - dad_init++; + if (!V_dad_init) { + TAILQ_INIT(&V_dadq); + V_dad_init++; } /* @@ -1122,7 +1136,7 @@ ia->ia6_flags &= ~IN6_IFF_TENTATIVE; return; } - if (!ip6_dad_count) { + if (!V_ip6_dad_count) { ia->ia6_flags &= ~IN6_IFF_TENTATIVE; return; } @@ -1146,7 +1160,10 @@ } bzero(dp, sizeof(*dp)); callout_init(&dp->dad_timer_ch, 0); - TAILQ_INSERT_TAIL(&dadq, (struct dadq *)dp, dad_list); +#ifdef VIMAGE + dp->dad_vnet = curvnet; +#endif + TAILQ_INSERT_TAIL(&V_dadq, (struct dadq *)dp, dad_list); nd6log((LOG_DEBUG, "%s: starting DAD for %s\n", if_name(ifa->ifa_ifp), ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr))); @@ -1159,7 +1176,7 @@ */ dp->dad_ifa = ifa; IFAREF(ifa); /* just for safety */ - dp->dad_count = ip6_dad_count; + dp->dad_count = V_ip6_dad_count; dp->dad_ns_icount = dp->dad_na_icount = 0; dp->dad_ns_ocount = dp->dad_ns_tcount = 0; if (delay == 0) { @@ -1177,9 +1194,10 @@ void nd6_dad_stop(struct ifaddr *ifa) { + 
INIT_VNET_INET6(curvnet); struct dadq *dp; - if (!dad_init) + if (!V_dad_init) return; dp = nd6_dad_find(ifa); if (!dp) { @@ -1189,32 +1207,26 @@ nd6_dad_stoptimer(dp); - TAILQ_REMOVE(&dadq, (struct dadq *)dp, dad_list); + TAILQ_REMOVE(&V_dadq, (struct dadq *)dp, dad_list); free(dp, M_IP6NDP); dp = NULL; IFAFREE(ifa); } static void -nd6_dad_timer(struct ifaddr *ifa) +nd6_dad_timer(struct dadq *dp) { - int s; + CURVNET_SET(dp->dad_vnet); + INIT_VNET_INET6(curvnet); + struct ifaddr *ifa = dp->dad_ifa; struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; - struct dadq *dp; char ip6buf[INET6_ADDRSTRLEN]; - s = splnet(); /* XXX */ - /* Sanity check */ if (ia == NULL) { log(LOG_ERR, "nd6_dad_timer: called with null parameter\n"); goto done; } - dp = nd6_dad_find(ifa); - if (dp == NULL) { - log(LOG_ERR, "nd6_dad_timer: DAD structure not found\n"); - goto done; - } if (ia->ia6_flags & IN6_IFF_DUPLICATED) { log(LOG_ERR, "nd6_dad_timer: called with duplicated address " "%s(%s)\n", @@ -1231,11 +1243,11 @@ } /* timeouted with IFF_{RUNNING,UP} check */ - if (dp->dad_ns_tcount > dad_maxtry) { + if (dp->dad_ns_tcount > V_dad_maxtry) { nd6log((LOG_INFO, "%s: could not run DAD, driver problem?\n", if_name(ifa->ifa_ifp))); - TAILQ_REMOVE(&dadq, (struct dadq *)dp, dad_list); + TAILQ_REMOVE(&V_dadq, (struct dadq *)dp, dad_list); free(dp, M_IP6NDP); dp = NULL; IFAFREE(ifa); @@ -1288,7 +1300,7 @@ if_name(ifa->ifa_ifp), ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr))); - TAILQ_REMOVE(&dadq, (struct dadq *)dp, dad_list); + TAILQ_REMOVE(&V_dadq, (struct dadq *)dp, dad_list); free(dp, M_IP6NDP); dp = NULL; IFAFREE(ifa); @@ -1296,12 +1308,13 @@ } done: - splx(s); + CURVNET_RESTORE(); } void nd6_dad_duplicated(struct ifaddr *ifa) { + INIT_VNET_INET6(curvnet); struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; struct ifnet *ifp; struct dadq *dp; @@ -1364,7 +1377,7 @@ } } - TAILQ_REMOVE(&dadq, (struct dadq *)dp, dad_list); + TAILQ_REMOVE(&V_dadq, (struct dadq *)dp, dad_list); free(dp, M_IP6NDP); dp = 
NULL; IFAFREE(ifa); @@ -1391,6 +1404,7 @@ static void nd6_dad_ns_input(struct ifaddr *ifa) { + INIT_VNET_INET6(curvnet); struct in6_ifaddr *ia; struct ifnet *ifp; const struct in6_addr *taddr6; @@ -1407,7 +1421,7 @@ dp = nd6_dad_find(ifa); /* Quickhack - completely ignore DAD NS packets */ - if (dad_ignore_ns) { + if (V_dad_ignore_ns) { char ip6buf[INET6_ADDRSTRLEN]; nd6log((LOG_INFO, "nd6_dad_ns_input: ignoring DAD NS packet for " --- /u/marko/p4/head/src/sys/netinet6/nd6_rtr.c 2008-01-15 18:01:35.000000000 +0100 +++ src/sys/netinet6/nd6_rtr.c 2008-02-27 11:49:54.000000000 +0100 @@ -34,6 +34,7 @@ #include "opt_inet.h" #include "opt_inet6.h" +#include "opt_vimage.h" #include #include @@ -46,7 +47,9 @@ #include #include #include +#include +#include #include #include #include @@ -54,6 +57,7 @@ #include #include +#include #include #include #include @@ -84,22 +88,24 @@ static int rt6_deleteroute(struct radix_node *, void *); +#ifndef VIMAGE extern int nd6_recalc_reachtm_interval; static struct ifnet *nd6_defifp; int nd6_defifindex; -int ip6_use_tempaddr = 0; +int ip6_use_tempaddr; int ip6_desync_factor; -u_int32_t ip6_temp_preferred_lifetime = DEF_TEMP_PREFERRED_LIFETIME; -u_int32_t ip6_temp_valid_lifetime = DEF_TEMP_VALID_LIFETIME; +u_int32_t ip6_temp_preferred_lifetime; +u_int32_t ip6_temp_valid_lifetime; /* * shorter lifetimes for debugging purposes. -int ip6_temp_preferred_lifetime = 800; -static int ip6_temp_valid_lifetime = 1800; +int ip6_temp_preferred_lifetime; +static int ip6_temp_valid_lifetime; */ -int ip6_temp_regen_advance = TEMPADDR_REGEN_ADVANCE; +int ip6_temp_regen_advance; +#endif /* !VIMAGE */ /* RTPREF_MEDIUM has to be 0! 
*/ #define RTPREF_HIGH 1 @@ -118,6 +124,7 @@ void nd6_rs_input(struct mbuf *m, int off, int icmp6len) { + INIT_VNET_INET6(curvnet); struct ifnet *ifp = m->m_pkthdr.rcvif; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct nd_router_solicit *nd_rs; @@ -128,7 +135,7 @@ char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; /* If I'm not a router, ignore it. */ - if (ip6_accept_rtadv != 0 || ip6_forwarding != 1) + if (V_ip6_accept_rtadv != 0 || V_ip6_forwarding != 1) goto freeit; /* Sanity checks */ @@ -153,7 +160,7 @@ #else IP6_EXTHDR_GET(nd_rs, struct nd_router_solicit *, m, off, icmp6len); if (nd_rs == NULL) { - icmp6stat.icp6s_tooshort++; + V_icmp6stat.icp6s_tooshort++; return; } #endif @@ -188,7 +195,7 @@ return; bad: - icmp6stat.icp6s_badrs++; + V_icmp6stat.icp6s_badrs++; m_freem(m); } @@ -202,6 +209,7 @@ void nd6_ra_input(struct mbuf *m, int off, int icmp6len) { + INIT_VNET_INET6(curvnet); struct ifnet *ifp = m->m_pkthdr.rcvif; struct nd_ifinfo *ndi = ND_IFINFO(ifp); struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); @@ -217,7 +225,7 @@ * the system-wide variable allows the acceptance, and * per-interface variable allows RAs on the receiving interface. 
*/ - if (ip6_accept_rtadv == 0) + if (V_ip6_accept_rtadv == 0) goto freeit; if (!(ndi->flags & ND6_IFF_ACCEPT_RTADV)) goto freeit; @@ -243,7 +251,7 @@ #else IP6_EXTHDR_GET(nd_ra, struct nd_router_advert *, m, off, icmp6len); if (nd_ra == NULL) { - icmp6stat.icp6s_tooshort++; + V_icmp6stat.icp6s_tooshort++; return; } #endif @@ -278,7 +286,7 @@ ndi->basereachable != advreachable) { ndi->basereachable = advreachable; ndi->reachable = ND_COMPUTE_RTIME(ndi->basereachable); - ndi->recalctm = nd6_recalc_reachtm_interval; /* reset */ + ndi->recalctm = V_nd6_recalc_reachtm_interval; /* reset */ } } if (nd_ra->nd_ra_retransmit) @@ -419,7 +427,7 @@ return; bad: - icmp6stat.icp6s_badra++; + V_icmp6stat.icp6s_badra++; m_freem(m); } @@ -482,9 +490,10 @@ struct nd_defrouter * defrouter_lookup(struct in6_addr *addr, struct ifnet *ifp) { + INIT_VNET_INET6(ifp->if_vnet); struct nd_defrouter *dr; - for (dr = TAILQ_FIRST(&nd_defrouter); dr; + for (dr = TAILQ_FIRST(&V_nd_defrouter); dr; dr = TAILQ_NEXT(dr, dr_entry)) { if (dr->ifp == ifp && IN6_ARE_ADDR_EQUAL(addr, &dr->rtaddr)) return (dr); @@ -530,9 +539,10 @@ void defrouter_reset(void) { + INIT_VNET_INET6(curvnet); struct nd_defrouter *dr; - for (dr = TAILQ_FIRST(&nd_defrouter); dr; + for (dr = TAILQ_FIRST(&V_nd_defrouter); dr; dr = TAILQ_NEXT(dr, dr_entry)) defrouter_delreq(dr); @@ -545,6 +555,7 @@ void defrtrlist_del(struct nd_defrouter *dr) { + INIT_VNET_INET6(curvnet); struct nd_defrouter *deldr = NULL; struct nd_prefix *pr; @@ -552,19 +563,19 @@ * Flush all the routing table entries that use the router * as a next hop. */ - if (!ip6_forwarding && ip6_accept_rtadv) /* XXX: better condition? */ + if (!V_ip6_forwarding && V_ip6_accept_rtadv) /* XXX: better condition? */ rt6_flush(&dr->rtaddr, dr->ifp); if (dr->installed) { deldr = dr; defrouter_delreq(dr); } - TAILQ_REMOVE(&nd_defrouter, dr, dr_entry); + TAILQ_REMOVE(&V_nd_defrouter, dr, dr_entry); /* * Also delete all the pointers to the router in each prefix lists. 
*/ - for (pr = nd_prefix.lh_first; pr; pr = pr->ndpr_next) { + for (pr = V_nd_prefix.lh_first; pr; pr = pr->ndpr_next) { struct nd_pfxrouter *pfxrtr; if ((pfxrtr = pfxrtr_lookup(pr, dr)) != NULL) pfxrtr_del(pfxrtr); @@ -606,6 +617,7 @@ void defrouter_select(void) { + INIT_VNET_INET6(curvnet); int s = splnet(); struct nd_defrouter *dr, *selected_dr = NULL, *installed_dr = NULL; struct rtentry *rt = NULL; @@ -617,10 +629,10 @@ * if the node is not an autoconfigured host, we explicitly exclude * such cases here for safety. */ - if (ip6_forwarding || !ip6_accept_rtadv) { + if (V_ip6_forwarding || !V_ip6_accept_rtadv) { nd6log((LOG_WARNING, "defrouter_select: called unexpectedly (forwarding=%d, " - "accept_rtadv=%d)\n", ip6_forwarding, ip6_accept_rtadv)); + "accept_rtadv=%d)\n", V_ip6_forwarding, V_ip6_accept_rtadv)); splx(s); return; } @@ -629,7 +641,7 @@ * Let's handle easy case (3) first: * If default router list is empty, there's nothing to be done. */ - if (!TAILQ_FIRST(&nd_defrouter)) { + if (!TAILQ_FIRST(&V_nd_defrouter)) { splx(s); return; } @@ -639,7 +651,7 @@ * We just pick up the first reachable one (if any), assuming that * the ordering rule of the list described in defrtrlist_update(). */ - for (dr = TAILQ_FIRST(&nd_defrouter); dr; + for (dr = TAILQ_FIRST(&V_nd_defrouter); dr; dr = TAILQ_NEXT(dr, dr_entry)) { if (selected_dr == NULL && (rt = nd6_lookup(&dr->rtaddr, 0, dr->ifp)) && @@ -666,7 +678,7 @@ */ if (selected_dr == NULL) { if (installed_dr == NULL || !TAILQ_NEXT(installed_dr, dr_entry)) - selected_dr = TAILQ_FIRST(&nd_defrouter); + selected_dr = TAILQ_FIRST(&V_nd_defrouter); else selected_dr = TAILQ_NEXT(installed_dr, dr_entry); } else if (installed_dr && @@ -722,6 +734,7 @@ static struct nd_defrouter * defrtrlist_update(struct nd_defrouter *new) { + INIT_VNET_INET6(curvnet); struct nd_defrouter *dr, *n; int s = splnet(); @@ -756,7 +769,7 @@ * defrouter_select() below will handle routing * changes later. 
*/ - TAILQ_REMOVE(&nd_defrouter, dr, dr_entry); + TAILQ_REMOVE(&V_nd_defrouter, dr, dr_entry); n = dr; goto insert; } @@ -787,7 +800,7 @@ */ /* insert at the end of the group */ - for (dr = TAILQ_FIRST(&nd_defrouter); dr; + for (dr = TAILQ_FIRST(&V_nd_defrouter); dr; dr = TAILQ_NEXT(dr, dr_entry)) { if (rtpref(n) > rtpref(dr)) break; @@ -795,7 +808,7 @@ if (dr) TAILQ_INSERT_BEFORE(dr, n, dr_entry); else - TAILQ_INSERT_TAIL(&nd_defrouter, n, dr_entry); + TAILQ_INSERT_TAIL(&V_nd_defrouter, n, dr_entry); defrouter_select(); @@ -843,9 +856,11 @@ struct nd_prefix * nd6_prefix_lookup(struct nd_prefixctl *key) { + INIT_VNET_INET6(curvnet); struct nd_prefix *search; - for (search = nd_prefix.lh_first; search; search = search->ndpr_next) { + for (search = V_nd_prefix.lh_first; + search; search = search->ndpr_next) { if (key->ndpr_ifp == search->ndpr_ifp && key->ndpr_plen == search->ndpr_plen && in6_are_prefix_equal(&key->ndpr_prefix.sin6_addr, @@ -861,6 +876,7 @@ nd6_prelist_add(struct nd_prefixctl *pr, struct nd_defrouter *dr, struct nd_prefix **newp) { + INIT_VNET_INET6(curvnet); struct nd_prefix *new = NULL; int error = 0; int i, s; @@ -894,7 +910,7 @@ s = splnet(); /* link ndpr_entry to nd_prefix list */ - LIST_INSERT_HEAD(&nd_prefix, new, ndpr_entry); + LIST_INSERT_HEAD(&V_nd_prefix, new, ndpr_entry); splx(s); /* ND_OPT_PI_FLAG_ONLINK processing */ @@ -919,6 +935,7 @@ void prelist_remove(struct nd_prefix *pr) { + INIT_VNET_INET6(curvnet); struct nd_pfxrouter *pfr, *next; int e, s; char ip6buf[INET6_ADDRSTRLEN]; @@ -971,6 +988,7 @@ prelist_update(struct nd_prefixctl *new, struct nd_defrouter *dr, struct mbuf *m, int mcast) { + INIT_VNET_INET6(curvnet); struct in6_ifaddr *ia6 = NULL, *ia6_match = NULL; struct ifaddr *ifa; struct ifnet *ifp = new->ndpr_ifp; @@ -1194,20 +1212,20 @@ if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0) { u_int32_t maxvltime, maxpltime; - if (ip6_temp_valid_lifetime > + if (V_ip6_temp_valid_lifetime > (u_int32_t)((time_second - 
ifa6->ia6_createtime) + - ip6_desync_factor)) { - maxvltime = ip6_temp_valid_lifetime - + V_ip6_desync_factor)) { + maxvltime = V_ip6_temp_valid_lifetime - (time_second - ifa6->ia6_createtime) - - ip6_desync_factor; + V_ip6_desync_factor; } else maxvltime = 0; - if (ip6_temp_preferred_lifetime > + if (V_ip6_temp_preferred_lifetime > (u_int32_t)((time_second - ifa6->ia6_createtime) + - ip6_desync_factor)) { - maxpltime = ip6_temp_preferred_lifetime - + V_ip6_desync_factor)) { + maxpltime = V_ip6_temp_preferred_lifetime - (time_second - ifa6->ia6_createtime) - - ip6_desync_factor; + V_ip6_desync_factor; } else maxpltime = 0; @@ -1274,7 +1292,7 @@ * addresses. Thus, we specifiy 1 as the 2nd arg of * in6_tmpifadd(). */ - if (ip6_use_tempaddr) { + if (V_ip6_use_tempaddr) { int e; if ((e = in6_tmpifadd(ia6, 1, 1)) != 0) { nd6log((LOG_NOTICE, "prelist_update: " @@ -1341,6 +1359,7 @@ void pfxlist_onlink_check() { + INIT_VNET_INET6(curvnet); struct nd_prefix *pr; struct in6_ifaddr *ifa; struct nd_defrouter *dr; @@ -1350,7 +1369,7 @@ * Check if there is a prefix that has a reachable advertising * router. */ - for (pr = nd_prefix.lh_first; pr; pr = pr->ndpr_next) { + for (pr = V_nd_prefix.lh_first; pr; pr = pr->ndpr_next) { if (pr->ndpr_raf_onlink && find_pfxlist_reachable_router(pr)) break; } @@ -1360,11 +1379,11 @@ * that does not advertise any prefixes. 
*/ if (pr == NULL) { - for (dr = TAILQ_FIRST(&nd_defrouter); dr; + for (dr = TAILQ_FIRST(&V_nd_defrouter); dr; dr = TAILQ_NEXT(dr, dr_entry)) { struct nd_prefix *pr0; - for (pr0 = nd_prefix.lh_first; pr0; + for (pr0 = V_nd_prefix.lh_first; pr0; pr0 = pr0->ndpr_next) { if ((pfxrtr = pfxrtr_lookup(pr0, dr)) != NULL) break; @@ -1373,7 +1392,7 @@ break; } } - if (pr != NULL || (TAILQ_FIRST(&nd_defrouter) && pfxrtr == NULL)) { + if (pr != NULL || (TAILQ_FIRST(&V_nd_defrouter) && pfxrtr == NULL)) { /* * There is at least one prefix that has a reachable router, * or at least a router which probably does not advertise @@ -1383,7 +1402,7 @@ * Detach prefixes which have no reachable advertising * router, and attach other prefixes. */ - for (pr = nd_prefix.lh_first; pr; pr = pr->ndpr_next) { + for (pr = V_nd_prefix.lh_first; pr; pr = pr->ndpr_next) { /* XXX: a link-local prefix should never be detached */ if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr)) continue; @@ -1404,7 +1423,7 @@ } } else { /* there is no prefix that has a reachable router */ - for (pr = nd_prefix.lh_first; pr; pr = pr->ndpr_next) { + for (pr = V_nd_prefix.lh_first; pr; pr = pr->ndpr_next) { if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr)) continue; @@ -1424,7 +1443,7 @@ * interfaces. Such cases will be handled in nd6_prefix_onlink, * so we don't have to care about them. */ - for (pr = nd_prefix.lh_first; pr; pr = pr->ndpr_next) { + for (pr = V_nd_prefix.lh_first; pr; pr = pr->ndpr_next) { int e; char ip6buf[INET6_ADDRSTRLEN]; @@ -1467,7 +1486,7 @@ * always be attached. * The precise detection logic is same as the one for prefixes. 
*/ - for (ifa = in6_ifaddr; ifa; ifa = ifa->ia_next) { + for (ifa = V_in6_ifaddr; ifa; ifa = ifa->ia_next) { if (!(ifa->ia6_flags & IN6_IFF_AUTOCONF)) continue; @@ -1484,7 +1503,7 @@ break; } if (ifa) { - for (ifa = in6_ifaddr; ifa; ifa = ifa->ia_next) { + for (ifa = V_in6_ifaddr; ifa; ifa = ifa->ia_next) { if ((ifa->ia6_flags & IN6_IFF_AUTOCONF) == 0) continue; @@ -1503,7 +1522,7 @@ } } else { - for (ifa = in6_ifaddr; ifa; ifa = ifa->ia_next) { + for (ifa = V_in6_ifaddr; ifa; ifa = ifa->ia_next) { if ((ifa->ia6_flags & IN6_IFF_AUTOCONF) == 0) continue; @@ -1520,6 +1539,7 @@ int nd6_prefix_onlink(struct nd_prefix *pr) { + INIT_VNET_INET6(curvnet); struct ifaddr *ifa; struct ifnet *ifp = pr->ndpr_ifp; struct sockaddr_in6 mask6; @@ -1545,7 +1565,7 @@ * Although such a configuration is expected to be rare, we explicitly * allow it. */ - for (opr = nd_prefix.lh_first; opr; opr = opr->ndpr_next) { + for (opr = V_nd_prefix.lh_first; opr; opr = opr->ndpr_next) { if (opr == pr) continue; @@ -1633,6 +1653,7 @@ int nd6_prefix_offlink(struct nd_prefix *pr) { + INIT_VNET_INET6(curvnet); int error = 0; struct ifnet *ifp = pr->ndpr_ifp; struct nd_prefix *opr; @@ -1674,7 +1695,7 @@ * If there's one, try to make the prefix on-link on the * interface. */ - for (opr = nd_prefix.lh_first; opr; opr = opr->ndpr_next) { + for (opr = V_nd_prefix.lh_first; opr; opr = opr->ndpr_next) { if (opr == pr) continue; @@ -1724,6 +1745,7 @@ static struct in6_ifaddr * in6_ifadd(struct nd_prefixctl *pr, int mcast) { + INIT_VNET_INET6(curvnet); struct ifnet *ifp = pr->ndpr_ifp; struct ifaddr *ifa; struct in6_aliasreq ifra; @@ -1852,6 +1874,7 @@ int in6_tmpifadd(const struct in6_ifaddr *ia0, int forcegen, int delay) { + INIT_VNET_INET6(curvnet); struct ifnet *ifp = ia0->ia_ifa.ifa_ifp; struct in6_ifaddr *newia, *ia; struct in6_aliasreq ifra; @@ -1890,7 +1913,7 @@ * there may be a time lag between generation of the ID and generation * of the address. So, we'll do one more sanity check. 
*/ - for (ia = in6_ifaddr; ia; ia = ia->ia_next) { + for (ia = V_in6_ifaddr; ia; ia = ia->ia_next) { if (IN6_ARE_ADDR_EQUAL(&ia->ia_addr.sin6_addr, &ifra.ifra_addr.sin6_addr)) { if (trylimit-- == 0) { @@ -1918,20 +1941,20 @@ vltime0 = IFA6_IS_INVALID(ia0) ? 0 : (ia0->ia6_lifetime.ia6t_vltime - (time_second - ia0->ia6_updatetime)); - if (vltime0 > ip6_temp_valid_lifetime) - vltime0 = ip6_temp_valid_lifetime; + if (vltime0 > V_ip6_temp_valid_lifetime) + vltime0 = V_ip6_temp_valid_lifetime; } else - vltime0 = ip6_temp_valid_lifetime; + vltime0 = V_ip6_temp_valid_lifetime; if (ia0->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) { pltime0 = IFA6_IS_DEPRECATED(ia0) ? 0 : (ia0->ia6_lifetime.ia6t_pltime - (time_second - ia0->ia6_updatetime)); - if (pltime0 > ip6_temp_preferred_lifetime - ip6_desync_factor){ - pltime0 = ip6_temp_preferred_lifetime - - ip6_desync_factor; + if (pltime0 > V_ip6_temp_preferred_lifetime - V_ip6_desync_factor){ + pltime0 = V_ip6_temp_preferred_lifetime - + V_ip6_desync_factor; } } else - pltime0 = ip6_temp_preferred_lifetime - ip6_desync_factor; + pltime0 = V_ip6_temp_preferred_lifetime - V_ip6_desync_factor; ifra.ifra_lifetime.ia6t_vltime = vltime0; ifra.ifra_lifetime.ia6t_pltime = pltime0; @@ -1939,7 +1962,7 @@ * A temporary address is created only if this calculated Preferred * Lifetime is greater than REGEN_ADVANCE time units. */ - if (ifra.ifra_lifetime.ia6t_pltime <= ip6_temp_regen_advance) + if (ifra.ifra_lifetime.ia6t_pltime <= V_ip6_temp_regen_advance) return (0); /* XXX: scope zone ID? 
*/ @@ -2019,7 +2042,8 @@ void rt6_flush(struct in6_addr *gateway, struct ifnet *ifp) { - struct radix_node_head *rnh = rt_tables[AF_INET6]; + INIT_VNET_NET(curvnet); + struct radix_node_head *rnh = V_rt_tables[AF_INET6]; int s = splnet(); /* We'll care only link-local addresses */ @@ -2071,26 +2095,28 @@ int nd6_setdefaultiface(int ifindex) { + INIT_VNET_NET(curvnet); + INIT_VNET_INET6(curvnet); int error = 0; - if (ifindex < 0 || if_index < ifindex) + if (ifindex < 0 || V_if_index < ifindex) return (EINVAL); if (ifindex != 0 && !ifnet_byindex(ifindex)) return (EINVAL); - if (nd6_defifindex != ifindex) { - nd6_defifindex = ifindex; - if (nd6_defifindex > 0) - nd6_defifp = ifnet_byindex(nd6_defifindex); + if (V_nd6_defifindex != ifindex) { + V_nd6_defifindex = ifindex; + if (V_nd6_defifindex > 0) + V_nd6_defifp = ifnet_byindex(V_nd6_defifindex); else - nd6_defifp = NULL; + V_nd6_defifp = NULL; /* * Our current implementation assumes one-to-one maping between * interfaces and links, so it would be natural to use the * default interface as the default link. */ - scope6_setdefault(nd6_defifp); + scope6_setdefault(V_nd6_defifp); } return (error); --- /u/marko/p4/head/src/sys/netinet6/raw_ip6.c 2008-01-28 23:53:57.000000000 +0100 +++ src/sys/netinet6/raw_ip6.c 2008-02-27 11:49:55.000000000 +0100 @@ -63,6 +63,7 @@ #include "opt_ipsec.h" #include "opt_inet6.h" +#include "opt_vimage.h" #include #include @@ -77,17 +78,21 @@ #include #include #include +#include +#include #include #include #include +#include #include #include #include #include #include #include +#include #include #include #include @@ -99,6 +104,7 @@ #ifdef IPSEC #include #include +#include #endif /* IPSEC */ #include @@ -110,12 +116,14 @@ * Raw interface to IP6 protocol. 
*/ +#ifndef VIMAGE extern struct inpcbhead ripcb; extern struct inpcbinfo ripcbinfo; -extern u_long rip_sendspace; -extern u_long rip_recvspace; struct rip6stat rip6stat; +#endif +extern u_long rip_sendspace; +extern u_long rip_recvspace; /* * Hooks for multicast forwarding. @@ -135,6 +143,11 @@ int rip6_input(struct mbuf **mp, int *offp, int proto) { + INIT_VNET_INET(curvnet); + INIT_VNET_INET6(curvnet); +#ifdef IPSEC + INIT_VNET_IPSEC(curvnet); +#endif struct mbuf *m = *mp; register struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); register struct inpcb *in6p; @@ -142,7 +155,7 @@ struct mbuf *opts = NULL; struct sockaddr_in6 fromsa; - rip6stat.rip6s_ipackets++; + V_rip6stat.rip6s_ipackets++; if (faithprefix_p != NULL && (*faithprefix_p)(&ip6->ip6_dst)) { /* XXX send icmp6 host/port unreach? */ @@ -152,8 +165,8 @@ init_sin6(&fromsa, m); /* general init */ - INP_INFO_RLOCK(&ripcbinfo); - LIST_FOREACH(in6p, &ripcb, inp_list) { + INP_INFO_RLOCK(&V_ripcbinfo); + LIST_FOREACH(in6p, &V_ripcb, inp_list) { INP_LOCK(in6p); if ((in6p->in6p_vflag & INP_IPV6) == 0) { docontinue: @@ -170,10 +183,10 @@ !IN6_ARE_ADDR_EQUAL(&in6p->in6p_faddr, &ip6->ip6_src)) goto docontinue; if (in6p->in6p_cksum != -1) { - rip6stat.rip6s_isum++; + V_rip6stat.rip6s_isum++; if (in6_cksum(m, proto, *offp, m->m_pkthdr.len - *offp)) { - rip6stat.rip6s_badsum++; + V_rip6stat.rip6s_badsum++; goto docontinue; } } @@ -186,7 +199,7 @@ */ if (n && ipsec6_in_reject(n, last)) { m_freem(n); - ipsec6stat.in_polvio++; + V_ipsec6stat.in_polvio++; /* do not inject data into pcb */ } else #endif /* IPSEC */ @@ -202,7 +215,7 @@ m_freem(n); if (opts) m_freem(opts); - rip6stat.rip6s_fullsock++; + V_rip6stat.rip6s_fullsock++; } else sorwakeup(last->in6p_socket); opts = NULL; @@ -217,8 +230,8 @@ */ if (last && ipsec6_in_reject(m, last)) { m_freem(m); - ipsec6stat.in_polvio++; - ip6stat.ip6s_delivered--; + V_ipsec6stat.in_polvio++; + V_ip6stat.ip6s_delivered--; /* do not inject data into pcb */ INP_UNLOCK(last); } else @@ 
-234,14 +247,14 @@ m_freem(m); if (opts) m_freem(opts); - rip6stat.rip6s_fullsock++; + V_rip6stat.rip6s_fullsock++; } else sorwakeup(last->in6p_socket); INP_UNLOCK(last); } else { - rip6stat.rip6s_nosock++; + V_rip6stat.rip6s_nosock++; if (m->m_flags & M_MCAST) - rip6stat.rip6s_nosockmcast++; + V_rip6stat.rip6s_nosockmcast++; if (proto == IPPROTO_NONE) m_freem(m); else { @@ -250,15 +263,16 @@ ICMP6_PARAMPROB_NEXTHEADER, prvnxtp - mtod(m, char *)); } - ip6stat.ip6s_delivered--; + V_ip6stat.ip6s_delivered--; } - INP_INFO_RUNLOCK(&ripcbinfo); + INP_INFO_RUNLOCK(&V_ripcbinfo); return IPPROTO_DONE; } void rip6_ctlinput(int cmd, struct sockaddr *sa, void *d) { + INIT_VNET_INET(curvnet); struct ip6_hdr *ip6; struct mbuf *m; int off = 0; @@ -295,7 +309,7 @@ sa6_src = &sa6_any; } - (void) in6_pcbnotify(&ripcbinfo, sa, 0, + (void) in6_pcbnotify(&V_ripcbinfo, sa, 0, (const struct sockaddr *)sa6_src, 0, cmd, cmdarg, notify); } @@ -313,6 +327,7 @@ va_dcl #endif { + INIT_VNET_INET6(curvnet); struct mbuf *control; struct socket *so; struct sockaddr_in6 *dstsock; @@ -353,9 +368,9 @@ * XXX: we may still need to determine the zone later. 
*/ if (!(so->so_state & SS_ISCONNECTED)) { - if (dstsock->sin6_scope_id == 0 && !ip6_use_defzone) + if (dstsock->sin6_scope_id == 0 && !V_ip6_use_defzone) scope_ambiguous = 1; - if ((error = sa6_embedscope(dstsock, ip6_use_defzone)) != 0) + if ((error = sa6_embedscope(dstsock, V_ip6_use_defzone)) != 0) goto bad; } @@ -450,9 +465,9 @@ if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) { if (oifp) icmp6_ifoutstat_inc(oifp, type, code); - icmp6stat.icp6s_outhist[type]++; + V_icmp6stat.icp6s_outhist[type]++; } else - rip6stat.rip6s_opackets++; + V_rip6stat.rip6s_opackets++; goto freectl; @@ -538,6 +553,7 @@ static int rip6_attach(struct socket *so, int proto, struct thread *td) { + INIT_VNET_INET(so->so_vnet); struct inpcb *inp; struct icmp6_filter *filter; int error; @@ -554,15 +570,15 @@ sizeof(struct icmp6_filter), M_PCB, M_NOWAIT); if (filter == NULL) return ENOMEM; - INP_INFO_WLOCK(&ripcbinfo); - error = in_pcballoc(so, &ripcbinfo); + INP_INFO_WLOCK(&V_ripcbinfo); + error = in_pcballoc(so, &V_ripcbinfo); if (error) { - INP_INFO_WUNLOCK(&ripcbinfo); + INP_INFO_WUNLOCK(&V_ripcbinfo); FREE(filter, M_PCB); return error; } inp = (struct inpcb *)so->so_pcb; - INP_INFO_WUNLOCK(&ripcbinfo); + INP_INFO_WUNLOCK(&V_ripcbinfo); inp->inp_vflag |= INP_IPV6; inp->in6p_ip6_nxt = (long)proto; inp->in6p_hops = -1; /* use kernel default */ @@ -576,6 +592,7 @@ static void rip6_detach(struct socket *so) { + INIT_VNET_INET(so->so_vnet); struct inpcb *inp; inp = sotoinpcb(so); @@ -584,7 +601,7 @@ if (so == ip6_mrouter && ip6_mrouter_done) ip6_mrouter_done(); /* xxx: RSVP */ - INP_INFO_WLOCK(&ripcbinfo); + INP_INFO_WLOCK(&V_ripcbinfo); INP_LOCK(inp); if (inp->in6p_icmp6filt) { FREE(inp->in6p_icmp6filt, M_PCB); @@ -592,7 +609,7 @@ } in6_pcbdetach(inp); in6_pcbfree(inp); - INP_INFO_WUNLOCK(&ripcbinfo); + INP_INFO_WUNLOCK(&V_ripcbinfo); } /* XXXRW: This can't ever be called. 
*/ @@ -633,6 +650,9 @@ static int rip6_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { + INIT_VNET_NET(so->so_vnet); + INIT_VNET_INET(so->so_vnet); + INIT_VNET_INET6(so->so_vnet); struct inpcb *inp = sotoinpcb(so); struct sockaddr_in6 *addr = (struct sockaddr_in6 *)nam; struct ifaddr *ia = NULL; @@ -641,9 +661,9 @@ KASSERT(inp != NULL, ("rip6_bind: inp == NULL")); if (nam->sa_len != sizeof(*addr)) return EINVAL; - if (TAILQ_EMPTY(&ifnet) || addr->sin6_family != AF_INET6) + if (TAILQ_EMPTY(&V_ifnet) || addr->sin6_family != AF_INET6) return EADDRNOTAVAIL; - if ((error = sa6_embedscope(addr, ip6_use_defzone)) != 0) + if ((error = sa6_embedscope(addr, V_ip6_use_defzone)) != 0) return(error); if (!IN6_IS_ADDR_UNSPECIFIED(&addr->sin6_addr) && @@ -655,17 +675,20 @@ IN6_IFF_DETACHED|IN6_IFF_DEPRECATED)) { return (EADDRNOTAVAIL); } - INP_INFO_WLOCK(&ripcbinfo); + INP_INFO_WLOCK(&V_ripcbinfo); INP_LOCK(inp); inp->in6p_laddr = addr->sin6_addr; INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&ripcbinfo); + INP_INFO_WUNLOCK(&V_ripcbinfo); return 0; } static int rip6_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { + INIT_VNET_NET(so->so_vnet); + INIT_VNET_INET(so->so_vnet); + INIT_VNET_INET6(so->so_vnet); struct inpcb *inp = sotoinpcb(so); struct sockaddr_in6 *addr = (struct sockaddr_in6 *)nam; struct in6_addr *in6a = NULL; @@ -675,7 +698,7 @@ KASSERT(inp != NULL, ("rip6_connect: inp == NULL")); if (nam->sa_len != sizeof(*addr)) return EINVAL; - if (TAILQ_EMPTY(&ifnet)) + if (TAILQ_EMPTY(&V_ifnet)) return EADDRNOTAVAIL; if (addr->sin6_family != AF_INET6) return EAFNOSUPPORT; @@ -688,12 +711,12 @@ * we'll see if we can determine the outgoing interface. If we * can, determine the zone ID based on the interface below. 
*/ - if (addr->sin6_scope_id == 0 && !ip6_use_defzone) + if (addr->sin6_scope_id == 0 && !V_ip6_use_defzone) scope_ambiguous = 1; - if ((error = sa6_embedscope(addr, ip6_use_defzone)) != 0) + if ((error = sa6_embedscope(addr, V_ip6_use_defzone)) != 0) return(error); - INP_INFO_WLOCK(&ripcbinfo); + INP_INFO_WLOCK(&V_ripcbinfo); INP_LOCK(inp); /* Source address selection. XXX: need pcblookup? */ in6a = in6_selectsrc(addr, inp->in6p_outputopts, @@ -701,7 +724,7 @@ &inp->in6p_laddr, &ifp, &error); if (in6a == NULL) { INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&ripcbinfo); + INP_INFO_WUNLOCK(&V_ripcbinfo); return (error ? error : EADDRNOTAVAIL); } @@ -709,14 +732,14 @@ if (ifp && scope_ambiguous && (error = in6_setscope(&addr->sin6_addr, ifp, NULL)) != 0) { INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&ripcbinfo); + INP_INFO_WUNLOCK(&V_ripcbinfo); return(error); } inp->in6p_faddr = addr->sin6_addr; inp->in6p_laddr = *in6a; soisconnected(so); INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&ripcbinfo); + INP_INFO_WUNLOCK(&V_ripcbinfo); return 0; } @@ -737,18 +760,19 @@ rip6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { + INIT_VNET_INET(so->so_vnet); struct inpcb *inp = sotoinpcb(so); struct sockaddr_in6 tmp; struct sockaddr_in6 *dst; int ret; KASSERT(inp != NULL, ("rip6_send: inp == NULL")); - INP_INFO_WLOCK(&ripcbinfo); + INP_INFO_WLOCK(&V_ripcbinfo); /* always copy sockaddr to avoid overwrites */ /* Unlocked read. */ if (so->so_state & SS_ISCONNECTED) { if (nam) { - INP_INFO_WUNLOCK(&ripcbinfo); + INP_INFO_WUNLOCK(&V_ripcbinfo); m_freem(m); return EISCONN; } @@ -761,12 +785,12 @@ dst = &tmp; } else { if (nam == NULL) { - INP_INFO_WUNLOCK(&ripcbinfo); + INP_INFO_WUNLOCK(&V_ripcbinfo); m_freem(m); return ENOTCONN; } if (nam->sa_len != sizeof(struct sockaddr_in6)) { - INP_INFO_WUNLOCK(&ripcbinfo); + INP_INFO_WUNLOCK(&V_ripcbinfo); m_freem(m); return(EINVAL); } @@ -783,13 +807,13 @@ "unspec. 
Assume AF_INET6\n"); dst->sin6_family = AF_INET6; } else if (dst->sin6_family != AF_INET6) { - INP_INFO_WUNLOCK(&ripcbinfo); + INP_INFO_WUNLOCK(&V_ripcbinfo); m_freem(m); return(EAFNOSUPPORT); } } ret = rip6_output(m, so, dst, control); - INP_INFO_WUNLOCK(&ripcbinfo); + INP_INFO_WUNLOCK(&V_ripcbinfo); return (ret); } --- /u/marko/p4/head/src/sys/netinet6/route6.c 2007-12-27 19:33:10.000000000 +0100 +++ src/sys/netinet6/route6.c 2008-01-14 19:23:59.000000000 +0100 @@ -34,16 +34,19 @@ #include "opt_inet.h" #include "opt_inet6.h" +#include "opt_vimage.h" #include #include #include #include #include +#include #include #include +#include #include #include #include @@ -64,6 +67,7 @@ int route6_input(struct mbuf **mp, int *offp, int proto) { + INIT_VNET_INET6(curvnet); struct ip6_hdr *ip6; struct mbuf *m = *mp; struct ip6_rthdr *rh; @@ -74,7 +78,7 @@ if (ip6a) { /* XXX reject home-address option before rthdr */ if (ip6a->ip6a_flags & IP6A_SWAP) { - ip6stat.ip6s_badoptions++; + V_ip6stat.ip6s_badoptions++; m_freem(m); return IPPROTO_DONE; } @@ -88,7 +92,7 @@ ip6 = mtod(m, struct ip6_hdr *); IP6_EXTHDR_GET(rh, struct ip6_rthdr *, m, off, sizeof(*rh)); if (rh == NULL) { - ip6stat.ip6s_tooshort++; + V_ip6stat.ip6s_tooshort++; return IPPROTO_DONE; } #endif @@ -115,7 +119,7 @@ */ IP6_EXTHDR_GET(rh, struct ip6_rthdr *, m, off, rhlen); if (rh == NULL) { - ip6stat.ip6s_tooshort++; + V_ip6stat.ip6s_tooshort++; return IPPROTO_DONE; } #endif @@ -129,7 +133,7 @@ rhlen = (rh->ip6r_len + 1) << 3; break; /* Final dst. Just ignore the header. 
*/ } - ip6stat.ip6s_badoptions++; + V_ip6stat.ip6s_badoptions++; icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, (caddr_t)&rh->ip6r_type - (caddr_t)ip6); return (IPPROTO_DONE); @@ -149,6 +153,7 @@ static int ip6_rthdr0(struct mbuf *m, struct ip6_hdr *ip6, struct ip6_rthdr0 *rh0) { + INIT_VNET_INET6(curvnet); int addrs, index; struct in6_addr *nextaddr, tmpaddr; struct in6_ifaddr *ifa; @@ -166,14 +171,14 @@ * RFC 2462: this limitation was removed since strict/loose * bitmap field was deleted. */ - ip6stat.ip6s_badoptions++; + V_ip6stat.ip6s_badoptions++; icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, (caddr_t)&rh0->ip6r0_len - (caddr_t)ip6); return (-1); } if ((addrs = rh0->ip6r0_len / 2) < rh0->ip6r0_segleft) { - ip6stat.ip6s_badoptions++; + V_ip6stat.ip6s_badoptions++; icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, (caddr_t)&rh0->ip6r0_segleft - (caddr_t)ip6); return (-1); @@ -192,7 +197,7 @@ IN6_IS_ADDR_UNSPECIFIED(nextaddr) || IN6_IS_ADDR_V4MAPPED(nextaddr) || IN6_IS_ADDR_V4COMPAT(nextaddr)) { - ip6stat.ip6s_badoptions++; + V_ip6stat.ip6s_badoptions++; m_freem(m); return (-1); } @@ -200,7 +205,7 @@ IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_dst) || IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst) || IN6_IS_ADDR_V4COMPAT(&ip6->ip6_dst)) { - ip6stat.ip6s_badoptions++; + V_ip6stat.ip6s_badoptions++; m_freem(m); return (-1); } @@ -213,7 +218,7 @@ if ((ifa = ip6_getdstifaddr(m)) == NULL) goto bad; if (in6_setscope(nextaddr, ifa->ia_ifp, NULL) != 0) { - ip6stat.ip6s_badscope++; + V_ip6stat.ip6s_badscope++; goto bad; } --- /u/marko/p4/head/src/sys/netinet6/scope6.c 2007-12-27 19:33:11.000000000 +0100 +++ src/sys/netinet6/scope6.c 2008-01-14 19:23:59.000000000 +0100 @@ -32,6 +32,8 @@ #include __FBSDID("$FreeBSD: src/sys/netinet6/scope6.c,v 1.18 2007/12/10 16:03:39 obrien Exp $"); +#include "opt_vimage.h" + #include #include #include @@ -39,20 +41,25 @@ #include #include #include +#include +#include #include #include #include +#include #include #include +#ifndef 
VIMAGE #ifdef ENABLE_DEFAULT_SCOPE int ip6_use_defzone = 1; #else int ip6_use_defzone = 0; #endif +#endif /* !VIMAGE */ /* * The scope6_lock protects the global sid default stored in @@ -64,16 +71,32 @@ #define SCOPE6_UNLOCK() mtx_unlock(&scope6_lock) #define SCOPE6_LOCK_ASSERT() mtx_assert(&scope6_lock, MA_OWNED) +#ifndef VIMAGE static struct scope6_id sid_default; +#endif + #define SID(ifp) \ (((struct in6_ifextra *)(ifp)->if_afdata[AF_INET6])->scope6_id) void scope6_init(void) { + INIT_VNET_INET6(curvnet); + +#ifdef ENABLE_DEFAULT_SCOPE + V_ip6_use_defzone = 1; +#else + V_ip6_use_defzone = 0; +#endif + + bzero(&V_sid_default, sizeof(V_sid_default)); + +#ifdef VIMAGE + if (!IS_DEFAULT_VNET(curvnet)) + return; +#endif SCOPE6_LOCK_INIT(); - bzero(&sid_default, sizeof(sid_default)); } struct scope6_id * @@ -109,6 +132,7 @@ int scope6_set(struct ifnet *ifp, struct scope6_id *idlist) { + INIT_VNET_NET(ifp->if_vnet); int i; int error = 0; struct scope6_id *sid = NULL; @@ -147,7 +171,7 @@ } if (i == IPV6_ADDR_SCOPE_LINKLOCAL && - idlist->s6id_list[i] > if_index) { + idlist->s6id_list[i] > V_if_index) { /* * XXX: theoretically, there should be no * relationship between link IDs and interface @@ -263,6 +287,8 @@ void scope6_setdefault(struct ifnet *ifp) { + INIT_VNET_INET6(curvnet); /* ifp may be NULL */ + /* * Currently, this function just sets the default "interfaces" * and "links" according to the given interface. 
@@ -271,13 +297,13 @@ */ SCOPE6_LOCK(); if (ifp) { - sid_default.s6id_list[IPV6_ADDR_SCOPE_INTFACELOCAL] = + V_sid_default.s6id_list[IPV6_ADDR_SCOPE_INTFACELOCAL] = ifp->if_index; - sid_default.s6id_list[IPV6_ADDR_SCOPE_LINKLOCAL] = + V_sid_default.s6id_list[IPV6_ADDR_SCOPE_LINKLOCAL] = ifp->if_index; } else { - sid_default.s6id_list[IPV6_ADDR_SCOPE_INTFACELOCAL] = 0; - sid_default.s6id_list[IPV6_ADDR_SCOPE_LINKLOCAL] = 0; + V_sid_default.s6id_list[IPV6_ADDR_SCOPE_INTFACELOCAL] = 0; + V_sid_default.s6id_list[IPV6_ADDR_SCOPE_LINKLOCAL] = 0; } SCOPE6_UNLOCK(); } @@ -285,9 +311,10 @@ int scope6_get_default(struct scope6_id *idlist) { + INIT_VNET_INET6(curvnet); SCOPE6_LOCK(); - *idlist = sid_default; + *idlist = V_sid_default; SCOPE6_UNLOCK(); return (0); @@ -296,6 +323,7 @@ u_int32_t scope6_addr2default(struct in6_addr *addr) { + INIT_VNET_INET6(curvnet); u_int32_t id; /* @@ -310,7 +338,7 @@ * not to lock here? */ SCOPE6_LOCK(); - id = sid_default.s6id_list[in6_addrscope(addr)]; + id = V_sid_default.s6id_list[in6_addrscope(addr)]; SCOPE6_UNLOCK(); return (id); } @@ -326,6 +354,7 @@ int sa6_embedscope(struct sockaddr_in6 *sin6, int defaultok) { + INIT_VNET_NET(curvnet); struct ifnet *ifp; u_int32_t zoneid; @@ -341,7 +370,7 @@ * zone IDs assuming a one-to-one mapping between interfaces * and links. 
*/ - if (if_index < zoneid) + if (V_if_index < zoneid) return (ENXIO); ifp = ifnet_byindex(zoneid); if (ifp == NULL) /* XXX: this can happen for some OS */ @@ -362,6 +391,7 @@ int sa6_recoverscope(struct sockaddr_in6 *sin6) { + INIT_VNET_NET(curvnet); char ip6buf[INET6_ADDRSTRLEN]; u_int32_t zoneid; @@ -379,7 +409,7 @@ zoneid = ntohs(sin6->sin6_addr.s6_addr16[1]); if (zoneid) { /* sanity check */ - if (zoneid < 0 || if_index < zoneid) + if (zoneid < 0 || V_if_index < zoneid) return (ENXIO); if (!ifnet_byindex(zoneid)) return (ENXIO); --- /u/marko/p4/head/src/sys/netinet6/sctp6_usrreq.c 2007-12-27 19:33:12.000000000 +0100 +++ src/sys/netinet6/sctp6_usrreq.c 2008-01-14 19:23:59.000000000 +0100 @@ -32,13 +32,17 @@ #include __FBSDID("$FreeBSD: src/sys/netinet6/sctp6_usrreq.c,v 1.42 2007/12/10 16:03:39 obrien Exp $"); +#include "opt_vimage.h" + #include #include +#include #include #include #include #if defined(INET6) #include +#include #endif #include #include @@ -773,6 +777,7 @@ sctp6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *p) { + INIT_VNET_INET6(curvnet); struct sctp_inpcb *inp; struct inpcb *in_inp; struct in6pcb *inp6; @@ -829,7 +834,7 @@ } } if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { - if (!ip6_v6only) { + if (!V_ip6_v6only) { struct sockaddr_in sin; /* convert v4-mapped into v4 addr and send */ @@ -885,6 +890,7 @@ static int sctp6_connect(struct socket *so, struct sockaddr *addr, struct thread *p) { + INIT_VNET_INET6(curvnet); uint32_t vrf_id; int error = 0; struct sctp_inpcb *inp; @@ -960,7 +966,7 @@ } } if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { - if (!ip6_v6only) { + if (!V_ip6_v6only) { /* convert v4-mapped into v4 addr */ in6_sin6_2_sin((struct sockaddr_in *)&ss, sin6); addr = (struct sockaddr *)&ss; --- /u/marko/p4/head/src/sys/netinet6/udp6_usrreq.c 2008-01-28 23:53:57.000000000 +0100 +++ src/sys/netinet6/udp6_usrreq.c 2008-02-27 11:49:56.000000000 +0100 @@ -69,6 +69,7 @@ #include 
"opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" +#include "opt_vimage.h" #include #include @@ -84,11 +85,14 @@ #include #include #include +#include +#include #include #include #include +#include #include #include #include @@ -101,6 +105,7 @@ #include #include #include +#include #include #include #include @@ -110,6 +115,7 @@ #ifdef IPSEC #include #include +#include #endif /* IPSEC */ #include @@ -126,6 +132,7 @@ udp6_append(struct inpcb *inp, struct mbuf *n, int off, struct sockaddr_in6 *fromsa) { + INIT_VNET_INET(inp->inp_vnet); struct socket *so; struct mbuf *opts; @@ -134,8 +141,9 @@ #ifdef IPSEC /* Check AH/ESP integrity. */ if (ipsec6_in_reject(n, inp)) { + INIT_VNET_IPSEC(inp->inp_vnet); m_freem(n); - ipsec6stat.in_polvio++; + V_ipsec6stat.in_polvio++; return; } #endif /* IPSEC */ @@ -159,7 +167,7 @@ m_freem(n); if (opts) m_freem(opts); - udpstat.udps_fullsock++; + V_udpstat.udps_fullsock++; } else sorwakeup_locked(so); } @@ -167,6 +175,8 @@ int udp6_input(struct mbuf **mp, int *offp, int proto) { + INIT_VNET_INET(curvnet); + INIT_VNET_INET6(curvnet); struct mbuf *m = *mp; struct ip6_hdr *ip6; struct udphdr *uh; @@ -193,7 +203,7 @@ return (IPPROTO_DONE); #endif - udpstat.udps_ipackets++; + V_udpstat.udps_ipackets++; /* * Destination port of 0 is illegal, based on RFC768. @@ -205,7 +215,7 @@ ulen = ntohs((u_short)uh->uh_ulen); if (plen != ulen) { - udpstat.udps_badlen++; + V_udpstat.udps_badlen++; goto badunlocked; } @@ -213,11 +223,11 @@ * Checksum extended UDP header and data. */ if (uh->uh_sum == 0) { - udpstat.udps_nosum++; + V_udpstat.udps_nosum++; goto badunlocked; } if (in6_cksum(m, IPPROTO_UDP, off, ulen) != 0) { - udpstat.udps_badsum++; + V_udpstat.udps_badsum++; goto badunlocked; } @@ -227,7 +237,7 @@ init_sin6(&fromsa, m); fromsa.sin6_port = uh->uh_sport; - INP_INFO_RLOCK(&udbinfo); + INP_INFO_RLOCK(&V_udbinfo); if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { struct inpcb *last; @@ -247,7 +257,7 @@ * later. 
*/ last = NULL; - LIST_FOREACH(inp, &udb, inp_list) { + LIST_FOREACH(inp, &V_udb, inp_list) { if ((inp->inp_vflag & INP_IPV6) == 0) continue; if (inp->in6p_lport != uh->uh_dport) @@ -302,20 +312,20 @@ * to send an ICMP Port Unreachable for a broadcast * or multicast datgram.) */ - udpstat.udps_noport++; - udpstat.udps_noportmcast++; + V_udpstat.udps_noport++; + V_udpstat.udps_noportmcast++; goto badheadlocked; } INP_LOCK(last); udp6_append(last, m, off, &fromsa); INP_UNLOCK(last); - INP_INFO_RUNLOCK(&udbinfo); + INP_INFO_RUNLOCK(&V_udbinfo); return (IPPROTO_DONE); } /* * Locate pcb for datagram. */ - inp = in6_pcblookup_hash(&udbinfo, &ip6->ip6_src, uh->uh_sport, + inp = in6_pcblookup_hash(&V_udbinfo, &ip6->ip6_src, uh->uh_sport, &ip6->ip6_dst, uh->uh_dport, 1, m->m_pkthdr.rcvif); if (inp == NULL) { if (udp_log_in_vain) { @@ -329,13 +339,13 @@ ip6_sprintf(ip6bufs, &ip6->ip6_src), ntohs(uh->uh_sport)); } - udpstat.udps_noport++; + V_udpstat.udps_noport++; if (m->m_flags & M_MCAST) { printf("UDP6: M_MCAST is set in a unicast packet.\n"); - udpstat.udps_noportmcast++; + V_udpstat.udps_noportmcast++; goto badheadlocked; } - INP_INFO_RUNLOCK(&udbinfo); + INP_INFO_RUNLOCK(&V_udbinfo); if (udp_blackhole) goto badunlocked; if (badport_bandlim(BANDLIM_ICMP6_UNREACH) < 0) @@ -346,11 +356,11 @@ INP_LOCK(inp); udp6_append(inp, m, off, &fromsa); INP_UNLOCK(inp); - INP_INFO_RUNLOCK(&udbinfo); + INP_INFO_RUNLOCK(&V_udbinfo); return (IPPROTO_DONE); badheadlocked: - INP_INFO_RUNLOCK(&udbinfo); + INP_INFO_RUNLOCK(&V_udbinfo); badunlocked: if (m) m_freem(m); @@ -360,6 +370,7 @@ void udp6_ctlinput(int cmd, struct sockaddr *sa, void *d) { + INIT_VNET_INET(curvnet); struct udphdr uh; struct ip6_hdr *ip6; struct mbuf *m; @@ -414,17 +425,19 @@ bzero(&uh, sizeof(uh)); m_copydata(m, off, sizeof(*uhp), (caddr_t)&uh); - (void) in6_pcbnotify(&udbinfo, sa, uh.uh_dport, + (void) in6_pcbnotify(&V_udbinfo, sa, uh.uh_dport, (struct sockaddr *)ip6cp->ip6c_src, uh.uh_sport, cmd, cmdarg, notify); } 
else - (void) in6_pcbnotify(&udbinfo, sa, 0, + (void) in6_pcbnotify(&V_udbinfo, sa, 0, (const struct sockaddr *)sa6_src, 0, cmd, cmdarg, notify); } static int udp6_getcred(SYSCTL_HANDLER_ARGS) { + INIT_VNET_INET(curvnet); + INIT_VNET_INET6(curvnet); struct xucred xuc; struct sockaddr_in6 addrs[2]; struct inpcb *inp; @@ -441,16 +454,16 @@ error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); - if ((error = sa6_embedscope(&addrs[0], ip6_use_defzone)) != 0 || - (error = sa6_embedscope(&addrs[1], ip6_use_defzone)) != 0) { + if ((error = sa6_embedscope(&addrs[0], V_ip6_use_defzone)) != 0 || + (error = sa6_embedscope(&addrs[1], V_ip6_use_defzone)) != 0) { return (error); } - INP_INFO_RLOCK(&udbinfo); - inp = in6_pcblookup_hash(&udbinfo, &addrs[1].sin6_addr, + INP_INFO_RLOCK(&V_udbinfo); + inp = in6_pcblookup_hash(&V_udbinfo, &addrs[1].sin6_addr, addrs[1].sin6_port, &addrs[0].sin6_addr, addrs[0].sin6_port, 1, NULL); if (inp == NULL) { - INP_INFO_RUNLOCK(&udbinfo); + INP_INFO_RUNLOCK(&V_udbinfo); return (ENOENT); } INP_LOCK(inp); @@ -464,7 +477,7 @@ cru2x(inp->inp_socket->so_cred, &xuc); out: INP_UNLOCK(inp); - INP_INFO_RUNLOCK(&udbinfo); + INP_INFO_RUNLOCK(&V_udbinfo); if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); @@ -477,6 +490,8 @@ udp6_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr6, struct mbuf *control, struct thread *td) { + INIT_VNET_INET(curvnet); + INIT_VNET_INET6(curvnet); u_int32_t ulen = m->m_pkthdr.len; u_int32_t plen = sizeof(struct udphdr) + ulen; struct ip6_hdr *ip6; @@ -510,9 +525,9 @@ * we'll see if we can determine the outgoing interface. If we * can, determine the zone ID based on the interface below. 
*/ - if (sin6->sin6_scope_id == 0 && !ip6_use_defzone) + if (sin6->sin6_scope_id == 0 && !V_ip6_use_defzone) scope_ambiguous = 1; - if ((error = sa6_embedscope(sin6, ip6_use_defzone)) != 0) + if ((error = sa6_embedscope(sin6, V_ip6_use_defzone)) != 0) return (error); } @@ -668,7 +683,7 @@ flags = 0; - udpstat.udps_opackets++; + V_udpstat.udps_opackets++; error = ip6_output(m, optp, NULL, flags, inp->in6p_moptions, NULL, inp); break; @@ -692,6 +707,7 @@ static void udp6_abort(struct socket *so) { + INIT_VNET_INET(so->so_vnet); struct inpcb *inp; inp = sotoinpcb(so); @@ -707,7 +723,7 @@ } #endif - INP_INFO_WLOCK(&udbinfo); + INP_INFO_WLOCK(&V_udbinfo); INP_LOCK(inp); if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { in6_pcbdisconnect(inp); @@ -715,12 +731,13 @@ soisdisconnected(so); } INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&udbinfo); + INP_INFO_WUNLOCK(&V_udbinfo); } static int udp6_attach(struct socket *so, int proto, struct thread *td) { + INIT_VNET_INET(so->so_vnet); struct inpcb *inp; int error; @@ -732,14 +749,14 @@ if (error) return (error); } - INP_INFO_WLOCK(&udbinfo); - error = in_pcballoc(so, &udbinfo); + INP_INFO_WLOCK(&V_udbinfo); + error = in_pcballoc(so, &V_udbinfo); if (error) { - INP_INFO_WUNLOCK(&udbinfo); + INP_INFO_WUNLOCK(&V_udbinfo); return (error); } inp = (struct inpcb *)so->so_pcb; - INP_INFO_WUNLOCK(&udbinfo); + INP_INFO_WUNLOCK(&V_udbinfo); inp->inp_vflag |= INP_IPV6; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) inp->inp_vflag |= INP_IPV4; @@ -751,7 +768,7 @@ * because the socket may be bound to an IPv6 wildcard address, * which may match an IPv4-mapped IPv6 address. 
*/ - inp->inp_ip_ttl = ip_defttl; + inp->inp_ip_ttl = V_ip_defttl; INP_UNLOCK(inp); return (0); } @@ -759,13 +776,14 @@ static int udp6_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { + INIT_VNET_INET(so->so_vnet); struct inpcb *inp; int error; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_bind: inp == NULL")); - INP_INFO_WLOCK(&udbinfo); + INP_INFO_WLOCK(&V_udbinfo); INP_LOCK(inp); inp->inp_vflag &= ~INP_IPV4; inp->inp_vflag |= INP_IPV6; @@ -791,13 +809,14 @@ error = in6_pcbbind(inp, nam, td->td_ucred); out: INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&udbinfo); + INP_INFO_WUNLOCK(&V_udbinfo); return (error); } static void udp6_close(struct socket *so) { + INIT_VNET_INET(so->so_vnet); struct inpcb *inp; inp = sotoinpcb(so); @@ -812,7 +831,7 @@ return; } #endif - INP_INFO_WLOCK(&udbinfo); + INP_INFO_WLOCK(&V_udbinfo); INP_LOCK(inp); if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { in6_pcbdisconnect(inp); @@ -820,19 +839,20 @@ soisdisconnected(so); } INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&udbinfo); + INP_INFO_WUNLOCK(&V_udbinfo); } static int udp6_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { + INIT_VNET_INET(so->so_vnet); struct inpcb *inp; int error; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_connect: inp == NULL")); - INP_INFO_WLOCK(&udbinfo); + INP_INFO_WLOCK(&V_udbinfo); INP_LOCK(inp); if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { struct sockaddr_in6 *sin6_p; @@ -871,35 +891,37 @@ } out: INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&udbinfo); + INP_INFO_WUNLOCK(&V_udbinfo); return (error); } static void udp6_detach(struct socket *so) { + INIT_VNET_INET(so->so_vnet); struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_detach: inp == NULL")); - INP_INFO_WLOCK(&udbinfo); + INP_INFO_WLOCK(&V_udbinfo); INP_LOCK(inp); in6_pcbdetach(inp); in6_pcbfree(inp); - INP_INFO_WUNLOCK(&udbinfo); + INP_INFO_WUNLOCK(&V_udbinfo); } static int udp6_disconnect(struct socket *so) { + INIT_VNET_INET(so->so_vnet); struct inpcb *inp; 
int error; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_disconnect: inp == NULL")); - INP_INFO_WLOCK(&udbinfo); + INP_INFO_WLOCK(&V_udbinfo); INP_LOCK(inp); #ifdef INET @@ -923,7 +945,7 @@ so->so_state &= ~SS_ISCONNECTED; /* XXX */ out: INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&udbinfo); + INP_INFO_WUNLOCK(&V_udbinfo); return (0); } @@ -931,13 +953,14 @@ udp6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { + INIT_VNET_INET(so->so_vnet); struct inpcb *inp; int error = 0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_send: inp == NULL")); - INP_INFO_WLOCK(&udbinfo); + INP_INFO_WLOCK(&V_udbinfo); INP_LOCK(inp); if (addr) { if (addr->sa_len != sizeof(struct sockaddr_in6)) { @@ -992,12 +1015,12 @@ error = udp6_output(inp, m, addr, control, td); out: INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&udbinfo); + INP_INFO_WUNLOCK(&V_udbinfo); return (error); bad: INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&udbinfo); + INP_INFO_WUNLOCK(&V_udbinfo); m_freem(m); return (error); } --- /dev/null 2008-02-27 21:11:00.000000000 +0100 +++ src/sys/netinet6/vinet6.h 2007-10-05 12:27:25.000000000 +0200 @@ -0,0 +1,270 @@ +/*- + * Copyright (c) 2006 University of Zagreb + * Copyright (c) 2006 FreeBSD Foundation + * + * This software was developed by the University of Zagreb and the + * FreeBSD Foundation under sponsorship by the Stichting NLnet and the + * FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * XXX RCS tag goes here + */ + +#ifndef _NETINET6_VINET6_H_ +#define _NETINET6_VINET6_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define INIT_VNET_INET6(vnet) \ + INIT_FROM_VNET(vnet, VNET_MOD_INET6, \ + struct vnet_inet6, vnet_inet6) + +#define VNET_INET6(sym) VSYM(vnet_inet6, sym) + + +#ifdef VIMAGE +struct vnet_inet6 { + struct in6_ifaddr * _in6_ifaddr; + + u_int _frag6_nfragpackets; + u_int _frag6_nfrags; + struct ip6q _ip6q; + + struct route_in6 _ip6_forward_rt; + + struct in6_addrpolicy _defaultaddrpolicy; + TAILQ_HEAD(, addrsel_policyent) _addrsel_policytab; + u_int _in6_maxmtu; + int _ip6_auto_linklocal; + + struct ip6stat _ip6stat; + struct rip6stat _rip6stat; + struct icmp6stat _icmp6stat; + + int _rtq_timeout6; + struct callout _rtq_timer6; + struct callout _rtq_mtutimer; + struct callout _nd6_slowtimo_ch; + struct callout _nd6_timer_ch; + struct callout _in6_tmpaddrtimer_ch; + + int _nd6_inuse; + int _nd6_allocated; + struct llinfo_nd6 _llinfo_nd6; + struct nd_drhead _nd_defrouter; + struct nd_prhead _nd_prefix; + struct ifnet * _nd6_defifp; + int _nd6_defifindex; + + struct scope6_id _sid_default; + + 
TAILQ_HEAD(, dadq) _dadq; + int _dad_init; + + int _icmp6errpps_count; + //int _icmp6errppslim_last; + //int _icmp6_nodeinfo; + + int _ip6_forwarding; + int _ip6_sendredirects; + int _ip6_defhlim; + int _ip6_defmcasthlim; + int _ip6_accept_rtadv; + int _ip6_maxfragpackets; + int _ip6_maxfrags; + int _ip6_log_interval; + int _ip6_hdrnestlimit; + int _ip6_dad_count; + int _ip6_auto_flowlabel; + int _ip6_use_deprecated; + int _ip6_rr_prune; + int _ip6_mcast_pmtu; + int _ip6_v6only; + int _ip6_keepfaith; + int _ip6stealth; + time_t _ip6_log_time; + + int _pmtu_expire; + int _pmtu_probe; + u_long _rip6_sendspace; + u_long _rip6_recvspace; + int _icmp6_rediraccept; + int _icmp6_redirtimeout; + int _icmp6errppslim; + int _icmp6_nodeinfo; + int _udp6_sendspace; + int _udp6_recvspace; + int _ip6qmaxlen; + int _ip6_prefer_tempaddr; + int _ip6_forward_srcrt; + int _ip6_sourcecheck; + int _ip6_sourcecheck_interval; + int _ip6_ours_check_algorithm; + + int _nd6_prune; + int _nd6_delay; + int _nd6_umaxtries; + int _nd6_mmaxtries; + int _nd6_useloopback; + int _nd6_gctimer; + int _nd6_maxndopt; + int _nd6_maxnudhint; + int _nd6_maxqueuelen; + int _nd6_debug; + int _nd6_recalc_reachtm_interval; + int _dad_ignore_ns; + int _dad_maxtry; + int _ip6_use_tempaddr; + int _ip6_desync_factor; + u_int32_t _ip6_temp_preferred_lifetime; + u_int32_t _ip6_temp_valid_lifetime; + + int _ip6_mrouter_ver; + int _pim6; + u_int _mrt6debug; + + int _ip6_temp_regen_advance; + int _ip6_use_defzone; + + struct ip6_pktopts _ip6_opts; +}; +#endif + + +/* + * Symbol translation macros + */ +#define V_in6_ifaddr VNET_INET6(in6_ifaddr) + +#define V_frag6_nfragpackets VNET_INET6(frag6_nfragpackets) +#define V_frag6_nfrags VNET_INET6(frag6_nfrags) +#define V_ip6q VNET_INET6(ip6q) + +#define V_ip6_forward_rt VNET_INET6(ip6_forward_rt) + +#define V_defaultaddrpolicy VNET_INET6(defaultaddrpolicy) +#define V_addrsel_policytab VNET_INET6(addrsel_policytab) +#define V_in6_maxmtu VNET_INET6(in6_maxmtu) +#define 
V_ip6_auto_linklocal VNET_INET6(ip6_auto_linklocal) + +#define V_ip6stat VNET_INET6(ip6stat) +#define V_rip6stat VNET_INET6(rip6stat) +#define V_icmp6stat VNET_INET6(icmp6stat) + +#define V_rtq_timeout6 VNET_INET6(rtq_timeout6) +#define V_rtq_timer6 VNET_INET6(rtq_timer6) +#define V_rtq_mtutimer VNET_INET6(rtq_mtutimer) +#define V_nd6_slowtimo_ch VNET_INET6(nd6_slowtimo_ch) +#define V_nd6_timer_ch VNET_INET6(nd6_timer_ch) +#define V_in6_tmpaddrtimer_ch VNET_INET6(in6_tmpaddrtimer_ch) + +#define V_nd6_inuse VNET_INET6(nd6_inuse) +#define V_nd6_allocated VNET_INET6(nd6_allocated) +#define V_llinfo_nd6 VNET_INET6(llinfo_nd6) +#define V_nd_defrouter VNET_INET6(nd_defrouter) +#define V_nd_prefix VNET_INET6(nd_prefix) +#define V_nd6_defifp VNET_INET6(nd6_defifp) +#define V_nd6_defifindex VNET_INET6(nd6_defifindex) + +#define V_sid_default VNET_INET6(sid_default) + +#define V_dadq VNET_INET6(dadq) +#define V_dad_init VNET_INET6(dad_init) + +//#define V_icmp6errppslim VNET_INET6(icmp6errppslim) +#define V_icmp6errpps_count VNET_INET6(icmp6errpps_count) +//#define V_icmp6_nodeinfo VNET_INET6(icmp6_nodeinfo) + +#define V_ip6_forwarding VNET_INET6(ip6_forwarding) +#define V_ip6_sendredirects VNET_INET6(ip6_sendredirects) +#define V_ip6_defhlim VNET_INET6(ip6_defhlim) +#define V_ip6_defmcasthlim VNET_INET6(ip6_defmcasthlim) +#define V_ip6_accept_rtadv VNET_INET6(ip6_accept_rtadv) +#define V_ip6_maxfragpackets VNET_INET6(ip6_maxfragpackets) +#define V_ip6_maxfrags VNET_INET6(ip6_maxfrags) +#define V_ip6_log_interval VNET_INET6(ip6_log_interval) +#define V_ip6_hdrnestlimit VNET_INET6(ip6_hdrnestlimit) +#define V_ip6_dad_count VNET_INET6(ip6_dad_count) +#define V_ip6_auto_flowlabel VNET_INET6(ip6_auto_flowlabel) +#define V_ip6_use_deprecated VNET_INET6(ip6_use_deprecated) +#define V_ip6_rr_prune VNET_INET6(ip6_rr_prune) +#define V_ip6_mcast_pmtu VNET_INET6(ip6_mcast_pmtu) +#define V_ip6_v6only VNET_INET6(ip6_v6only) +#define V_ip6_keepfaith VNET_INET6(ip6_keepfaith) +#define 
V_ip6stealth VNET_INET6(ip6stealth) +#define V_ip6_log_time VNET_INET6(ip6_log_time) + +#define V_pmtu_expire VNET_INET6(pmtu_expire) +#define V_pmtu_probe VNET_INET6(pmtu_probe) +#define V_rip6_sendspace VNET_INET6(rip6_sendspace) +#define V_rip6_recvspace VNET_INET6(rip6_recvspace) +#define V_icmp6_rediraccept VNET_INET6(icmp6_rediraccept) +#define V_icmp6_redirtimeout VNET_INET6(icmp6_redirtimeout) +#define V_icmp6errppslim VNET_INET6(icmp6errppslim) +#define V_icmp6_nodeinfo VNET_INET6(icmp6_nodeinfo) +#define V_udp6_sendspace VNET_INET6(udp6_sendspace) +#define V_udp6_recvspace VNET_INET6(udp6_recvspace) +//#define V_icmp6errppslim_last VNET_INET6(icmp6errppslim_last) +#define V_ip6_prefer_tempaddr VNET_INET6(ip6_prefer_tempaddr) +#define V_ip6qmaxlen VNET_INET6(ip6qmaxlen) +#define V_ip6_forward_srcrt VNET_INET6(ip6_forward_srcrt) +#define V_ip6_sourcecheck VNET_INET6(ip6_sourcecheck) +#define V_ip6_sourcecheck_interval VNET_INET6(ip6_sourcecheck_interval) +#define V_ip6_ours_check_algorithm VNET_INET6(ip6_ours_check_algorithm) +#define V_nd6_prune VNET_INET6(nd6_prune) +#define V_nd6_delay VNET_INET6(nd6_delay) +#define V_nd6_umaxtries VNET_INET6(nd6_umaxtries) +#define V_nd6_mmaxtries VNET_INET6(nd6_mmaxtries) +#define V_nd6_useloopback VNET_INET6(nd6_useloopback) +#define V_nd6_gctimer VNET_INET6(nd6_gctimer) +#define V_nd6_maxndopt VNET_INET6(nd6_maxndopt) +#define V_nd6_maxnudhint VNET_INET6(nd6_maxnudhint) +#define V_nd6_maxqueuelen VNET_INET6(nd6_maxqueuelen) +#define V_nd6_debug VNET_INET6(nd6_debug) +#define V_nd6_recalc_reachtm_interval VNET_INET6(nd6_recalc_reachtm_interval) +#define V_dad_ignore_ns VNET_INET6(dad_ignore_ns) +#define V_dad_maxtry VNET_INET6(dad_maxtry) +#define V_ip6_use_tempaddr VNET_INET6(ip6_use_tempaddr) +#define V_ip6_desync_factor VNET_INET6(ip6_desync_factor) +#define V_ip6_temp_preferred_lifetime VNET_INET6(ip6_temp_preferred_lifetime) +#define V_ip6_temp_valid_lifetime VNET_INET6(ip6_temp_valid_lifetime) + +#define 
V_ip6_mrouter_ver VNET_INET6(ip6_mrouter_ver) +#define V_pim6 VNET_INET6(pim6) +#define V_mrt6debug VNET_INET6(mrt6debug) +#define V_ip6_temp_regen_advance VNET_INET6(ip6_temp_regen_advance) + +#define V_ip6_use_defzone VNET_INET6(ip6_use_defzone) + +#define V_ip6_opts VNET_INET6(ip6_opts) + +#endif /* !_NETINET6_VINET6_H_ */ --- /u/marko/p4/head/src/sys/netipsec/ipsec.c 2008-02-03 08:16:02.000000000 +0100 +++ src/sys/netipsec/ipsec.c 2008-02-27 11:49:58.000000000 +0100 @@ -37,6 +37,7 @@ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" +#include "opt_vimage.h" #include #include @@ -53,6 +54,7 @@ #include #include #include +#include #include #include @@ -66,6 +68,7 @@ #include #include #include +#include #include #ifdef INET6 @@ -76,6 +79,8 @@ #include #endif +#include + #include #include #ifdef INET6 @@ -91,18 +96,27 @@ #include #include +#include #include #include +#ifndef VIMAGE #ifdef IPSEC_DEBUG int ipsec_debug = 1; #else int ipsec_debug = 0; #endif +#endif + +static int vnet_ipsec_iattach(const void *); +#ifdef VIMAGE +static int vnet_ipsec_idetach(const void *); +#endif /* NB: name changed so netstat doesn't use it */ +#ifndef VIMAGE struct ipsecstat ipsec4stat; int ip4_ah_offsetmask = 0; /* maybe IP_DF? */ int ip4_ipsec_dfbit = 0; /* DF bit on encap. 
0: clear 1: set 2: copy */ @@ -112,7 +126,7 @@ int ip4_ah_net_deflev = IPSEC_LEVEL_USE; struct secpolicy ip4_def_policy; int ip4_ipsec_ecn = 0; /* ECN ignore(-1)/forbidden(0)/allowed(1) */ -int ip4_esp_randpad = -1; +int ip4_esp_randpad; /* * Crypto support requirements: * @@ -121,63 +135,72 @@ * 0 take anything */ int crypto_support = CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE; +#endif SYSCTL_DECL(_net_inet_ipsec); /* net.inet.ipsec */ -SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEF_POLICY, - def_policy, CTLFLAG_RW, &ip4_def_policy.policy, 0, ""); -SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEF_ESP_TRANSLEV, esp_trans_deflev, - CTLFLAG_RW, &ip4_esp_trans_deflev, 0, ""); -SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEF_ESP_NETLEV, esp_net_deflev, - CTLFLAG_RW, &ip4_esp_net_deflev, 0, ""); -SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEF_AH_TRANSLEV, ah_trans_deflev, - CTLFLAG_RW, &ip4_ah_trans_deflev, 0, ""); -SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEF_AH_NETLEV, ah_net_deflev, - CTLFLAG_RW, &ip4_ah_net_deflev, 0, ""); -SYSCTL_INT(_net_inet_ipsec, IPSECCTL_AH_CLEARTOS, - ah_cleartos, CTLFLAG_RW, &ah_cleartos, 0, ""); -SYSCTL_INT(_net_inet_ipsec, IPSECCTL_AH_OFFSETMASK, - ah_offsetmask, CTLFLAG_RW, &ip4_ah_offsetmask, 0, ""); -SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DFBIT, - dfbit, CTLFLAG_RW, &ip4_ipsec_dfbit, 0, ""); -SYSCTL_INT(_net_inet_ipsec, IPSECCTL_ECN, - ecn, CTLFLAG_RW, &ip4_ipsec_ecn, 0, ""); -SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEBUG, - debug, CTLFLAG_RW, &ipsec_debug, 0, ""); -SYSCTL_INT(_net_inet_ipsec, IPSECCTL_ESP_RANDPAD, - esp_randpad, CTLFLAG_RW, &ip4_esp_randpad, 0, ""); -SYSCTL_INT(_net_inet_ipsec, OID_AUTO, - crypto_support, CTLFLAG_RW, &crypto_support,0, ""); -SYSCTL_STRUCT(_net_inet_ipsec, OID_AUTO, - ipsecstats, CTLFLAG_RD, &ipsec4stat, ipsecstat, ""); +SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, IPSECCTL_DEF_POLICY, + def_policy, CTLFLAG_RW, ip4_def_policy.policy, 0, ""); +SYSCTL_V_INT(V_NET, vnet_ipsec,_net_inet_ipsec, IPSECCTL_DEF_ESP_TRANSLEV, + 
esp_trans_deflev, CTLFLAG_RW, ip4_esp_trans_deflev, 0, ""); +SYSCTL_V_INT(V_NET, vnet_ipsec,_net_inet_ipsec, IPSECCTL_DEF_ESP_NETLEV, + esp_net_deflev, CTLFLAG_RW, ip4_esp_net_deflev, 0, ""); +SYSCTL_V_INT(V_NET, vnet_ipsec,_net_inet_ipsec, IPSECCTL_DEF_AH_TRANSLEV, + ah_trans_deflev, CTLFLAG_RW, ip4_ah_trans_deflev, 0, ""); +SYSCTL_V_INT(V_NET, vnet_ipsec,_net_inet_ipsec, IPSECCTL_DEF_AH_NETLEV, + ah_net_deflev, CTLFLAG_RW, ip4_ah_net_deflev, 0, ""); +SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, IPSECCTL_AH_CLEARTOS, + ah_cleartos, CTLFLAG_RW, ah_cleartos, 0, ""); +SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, IPSECCTL_AH_OFFSETMASK, + ah_offsetmask, CTLFLAG_RW, ip4_ah_offsetmask, 0, ""); +SYSCTL_V_INT(V_NET, vnet_ipsec,_net_inet_ipsec, IPSECCTL_DFBIT, + dfbit, CTLFLAG_RW, ip4_ipsec_dfbit, 0, ""); +SYSCTL_V_INT(V_NET, vnet_ipsec,_net_inet_ipsec, IPSECCTL_ECN, + ecn, CTLFLAG_RW, ip4_ipsec_ecn, 0, ""); +SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, IPSECCTL_DEBUG, + debug, CTLFLAG_RW, ipsec_debug, 0, ""); +SYSCTL_V_INT(V_NET, vnet_ipsec,_net_inet_ipsec, IPSECCTL_ESP_RANDPAD, + esp_randpad, CTLFLAG_RW, ip4_esp_randpad, 0, ""); +SYSCTL_V_INT(V_NET, vnet_ipsec,_net_inet_ipsec, OID_AUTO, + crypto_support, CTLFLAG_RW, crypto_support,0, ""); +SYSCTL_V_STRUCT(V_NET, vnet_ipsec, _net_inet_ipsec, OID_AUTO, + ipsecstats, CTLFLAG_RD, ipsec4stat, ipsecstat, ""); #ifdef REGRESSION /* * When set to 1, IPsec will send packets with the same sequence number. * This allows to verify if the other side has proper replay attacks detection. */ +#ifndef VIMAGE int ipsec_replay = 0; -SYSCTL_INT(_net_inet_ipsec, OID_AUTO, test_replay, CTLFLAG_RW, &ipsec_replay, 0, - "Emulate replay attack"); +#endif +SYSCTL_V_INT(V_NET, vnet_ipsec,_net_inet_ipsec, OID_AUTO, test_replay, + CTLFLAG_RW, ipsec_replay, 0, "Emulate replay attack"); /* * When set 1, IPsec will send packets with corrupted HMAC. * This allows to verify if the other side properly detects modified packets. 
*/ +#ifndef VIMAGE int ipsec_integrity = 0; -SYSCTL_INT(_net_inet_ipsec, OID_AUTO, test_integrity, CTLFLAG_RW, - &ipsec_integrity, 0, "Emulate man-in-the-middle attack"); +#endif +SYSCTL_V_INT(V_NET, vnet_ipsec,_net_inet_ipsec, OID_AUTO, test_integrity, + CTLFLAG_RW, ipsec_integrity, 0, "Emulate man-in-the-middle attack"); #endif +#ifndef VIMAGE #ifdef INET6 struct ipsecstat ipsec6stat; -int ip6_esp_trans_deflev = IPSEC_LEVEL_USE; -int ip6_esp_net_deflev = IPSEC_LEVEL_USE; -int ip6_ah_trans_deflev = IPSEC_LEVEL_USE; -int ip6_ah_net_deflev = IPSEC_LEVEL_USE; -int ip6_ipsec_ecn = 0; /* ECN ignore(-1)/forbidden(0)/allowed(1) */ -int ip6_esp_randpad = -1; +int ip6_esp_trans_deflev; +int ip6_esp_net_deflev; +int ip6_ah_trans_deflev; +int ip6_ah_net_deflev; +int ip6_ipsec_ecn; /* ECN ignore(-1)/forbidden(0)/allowed(1) */ +int ip6_esp_randpad; +#endif +#endif /* !VIMAGE */ +#ifdef INET6 SYSCTL_DECL(_net_inet6_ipsec6); /* net.inet6.ipsec6 */ @@ -185,26 +208,29 @@ SYSCTL_OID(_net_inet6_ipsec6, IPSECCTL_STATS, stats, CTLFLAG_RD, 0,0, compat_ipsecstats_sysctl, "S", ""); #endif /* COMPAT_KAME */ -SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEF_POLICY, - def_policy, CTLFLAG_RW, &ip4_def_policy.policy, 0, ""); -SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEF_ESP_TRANSLEV, esp_trans_deflev, - CTLFLAG_RW, &ip6_esp_trans_deflev, 0, ""); -SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEF_ESP_NETLEV, esp_net_deflev, - CTLFLAG_RW, &ip6_esp_net_deflev, 0, ""); -SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEF_AH_TRANSLEV, ah_trans_deflev, - CTLFLAG_RW, &ip6_ah_trans_deflev, 0, ""); -SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEF_AH_NETLEV, ah_net_deflev, - CTLFLAG_RW, &ip6_ah_net_deflev, 0, ""); -SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_ECN, - ecn, CTLFLAG_RW, &ip6_ipsec_ecn, 0, ""); -SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEBUG, - debug, CTLFLAG_RW, &ipsec_debug, 0, ""); -SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_ESP_RANDPAD, - esp_randpad, CTLFLAG_RW, &ip6_esp_randpad, 0, ""); -SYSCTL_STRUCT(_net_inet6_ipsec6,
IPSECCTL_STATS, - ipsecstats, CTLFLAG_RD, &ipsec6stat, ipsecstat, ""); +SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet6_ipsec6, IPSECCTL_DEF_POLICY, + def_policy, CTLFLAG_RW, ip4_def_policy.policy, 0, ""); +SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet6_ipsec6, IPSECCTL_DEF_ESP_TRANSLEV, + esp_trans_deflev, CTLFLAG_RW, ip6_esp_trans_deflev, 0, ""); +SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet6_ipsec6, IPSECCTL_DEF_ESP_NETLEV, + esp_net_deflev, CTLFLAG_RW, ip6_esp_net_deflev, 0, ""); +SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet6_ipsec6, IPSECCTL_DEF_AH_TRANSLEV, + ah_trans_deflev, CTLFLAG_RW, ip6_ah_trans_deflev, 0, ""); +SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet6_ipsec6, IPSECCTL_DEF_AH_NETLEV, + ah_net_deflev, CTLFLAG_RW, ip6_ah_net_deflev, 0, ""); +SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet6_ipsec6, IPSECCTL_ECN, + ecn, CTLFLAG_RW, ip6_ipsec_ecn, 0, ""); +SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet6_ipsec6, IPSECCTL_DEBUG, + debug, CTLFLAG_RW, ipsec_debug, 0, ""); +SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet6_ipsec6, IPSECCTL_ESP_RANDPAD, + esp_randpad, CTLFLAG_RW, ip6_esp_randpad, 0, ""); +SYSCTL_V_STRUCT(V_NET, vnet_ipsec, _net_inet6_ipsec6, IPSECCTL_STATS, + ipsecstats, CTLFLAG_RD, ipsec6stat, ipsecstat, ""); #endif /* INET6 */ +VNET_MOD_DECLARE(IPSEC, ipsec, vnet_ipsec_iattach, vnet_ipsec_idetach, + INET, NULL) + static int ipsec4_setspidx_inpcb __P((struct mbuf *, struct inpcb *pcb)); #ifdef INET6 static int ipsec6_setspidx_in6pcb __P((struct mbuf *, struct in6pcb *pcb)); @@ -232,12 +258,13 @@ static struct secpolicy * key_allocsp_default(const char* where, int tag) { + INIT_VNET_IPSEC(curvnet); struct secpolicy *sp; KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP key_allocsp_default from %s:%u\n", where, tag)); - sp = &ip4_def_policy; + sp = &V_ip4_def_policy; if (sp->policy != IPSEC_POLICY_DISCARD && sp->policy != IPSEC_POLICY_NONE) { ipseclog((LOG_INFO, "fixed system default policy: %d->%d\n", @@ -301,6 +328,7 @@ struct inpcb *inp; int *error; { + INIT_VNET_IPSEC(curvnet); 
struct inpcbpolicy *pcbsp = NULL; struct secpolicy *currsp = NULL; /* policy on socket */ struct secpolicy *sp; @@ -411,6 +439,7 @@ int flag; int *error; { + INIT_VNET_IPSEC(curvnet); struct secpolicyindex spidx; struct secpolicy *sp; @@ -446,6 +475,7 @@ int *error; struct inpcb *inp; { + INIT_VNET_IPSEC(curvnet); struct secpolicy *sp; *error = 0; @@ -455,7 +485,7 @@ sp = ipsec_getpolicybysock(m, dir, inp, error); if (sp == NULL) { IPSEC_ASSERT(*error != 0, ("getpolicy failed w/o error")); - ipsec4stat.ips_out_inval++; + V_ipsec4stat.ips_out_inval++; return NULL; } IPSEC_ASSERT(*error == 0, ("sp w/ error set to %u", *error)); @@ -465,7 +495,7 @@ printf("%s: invalid policy %u\n", __func__, sp->policy); /* fall thru... */ case IPSEC_POLICY_DISCARD: - ipsec4stat.ips_out_polvio++; + V_ipsec4stat.ips_out_polvio++; *error = -EINVAL; /* packet is discarded by caller */ break; case IPSEC_POLICY_BYPASS: @@ -517,6 +547,7 @@ struct mbuf *m; struct in6pcb *pcb; { + //INIT_VNET_IPSEC(curvnet); struct secpolicyindex *spidx; int error; @@ -560,6 +591,7 @@ struct secpolicyindex *spidx; int needport; { + INIT_VNET_IPSEC(curvnet); struct ip *ip = NULL; struct ip ipbuf; u_int v; @@ -753,6 +785,7 @@ struct secpolicyindex *spidx; int needport; { + INIT_VNET_IPSEC(curvnet); int off, nxt; struct tcphdr th; struct udphdr uh; @@ -869,6 +902,7 @@ struct socket *so; struct inpcbpolicy **pcb_sp; { + INIT_VNET_IPSEC(curvnet); struct inpcbpolicy *new; /* sanity check. */ @@ -1012,6 +1046,7 @@ size_t len; struct ucred *cred; { + INIT_VNET_IPSEC(curvnet); struct sadb_x_policy *xpl; struct secpolicy *newsp = NULL; int error; @@ -1061,6 +1096,7 @@ struct secpolicy *pcb_sp; struct mbuf **mp; { + INIT_VNET_IPSEC(curvnet); /* sanity check. 
*/ if (pcb_sp == NULL || mp == NULL) @@ -1087,6 +1123,7 @@ size_t len; struct ucred *cred; { + INIT_VNET_IPSEC(curvnet); struct sadb_x_policy *xpl; struct secpolicy **pcb_sp; @@ -1121,6 +1158,7 @@ size_t len; struct mbuf **mp; { + INIT_VNET_IPSEC(curvnet); struct sadb_x_policy *xpl; struct secpolicy *pcb_sp; @@ -1180,6 +1218,7 @@ size_t len; struct ucred *cred; { + INIT_VNET_IPSEC(curvnet); struct sadb_x_policy *xpl; struct secpolicy **pcb_sp; @@ -1214,6 +1253,7 @@ size_t len; struct mbuf **mp; { + INIT_VNET_IPSEC(curvnet); struct sadb_x_policy *xpl; struct secpolicy *pcb_sp; @@ -1272,6 +1312,7 @@ ipsec_get_reqlevel(isr) struct ipsecrequest *isr; { + INIT_VNET_IPSEC(curvnet); u_int level = 0; u_int esp_trans_deflev, esp_net_deflev; u_int ah_trans_deflev, ah_net_deflev; @@ -1286,7 +1327,7 @@ #define IPSEC_CHECK_DEFAULT(lev) \ (((lev) != IPSEC_LEVEL_USE && (lev) != IPSEC_LEVEL_REQUIRE \ && (lev) != IPSEC_LEVEL_UNIQUE) \ - ? (ipsec_debug \ + ? (V_ipsec_debug \ ? log(LOG_INFO, "fixed system default level " #lev ":%d->%d\n",\ (lev), IPSEC_LEVEL_REQUIRE) \ : 0), \ @@ -1298,18 +1339,18 @@ switch (((struct sockaddr *)&isr->sp->spidx.src)->sa_family) { #ifdef INET case AF_INET: - esp_trans_deflev = IPSEC_CHECK_DEFAULT(ip4_esp_trans_deflev); - esp_net_deflev = IPSEC_CHECK_DEFAULT(ip4_esp_net_deflev); - ah_trans_deflev = IPSEC_CHECK_DEFAULT(ip4_ah_trans_deflev); - ah_net_deflev = IPSEC_CHECK_DEFAULT(ip4_ah_net_deflev); + esp_trans_deflev = IPSEC_CHECK_DEFAULT(V_ip4_esp_trans_deflev); + esp_net_deflev = IPSEC_CHECK_DEFAULT(V_ip4_esp_net_deflev); + ah_trans_deflev = IPSEC_CHECK_DEFAULT(V_ip4_ah_trans_deflev); + ah_net_deflev = IPSEC_CHECK_DEFAULT(V_ip4_ah_net_deflev); break; #endif #ifdef INET6 case AF_INET6: - esp_trans_deflev = IPSEC_CHECK_DEFAULT(ip6_esp_trans_deflev); - esp_net_deflev = IPSEC_CHECK_DEFAULT(ip6_esp_net_deflev); - ah_trans_deflev = IPSEC_CHECK_DEFAULT(ip6_ah_trans_deflev); - ah_net_deflev = IPSEC_CHECK_DEFAULT(ip6_ah_net_deflev); + esp_trans_deflev = 
IPSEC_CHECK_DEFAULT(V_ip6_esp_trans_deflev); + esp_net_deflev = IPSEC_CHECK_DEFAULT(V_ip6_esp_net_deflev); + ah_trans_deflev = IPSEC_CHECK_DEFAULT(V_ip6_ah_trans_deflev); + ah_net_deflev = IPSEC_CHECK_DEFAULT(V_ip6_ah_net_deflev); break; #endif /* INET6 */ default: @@ -1376,6 +1417,7 @@ int ipsec_in_reject(struct secpolicy *sp, struct mbuf *m) { + INIT_VNET_IPSEC(curvnet); struct ipsecrequest *isr; int need_auth; @@ -1451,6 +1493,7 @@ struct mbuf *m; struct inpcb *inp; { + INIT_VNET_IPSEC(curvnet); struct secpolicy *sp; int error; int result; @@ -1469,7 +1512,7 @@ if (sp != NULL) { result = ipsec_in_reject(sp, m); if (result) - ipsec4stat.ips_in_polvio++; + V_ipsec4stat.ips_in_polvio++; KEY_FREESP(&sp); } else { result = 0; /* XXX should be panic ? @@ -1489,6 +1532,7 @@ struct mbuf *m; struct inpcb *inp; { + INIT_VNET_IPSEC(curvnet); struct secpolicy *sp = NULL; int error; int result; @@ -1509,7 +1553,7 @@ if (sp != NULL) { result = ipsec_in_reject(sp, m); if (result) - ipsec6stat.ips_in_polvio++; + V_ipsec6stat.ips_in_polvio++; KEY_FREESP(&sp); } else { result = 0; @@ -1526,6 +1570,7 @@ static size_t ipsec_hdrsiz(struct secpolicy *sp) { + INIT_VNET_IPSEC(curvnet); struct ipsecrequest *isr; size_t siz; @@ -1588,6 +1633,7 @@ u_int dir; struct inpcb *inp; { + INIT_VNET_IPSEC(curvnet); struct secpolicy *sp; int error; size_t size; @@ -1628,6 +1674,7 @@ u_int dir; struct in6pcb *in6p; { + INIT_VNET_IPSEC(curvnet); struct secpolicy *sp; int error; size_t size; @@ -1729,6 +1776,7 @@ u_int32_t seq; struct secasvar *sav; { + INIT_VNET_IPSEC(curvnet); struct secreplay *replay; u_int32_t diff; int fr; @@ -1937,9 +1985,64 @@ static void ipsec_attach(void) { - SECPOLICY_LOCK_INIT(&ip4_def_policy); - ip4_def_policy.refcnt = 1; /* NB: disallow free */ +#ifdef VIMAGE + vnet_mod_register(&vnet_ipsec_modinfo); +#else + vnet_ipsec_iattach(NULL); +#endif } + +static int +vnet_ipsec_iattach(unused) + const void *unused; +{ + INIT_VNET_IPSEC(curvnet); + + #ifdef IPSEC_DEBUG + 
V_ipsec_debug = 1; + #else + V_ipsec_debug = 0; + #endif + + SECPOLICY_LOCK_INIT(&V_ip4_def_policy); + V_ip4_def_policy.refcnt = 1; /* NB: disallow free */ + + V_ip4_ah_offsetmask = 0; /* maybe IP_DF? */ + V_ip4_ipsec_dfbit = 0; /* DF bit on encap. 0: clear 1: set 2: copy */ + V_ip4_esp_trans_deflev = IPSEC_LEVEL_USE; + V_ip4_esp_net_deflev = IPSEC_LEVEL_USE; + V_ip4_ah_trans_deflev = IPSEC_LEVEL_USE; + V_ip4_ah_net_deflev = IPSEC_LEVEL_USE; + V_ip4_ipsec_ecn = 0; + + V_ip4_esp_randpad = -1; + V_crypto_support = CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE; +#ifdef REGRESSION + V_ipsec_replay = 0; + V_ipsec_integrity = 0; +#endif + +#ifdef INET6 + V_ip6_esp_trans_deflev = IPSEC_LEVEL_USE; + V_ip6_esp_net_deflev = IPSEC_LEVEL_USE; + V_ip6_ah_trans_deflev = IPSEC_LEVEL_USE; + V_ip6_ah_net_deflev = IPSEC_LEVEL_USE; + V_ip6_ipsec_ecn = 0; /* ECN ignore(-1)/forbidden(0)/allowed(1) */ + V_ip6_esp_randpad = -1; +#endif + + return 0; +} + +/* XXX finish this! */ +#ifdef VIMAGE +static int +vnet_ipsec_idetach(unused) + const void *unused; +{ + return 0; +} +#endif SYSINIT(ipsec, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST, ipsec_attach, NULL) --- /u/marko/p4/head/src/sys/netipsec/ipsec.h 2008-02-03 08:16:02.000000000 +0100 +++ src/sys/netipsec/ipsec.h 2008-02-27 11:49:59.000000000 +0100 @@ -345,9 +345,9 @@ extern int ip4_esp_randpad; extern int crypto_support; -#define ipseclog(x) do { if (ipsec_debug) log x; } while (0) +#define ipseclog(x) do { if (V_ipsec_debug) log x; } while (0) /* for openbsd compatibility */ -#define DPRINTF(x) do { if (ipsec_debug) printf x; } while (0) +#define DPRINTF(x) do { if (V_ipsec_debug) printf x; } while (0) extern struct ipsecrequest *ipsec_newisr(void); extern void ipsec_delisr(struct ipsecrequest *); --- /u/marko/p4/head/src/sys/netipsec/ipsec_input.c 2007-11-30 21:34:33.000000000 +0100 +++ src/sys/netipsec/ipsec_input.c 2007-12-10 11:26:14.000000000 +0100 @@ -44,6 +44,7 @@ #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_enc.h" 
+#include "opt_vimage.h" #include #include @@ -54,6 +55,7 @@ #include #include #include +#include #include #include @@ -68,6 +70,7 @@ #include #ifdef INET6 +#include #include #endif #include @@ -88,6 +91,7 @@ #include #include +#include #include #include @@ -107,13 +111,14 @@ static int ipsec_common_input(struct mbuf *m, int skip, int protoff, int af, int sproto) { + INIT_VNET_IPSEC(curvnet); union sockaddr_union dst_address; struct secasvar *sav; u_int32_t spi; int error; - IPSEC_ISTAT(sproto, espstat.esps_input, ahstat.ahs_input, - ipcompstat.ipcomps_input); + IPSEC_ISTAT(sproto, V_espstat.esps_input, V_ahstat.ahs_input, + V_ipcompstat.ipcomps_input); IPSEC_ASSERT(m != NULL, ("null packet")); @@ -121,19 +126,19 @@ sproto == IPPROTO_IPCOMP, ("unexpected security protocol %u", sproto)); - if ((sproto == IPPROTO_ESP && !esp_enable) || - (sproto == IPPROTO_AH && !ah_enable) || - (sproto == IPPROTO_IPCOMP && !ipcomp_enable)) { + if ((sproto == IPPROTO_ESP && !V_esp_enable) || + (sproto == IPPROTO_AH && !V_ah_enable) || + (sproto == IPPROTO_IPCOMP && !V_ipcomp_enable)) { m_freem(m); - IPSEC_ISTAT(sproto, espstat.esps_pdrops, ahstat.ahs_pdrops, - ipcompstat.ipcomps_pdrops); + IPSEC_ISTAT(sproto, V_espstat.esps_pdrops, V_ahstat.ahs_pdrops, + V_ipcompstat.ipcomps_pdrops); return EOPNOTSUPP; } if (m->m_pkthdr.len - skip < 2 * sizeof (u_int32_t)) { m_freem(m); - IPSEC_ISTAT(sproto, espstat.esps_hdrops, ahstat.ahs_hdrops, - ipcompstat.ipcomps_hdrops); + IPSEC_ISTAT(sproto, V_espstat.esps_hdrops, V_ahstat.ahs_hdrops, + V_ipcompstat.ipcomps_hdrops); DPRINTF(("%s: packet too small\n", __func__)); return EINVAL; } @@ -178,8 +183,8 @@ default: DPRINTF(("%s: unsupported protocol family %u\n", __func__, af)); m_freem(m); - IPSEC_ISTAT(sproto, espstat.esps_nopf, ahstat.ahs_nopf, - ipcompstat.ipcomps_nopf); + IPSEC_ISTAT(sproto, V_espstat.esps_nopf, V_ahstat.ahs_nopf, + V_ipcompstat.ipcomps_nopf); return EPFNOSUPPORT; } @@ -189,8 +194,8 @@ DPRINTF(("%s: no key association found for 
SA %s/%08lx/%u\n", __func__, ipsec_address(&dst_address), (u_long) ntohl(spi), sproto)); - IPSEC_ISTAT(sproto, espstat.esps_notdb, ahstat.ahs_notdb, - ipcompstat.ipcomps_notdb); + IPSEC_ISTAT(sproto, V_espstat.esps_notdb, V_ahstat.ahs_notdb, + V_ipcompstat.ipcomps_notdb); m_freem(m); return ENOENT; } @@ -199,8 +204,8 @@ DPRINTF(("%s: attempted to use uninitialized SA %s/%08lx/%u\n", __func__, ipsec_address(&dst_address), (u_long) ntohl(spi), sproto)); - IPSEC_ISTAT(sproto, espstat.esps_noxform, ahstat.ahs_noxform, - ipcompstat.ipcomps_noxform); + IPSEC_ISTAT(sproto, V_espstat.esps_noxform, V_ahstat.ahs_noxform, + V_ipcompstat.ipcomps_noxform); KEY_FREESAV(&sav); m_freem(m); return ENXIO; @@ -276,6 +281,7 @@ ipsec4_common_input_cb(struct mbuf *m, struct secasvar *sav, int skip, int protoff, struct m_tag *mt) { + INIT_VNET_IPSEC(curvnet); int prot, af, sproto; struct ip *ip; struct m_tag *mtag; @@ -304,8 +310,8 @@ /* Sanity check */ if (m == NULL) { DPRINTF(("%s: null mbuf", __func__)); - IPSEC_ISTAT(sproto, espstat.esps_badkcr, ahstat.ahs_badkcr, - ipcompstat.ipcomps_badkcr); + IPSEC_ISTAT(sproto, V_espstat.esps_badkcr, V_ahstat.ahs_badkcr, + V_ipcompstat.ipcomps_badkcr); KEY_FREESAV(&sav); return EINVAL; } @@ -316,8 +322,8 @@ DPRINTF(("%s: processing failed for SA %s/%08lx\n", __func__, ipsec_address(&sav->sah->saidx.dst), (u_long) ntohl(sav->spi))); - IPSEC_ISTAT(sproto, espstat.esps_hdrops, ahstat.ahs_hdrops, - ipcompstat.ipcomps_hdrops); + IPSEC_ISTAT(sproto, V_espstat.esps_hdrops, V_ahstat.ahs_hdrops, + V_ipcompstat.ipcomps_hdrops); error = ENOBUFS; goto bad; } @@ -338,9 +344,9 @@ struct ip ipn; if (m->m_pkthdr.len - skip < sizeof(struct ip)) { - IPSEC_ISTAT(sproto, espstat.esps_hdrops, - ahstat.ahs_hdrops, - ipcompstat.ipcomps_hdrops); + IPSEC_ISTAT(sproto, V_espstat.esps_hdrops, + V_ahstat.ahs_hdrops, + V_ipcompstat.ipcomps_hdrops); error = EINVAL; goto bad; } @@ -369,9 +375,9 @@ ipsp_address(saidx->dst), (u_long) ntohl(sav->spi))); - IPSEC_ISTAT(sproto, 
espstat.esps_pdrops, - ahstat.ahs_pdrops, - ipcompstat.ipcomps_pdrops); + IPSEC_ISTAT(sproto, V_espstat.esps_pdrops, + V_ahstat.ahs_pdrops, + V_ipcompstat.ipcomps_pdrops); error = EACCES; goto bad; } @@ -382,9 +388,9 @@ struct ip6_hdr ip6n; if (m->m_pkthdr.len - skip < sizeof(struct ip6_hdr)) { - IPSEC_ISTAT(sproto, espstat.esps_hdrops, - ahstat.ahs_hdrops, - ipcompstat.ipcomps_hdrops); + IPSEC_ISTAT(sproto, V_espstat.esps_hdrops, + V_ahstat.ahs_hdrops, + V_ipcompstat.ipcomps_hdrops); error = EINVAL; goto bad; } @@ -411,9 +417,9 @@ ipsec_address(&saidx->dst), (u_long) ntohl(sav->spi))); - IPSEC_ISTAT(sproto, espstat.esps_pdrops, - ahstat.ahs_pdrops, - ipcompstat.ipcomps_pdrops); + IPSEC_ISTAT(sproto, V_espstat.esps_pdrops, + V_ahstat.ahs_pdrops, + V_ipcompstat.ipcomps_pdrops); error = EACCES; goto bad; } @@ -434,8 +440,8 @@ sizeof(struct tdb_ident), M_NOWAIT); if (mtag == NULL) { DPRINTF(("%s: failed to get tag\n", __func__)); - IPSEC_ISTAT(sproto, espstat.esps_hdrops, - ahstat.ahs_hdrops, ipcompstat.ipcomps_hdrops); + IPSEC_ISTAT(sproto, V_espstat.esps_hdrops, + V_ahstat.ahs_hdrops, V_ipcompstat.ipcomps_hdrops); error = ENOMEM; goto bad; } @@ -472,8 +478,8 @@ * Re-dispatch via software interrupt. 
*/ if ((error = netisr_queue(NETISR_IP, m))) { - IPSEC_ISTAT(sproto, espstat.esps_qfull, ahstat.ahs_qfull, - ipcompstat.ipcomps_qfull); + IPSEC_ISTAT(sproto, V_espstat.esps_qfull, V_ahstat.ahs_qfull, + V_ipcompstat.ipcomps_qfull); DPRINTF(("%s: queue full; proto %u packet dropped\n", __func__, sproto)); @@ -497,6 +503,7 @@ int ipsec6_common_input(struct mbuf **mp, int *offp, int proto) { + INIT_VNET_IPSEC(curvnet); int l = 0; int protoff; struct ip6_ext ip6e; @@ -526,9 +533,9 @@ if (protoff + l != *offp) { DPRINTF(("%s: bad packet header chain, protoff %u, " "l %u, off %u\n", __func__, protoff, l, *offp)); - IPSEC_ISTAT(proto, espstat.esps_hdrops, - ahstat.ahs_hdrops, - ipcompstat.ipcomps_hdrops); + IPSEC_ISTAT(proto, V_espstat.esps_hdrops, + V_ahstat.ahs_hdrops, + V_ipcompstat.ipcomps_hdrops); m_freem(*mp); *mp = NULL; return IPPROTO_DONE; @@ -547,6 +554,8 @@ ipsec6_common_input_cb(struct mbuf *m, struct secasvar *sav, int skip, int protoff, struct m_tag *mt) { + INIT_VNET_INET6(curvnet); + INIT_VNET_IPSEC(curvnet); int prot, af, sproto; struct ip6_hdr *ip6; struct m_tag *mtag; @@ -573,8 +582,8 @@ /* Sanity check */ if (m == NULL) { DPRINTF(("%s: null mbuf", __func__)); - IPSEC_ISTAT(sproto, espstat.esps_badkcr, ahstat.ahs_badkcr, - ipcompstat.ipcomps_badkcr); + IPSEC_ISTAT(sproto, V_espstat.esps_badkcr, V_ahstat.ahs_badkcr, + V_ipcompstat.ipcomps_badkcr); error = EINVAL; goto bad; } @@ -587,8 +596,8 @@ __func__, ipsec_address(&sav->sah->saidx.dst), (u_long) ntohl(sav->spi))); - IPSEC_ISTAT(sproto, espstat.esps_hdrops, ahstat.ahs_hdrops, - ipcompstat.ipcomps_hdrops); + IPSEC_ISTAT(sproto, V_espstat.esps_hdrops, V_ahstat.ahs_hdrops, + V_ipcompstat.ipcomps_hdrops); error = EACCES; goto bad; } @@ -606,9 +615,9 @@ struct ip ipn; if (m->m_pkthdr.len - skip < sizeof(struct ip)) { - IPSEC_ISTAT(sproto, espstat.esps_hdrops, - ahstat.ahs_hdrops, - ipcompstat.ipcomps_hdrops); + IPSEC_ISTAT(sproto, V_espstat.esps_hdrops, + V_ahstat.ahs_hdrops, + V_ipcompstat.ipcomps_hdrops); 
error = EINVAL; goto bad; } @@ -633,8 +642,8 @@ ipsec_address(&saidx->dst), (u_long) ntohl(sav->spi))); - IPSEC_ISTATsproto, (espstat.esps_pdrops, - ahstat.ahs_pdrops, ipcompstat.ipcomps_pdrops); + IPSEC_ISTATsproto, (V_espstat.esps_pdrops, + V_ahstat.ahs_pdrops, V_ipcompstat.ipcomps_pdrops); error = EACCES; goto bad; } @@ -646,9 +655,9 @@ struct ip6_hdr ip6n; if (m->m_pkthdr.len - skip < sizeof(struct ip6_hdr)) { - IPSEC_ISTAT(sproto, espstat.esps_hdrops, - ahstat.ahs_hdrops, - ipcompstat.ipcomps_hdrops); + IPSEC_ISTAT(sproto, V_espstat.esps_hdrops, + V_ahstat.ahs_hdrops, + V_ipcompstat.ipcomps_hdrops); error = EINVAL; goto bad; } @@ -675,8 +684,8 @@ ipsec_address(&saidx->dst), (u_long) ntohl(sav->spi))); - IPSEC_ISTAT(sproto, espstat.esps_pdrops, - ahstat.ahs_pdrops, ipcompstat.ipcomps_pdrops); + IPSEC_ISTAT(sproto, V_espstat.esps_pdrops, + V_ahstat.ahs_pdrops, V_ipcompstat.ipcomps_pdrops); error = EACCES; goto bad; } @@ -696,8 +705,8 @@ sizeof(struct tdb_ident), M_NOWAIT); if (mtag == NULL) { DPRINTF(("%s: failed to get tag\n", __func__)); - IPSEC_ISTAT(sproto, espstat.esps_hdrops, - ahstat.ahs_hdrops, ipcompstat.ipcomps_hdrops); + IPSEC_ISTAT(sproto, V_espstat.esps_hdrops, + V_ahstat.ahs_hdrops, V_ipcompstat.ipcomps_hdrops); error = ENOMEM; goto bad; } @@ -742,8 +751,8 @@ nest = 0; nxt = nxt8; while (nxt != IPPROTO_DONE) { - if (ip6_hdrnestlimit && (++nest > ip6_hdrnestlimit)) { - ip6stat.ip6s_toomanyhdr++; + if (V_ip6_hdrnestlimit && (++nest > V_ip6_hdrnestlimit)) { + V_ip6stat.ip6s_toomanyhdr++; error = EINVAL; goto bad; } @@ -753,7 +762,7 @@ * more sanity checks in header chain processing. 
*/ if (m->m_pkthdr.len < skip) { - ip6stat.ip6s_tooshort++; + V_ip6stat.ip6s_tooshort++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated); error = EINVAL; goto bad; --- /u/marko/p4/head/src/sys/netipsec/ipsec_mbuf.c 2007-08-31 03:48:10.000000000 +0200 +++ src/sys/netipsec/ipsec_mbuf.c 2007-10-22 18:07:00.000000000 +0200 @@ -31,16 +31,19 @@ */ #include "opt_param.h" +#include "opt_vimage.h" #include #include #include #include +#include #include #include #include +#include /* * Make space for a new header of length hlen at skip bytes @@ -53,6 +56,7 @@ struct mbuf * m_makespace(struct mbuf *m0, int skip, int hlen, int *off) { + INIT_VNET_IPSEC(curvnet); struct mbuf *m; unsigned remain; @@ -88,7 +92,7 @@ return (NULL); n->m_next = m->m_next; /* splice new mbuf */ m->m_next = n; - ipsec4stat.ips_mbinserted++; + V_ipsec4stat.ips_mbinserted++; if (hlen <= M_TRAILINGSPACE(m) + remain) { /* * New header fits in the old mbuf if we copy @@ -122,7 +126,7 @@ /* splice in second mbuf */ n2->m_next = n->m_next; n->m_next = n2; - ipsec4stat.ips_mbinserted++; + V_ipsec4stat.ips_mbinserted++; } else { memcpy(mtod(n, caddr_t) + hlen, mtod(m, caddr_t) + skip, remain); @@ -155,6 +159,7 @@ caddr_t m_pad(struct mbuf *m, int n) { + INIT_VNET_IPSEC(curvnet); register struct mbuf *m0, *m1; register int len, pad; caddr_t retval; @@ -227,6 +232,7 @@ int m_striphdr(struct mbuf *m, int skip, int hlen) { + INIT_VNET_IPSEC(curvnet); struct mbuf *m1; int roff; @@ -238,7 +244,7 @@ /* Remove the header and associated data from the mbuf. */ if (roff == 0) { /* The header was at the beginning of the mbuf */ - ipsec4stat.ips_input_front++; + V_ipsec4stat.ips_input_front++; m_adj(m1, hlen); if ((m1->m_flags & M_PKTHDR) == 0) m->m_pkthdr.len -= hlen; @@ -250,7 +256,7 @@ * so first let's remove the remainder of the header from * the beginning of the remainder of the mbuf chain, if any. 
*/ - ipsec4stat.ips_input_end++; + V_ipsec4stat.ips_input_end++; if (roff + hlen > m1->m_len) { /* Adjust the next mbuf by the remainder */ m_adj(m1->m_next, roff + hlen - m1->m_len); @@ -275,7 +281,7 @@ * The header lies in the "middle" of the mbuf; copy * the remainder of the mbuf down over the header. */ - ipsec4stat.ips_input_middle++; + V_ipsec4stat.ips_input_middle++; bcopy(mtod(m1, u_char *) + roff + hlen, mtod(m1, u_char *) + roff, m1->m_len - (roff + hlen)); --- /u/marko/p4/head/src/sys/netipsec/ipsec_output.c 2007-11-30 21:34:33.000000000 +0100 +++ src/sys/netipsec/ipsec_output.c 2007-12-10 11:26:14.000000000 +0100 @@ -33,6 +33,7 @@ #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_enc.h" +#include "opt_vimage.h" #include #include @@ -42,6 +43,7 @@ #include #include #include +#include #include #include @@ -54,6 +56,7 @@ #include #include #ifdef INET6 +#include #include #endif @@ -79,12 +82,14 @@ #include #include #include +#include #include int ipsec_process_done(struct mbuf *m, struct ipsecrequest *isr) { + INIT_VNET_IPSEC(curvnet); struct tdb_ident *tdbi; struct m_tag *mtag; struct secasvar *sav; @@ -156,7 +161,7 @@ * doing further processing. */ if (isr->next) { - ipsec4stat.ips_out_bundlesa++; + V_ipsec4stat.ips_out_bundlesa++; return ipsec4_process_packet(m, isr->next, 0, 0); } key_sa_recordxfer(sav, m); /* record data transfer */ @@ -201,6 +206,7 @@ int *error ) { + INIT_VNET_IPSEC(curvnet); #define IPSEC_OSTAT(x,y,z) (isr->saidx.proto == IPPROTO_ESP ? (x)++ : \ isr->saidx.proto == IPPROTO_AH ? (y)++ : (z)++) struct secasvar *sav; @@ -282,7 +288,7 @@ * this packet because it is responsibility for * upper layer to retransmit the packet. */ - ipsec4stat.ips_out_nosa++; + V_ipsec4stat.ips_out_nosa++; goto bad; } sav = isr->sav; @@ -304,13 +310,13 @@ /* * Check system global policy controls. 
*/ - if ((isr->saidx.proto == IPPROTO_ESP && !esp_enable) || - (isr->saidx.proto == IPPROTO_AH && !ah_enable) || - (isr->saidx.proto == IPPROTO_IPCOMP && !ipcomp_enable)) { + if ((isr->saidx.proto == IPPROTO_ESP && !V_esp_enable) || + (isr->saidx.proto == IPPROTO_AH && !V_ah_enable) || + (isr->saidx.proto == IPPROTO_IPCOMP && !V_ipcomp_enable)) { DPRINTF(("%s: IPsec outbound packet dropped due" " to policy (check your sysctls)\n", __func__)); - IPSEC_OSTAT(espstat.esps_pdrops, ahstat.ahs_pdrops, - ipcompstat.ipcomps_pdrops); + IPSEC_OSTAT(V_espstat.esps_pdrops, V_ahstat.ahs_pdrops, + V_ipcompstat.ipcomps_pdrops); *error = EHOSTUNREACH; goto bad; } @@ -321,8 +327,8 @@ */ if (sav->tdb_xform == NULL) { DPRINTF(("%s: no transform for SA\n", __func__)); - IPSEC_OSTAT(espstat.esps_noxform, ahstat.ahs_noxform, - ipcompstat.ipcomps_noxform); + IPSEC_OSTAT(V_espstat.esps_noxform, V_ahstat.ahs_noxform, + V_ipcompstat.ipcomps_noxform); *error = EHOSTUNREACH; goto bad; } @@ -345,6 +351,7 @@ int flags, int tunalready) { + INIT_VNET_IPSEC(curvnet); struct secasindex saidx; struct secasvar *sav; struct ip *ip; @@ -384,10 +391,10 @@ } ip = mtod(m, struct ip *); /* Honor system-wide control of how to handle IP_DF */ - switch (ip4_ipsec_dfbit) { + switch (V_ip4_ipsec_dfbit) { case 0: /* clear in outer header */ case 1: /* set in outer header */ - setdf = ip4_ipsec_dfbit; + setdf = V_ip4_ipsec_dfbit; break; default: /* propagate to outer header */ setdf = ntohs(ip->ip_off & IP_DF); @@ -552,6 +559,7 @@ int flags, int *tun) { + INIT_VNET_IPSEC(curvnet); struct ipsecrequest *isr; struct secasindex saidx; int error = 0; @@ -616,6 +624,7 @@ static int ipsec6_encapsulate(struct mbuf *m, struct secasvar *sav) { + INIT_VNET_IPSEC(curvnet); struct ip6_hdr *oip6; struct ip6_hdr *ip6; size_t plen; @@ -663,7 +672,7 @@ /* construct new IPv6 header. see RFC 2401 5.1.2.2 */ /* ECN consideration. 
*/ - ip6_ecn_ingress(ip6_ipsec_ecn, &ip6->ip6_flow, &oip6->ip6_flow); + ip6_ecn_ingress(V_ip6_ipsec_ecn, &ip6->ip6_flow, &oip6->ip6_flow); if (plen < IPV6_MAXPACKET - sizeof(struct ip6_hdr)) ip6->ip6_plen = htons(plen); else { @@ -685,6 +694,8 @@ int ipsec6_output_tunnel(struct ipsec_output_state *state, struct secpolicy *sp, int flags) { + INIT_VNET_INET6(curvnet); + INIT_VNET_IPSEC(curvnet); struct ip6_hdr *ip6; struct ipsecrequest *isr; struct secasindex saidx; @@ -736,14 +747,14 @@ ipseclog((LOG_ERR, "%s: family mismatched between " "inner and outer, spi=%u\n", __func__, ntohl(isr->sav->spi))); - ipsec6stat.ips_out_inval++; + V_ipsec6stat.ips_out_inval++; error = EAFNOSUPPORT; goto bad; } m = ipsec6_splithdr(m); if (!m) { - ipsec6stat.ips_out_nomem++; + V_ipsec6stat.ips_out_nomem++; error = ENOMEM; goto bad; } @@ -771,8 +782,8 @@ rtalloc(state->ro); } if (state->ro->ro_rt == 0) { - ip6stat.ip6s_noroute++; - ipsec6stat.ips_out_noroute++; + V_ip6stat.ip6s_noroute++; + V_ipsec6stat.ips_out_noroute++; error = EHOSTUNREACH; goto bad; } @@ -786,7 +797,7 @@ m = ipsec6_splithdr(m); if (!m) { - ipsec6stat.ips_out_nomem++; + V_ipsec6stat.ips_out_nomem++; error = ENOMEM; goto bad; } --- /u/marko/p4/head/src/sys/netipsec/key.c 2007-08-31 03:48:10.000000000 +0200 +++ src/sys/netipsec/key.c 2007-10-22 18:07:01.000000000 +0200 @@ -37,6 +37,7 @@ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" +#include "opt_vimage.h" #include #include @@ -56,17 +57,20 @@ #include #include #include +#include #include #include #include +#include #include #include #include #include #ifdef INET6 +#include #include #include #include @@ -84,6 +88,7 @@ #include #include #include +#include #include #ifdef INET6 @@ -111,6 +116,7 @@ * field hits 0 (= no external reference other than from SA header. 
*/ +#ifndef VIMAGE u_int32_t key_debug_level = 0; static u_int key_spi_trycnt = 1000; static u_int32_t key_spi_minval = 0x100; @@ -123,8 +129,11 @@ static int key_preferred_oldsa = 1; /* preferred old sa rather than new sa.*/ static u_int32_t acq_seq = 0; +#endif +#ifndef VIMAGE static LIST_HEAD(_sptree, secpolicy) sptree[IPSEC_DIR_MAX]; /* SPD */ +#endif static struct mtx sptree_lock; #define SPTREE_LOCK_INIT() \ mtx_init(&sptree_lock, "sptree", \ @@ -134,7 +143,9 @@ #define SPTREE_UNLOCK() mtx_unlock(&sptree_lock) #define SPTREE_LOCK_ASSERT() mtx_assert(&sptree_lock, MA_OWNED) +#ifndef VIMAGE static LIST_HEAD(_sahtree, secashead) sahtree; /* SAD */ +#endif static struct mtx sahtree_lock; #define SAHTREE_LOCK_INIT() \ mtx_init(&sahtree_lock, "sahtree", \ @@ -143,9 +154,10 @@ #define SAHTREE_LOCK() mtx_lock(&sahtree_lock) #define SAHTREE_UNLOCK() mtx_unlock(&sahtree_lock) #define SAHTREE_LOCK_ASSERT() mtx_assert(&sahtree_lock, MA_OWNED) - /* registed list */ +#ifndef VIMAGE static LIST_HEAD(_regtree, secreg) regtree[SADB_SATYPE_MAX + 1]; +#endif static struct mtx regtree_lock; #define REGTREE_LOCK_INIT() \ mtx_init(&regtree_lock, "regtree", "fast ipsec regtree", MTX_DEF) @@ -154,7 +166,9 @@ #define REGTREE_UNLOCK() mtx_unlock(&regtree_lock) #define REGTREE_LOCK_ASSERT() mtx_assert(&regtree_lock, MA_OWNED) +#ifndef VIMAGE static LIST_HEAD(_acqtree, secacq) acqtree; /* acquiring list */ +#endif static struct mtx acq_lock; #define ACQ_LOCK_INIT() \ mtx_init(&acq_lock, "acqtree", "fast ipsec acquire list", MTX_DEF) @@ -163,7 +177,9 @@ #define ACQ_UNLOCK() mtx_unlock(&acq_lock) #define ACQ_LOCK_ASSERT() mtx_assert(&acq_lock, MA_OWNED) +#ifndef VIMAGE static LIST_HEAD(_spacqtree, secspacq) spacqtree; /* SP acquiring list */ +#endif static struct mtx spacq_lock; #define SPACQ_LOCK_INIT() \ mtx_init(&spacq_lock, "spacqtree", \ @@ -180,6 +196,7 @@ static const u_int saorder_state_valid_prefer_new[] = { SADB_SASTATE_MATURE, SADB_SASTATE_DYING, }; +#ifndef VIMAGE static u_int
saorder_state_alive[] = { /* except DEAD */ SADB_SASTATE_MATURE, SADB_SASTATE_DYING, SADB_SASTATE_LARVAL @@ -188,7 +205,7 @@ SADB_SASTATE_MATURE, SADB_SASTATE_DYING, SADB_SASTATE_LARVAL, SADB_SASTATE_DEAD }; - +#endif static const int minsize[] = { sizeof(struct sadb_msg), /* SADB_EXT_RESERVED */ sizeof(struct sadb_sa), /* SADB_EXT_SA */ @@ -233,61 +250,60 @@ 0, /* SADB_X_EXT_POLICY */ sizeof(struct sadb_x_sa2), /* SADB_X_SA2 */ }; - +#ifndef VIMAGE static int ipsec_esp_keymin = 256; static int ipsec_esp_auth = 0; static int ipsec_ah_keymin = 128; - +#endif #ifdef SYSCTL_DECL SYSCTL_DECL(_net_key); #endif -SYSCTL_INT(_net_key, KEYCTL_DEBUG_LEVEL, debug, CTLFLAG_RW, \ - &key_debug_level, 0, ""); +SYSCTL_V_INT(V_NET, vnet_ipsec,_net_key, KEYCTL_DEBUG_LEVEL, debug, + CTLFLAG_RW, key_debug_level, 0, ""); /* max count of trial for the decision of spi value */ -SYSCTL_INT(_net_key, KEYCTL_SPI_TRY, spi_trycnt, CTLFLAG_RW, \ - &key_spi_trycnt, 0, ""); +SYSCTL_V_INT(V_NET, vnet_ipsec,_net_key, KEYCTL_SPI_TRY, spi_trycnt, + CTLFLAG_RW, key_spi_trycnt, 0, ""); /* minimum spi value to allocate automatically. */ -SYSCTL_INT(_net_key, KEYCTL_SPI_MIN_VALUE, spi_minval, CTLFLAG_RW, \ - &key_spi_minval, 0, ""); +SYSCTL_V_INT(V_NET, vnet_ipsec, _net_key, KEYCTL_SPI_MIN_VALUE, + spi_minval, CTLFLAG_RW, key_spi_minval, 0, ""); /* maximun spi value to allocate automatically. 
*/ -SYSCTL_INT(_net_key, KEYCTL_SPI_MAX_VALUE, spi_maxval, CTLFLAG_RW, \ - &key_spi_maxval, 0, ""); +SYSCTL_V_INT(V_NET, vnet_ipsec, _net_key, KEYCTL_SPI_MAX_VALUE, + spi_maxval, CTLFLAG_RW, key_spi_maxval, 0, ""); /* interval to initialize randseed */ -SYSCTL_INT(_net_key, KEYCTL_RANDOM_INT, int_random, CTLFLAG_RW, \ - &key_int_random, 0, ""); +SYSCTL_V_INT(V_NET, vnet_ipsec, _net_key, KEYCTL_RANDOM_INT, + int_random, CTLFLAG_RW, key_int_random, 0, ""); /* lifetime for larval SA */ -SYSCTL_INT(_net_key, KEYCTL_LARVAL_LIFETIME, larval_lifetime, CTLFLAG_RW, \ - &key_larval_lifetime, 0, ""); +SYSCTL_V_INT(V_NET, vnet_ipsec, _net_key, KEYCTL_LARVAL_LIFETIME, + larval_lifetime, CTLFLAG_RW, key_larval_lifetime, 0, ""); /* counter for blocking to send SADB_ACQUIRE to IKEd */ -SYSCTL_INT(_net_key, KEYCTL_BLOCKACQ_COUNT, blockacq_count, CTLFLAG_RW, \ - &key_blockacq_count, 0, ""); - +SYSCTL_V_INT(V_NET, vnet_ipsec, _net_key, KEYCTL_BLOCKACQ_COUNT, + blockacq_count, CTLFLAG_RW, key_blockacq_count, 0, ""); /* lifetime for blocking to send SADB_ACQUIRE to IKEd */ -SYSCTL_INT(_net_key, KEYCTL_BLOCKACQ_LIFETIME, blockacq_lifetime, CTLFLAG_RW, \ - &key_blockacq_lifetime, 0, ""); +SYSCTL_V_INT(V_NET, vnet_ipsec, _net_key, KEYCTL_BLOCKACQ_LIFETIME, + blockacq_lifetime, CTLFLAG_RW, key_blockacq_lifetime, 0, ""); /* ESP auth */ -SYSCTL_INT(_net_key, KEYCTL_ESP_AUTH, esp_auth, CTLFLAG_RW, \ - &ipsec_esp_auth, 0, ""); +SYSCTL_V_INT(V_NET, vnet_ipsec, _net_key, KEYCTL_ESP_AUTH, esp_auth, + CTLFLAG_RW, ipsec_esp_auth, 0, ""); /* minimum ESP key length */ -SYSCTL_INT(_net_key, KEYCTL_ESP_KEYMIN, esp_keymin, CTLFLAG_RW, \ - &ipsec_esp_keymin, 0, ""); +SYSCTL_V_INT(V_NET, vnet_ipsec, _net_key, KEYCTL_ESP_KEYMIN, + esp_keymin, CTLFLAG_RW, ipsec_esp_keymin, 0, ""); /* minimum AH key length */ -SYSCTL_INT(_net_key, KEYCTL_AH_KEYMIN, ah_keymin, CTLFLAG_RW, \ - &ipsec_ah_keymin, 0, ""); +SYSCTL_V_INT(V_NET, vnet_ipsec, _net_key, KEYCTL_AH_KEYMIN, ah_keymin, + CTLFLAG_RW, ipsec_ah_keymin, 0, 
""); /* perfered old SA rather than new SA */ -SYSCTL_INT(_net_key, KEYCTL_PREFERED_OLDSA, preferred_oldsa, CTLFLAG_RW,\ - &key_preferred_oldsa, 0, ""); +SYSCTL_V_INT(V_NET, vnet_ipsec, _net_key, KEYCTL_PREFERED_OLDSA, + preferred_oldsa, CTLFLAG_RW, key_preferred_oldsa, 0, ""); #define __LIST_CHAINED(elm) \ (!((elm)->chain.le_next == NULL && (elm)->chain.le_prev == NULL)) @@ -553,8 +569,9 @@ int key_havesp(u_int dir) { + INIT_VNET_IPSEC(curvnet); return (dir == IPSEC_DIR_INBOUND || dir == IPSEC_DIR_OUTBOUND ? - LIST_FIRST(&sptree[dir]) != NULL : 1); + LIST_FIRST(&V_sptree[dir]) != NULL : 1); } /* %%% IPsec policy management */ @@ -567,6 +584,7 @@ struct secpolicy * key_allocsp(struct secpolicyindex *spidx, u_int dir, const char* where, int tag) { + INIT_VNET_IPSEC(curvnet); struct secpolicy *sp; IPSEC_ASSERT(spidx != NULL, ("null spidx")); @@ -582,7 +600,7 @@ kdebug_secpolicyindex(spidx)); SPTREE_LOCK(); - LIST_FOREACH(sp, &sptree[dir], chain) { + LIST_FOREACH(sp, &V_sptree[dir], chain) { KEYDEBUG(KEYDEBUG_IPSEC_DATA, printf("*** in SPD\n"); kdebug_secpolicyindex(&sp->spidx)); @@ -623,6 +641,7 @@ u_int dir, const char* where, int tag) { + INIT_VNET_IPSEC(curvnet); struct secpolicy *sp; IPSEC_ASSERT(dst != NULL, ("null dst")); @@ -639,7 +658,7 @@ kdebug_sockaddr(&dst->sa)); SPTREE_LOCK(); - LIST_FOREACH(sp, &sptree[dir], chain) { + LIST_FOREACH(sp, &V_sptree[dir], chain) { KEYDEBUG(KEYDEBUG_IPSEC_DATA, printf("*** in SPD\n"); kdebug_secpolicyindex(&sp->spidx)); @@ -684,6 +703,7 @@ const struct sockaddr *idst, const char* where, int tag) { + INIT_VNET_IPSEC(curvnet); struct secpolicy *sp; const int dir = IPSEC_DIR_INBOUND; struct ipsecrequest *r1, *r2, *p; @@ -700,7 +720,7 @@ } SPTREE_LOCK(); - LIST_FOREACH(sp, &sptree[dir], chain) { + LIST_FOREACH(sp, &V_sptree[dir], chain) { if (sp->state == IPSEC_SPSTATE_DEAD) continue; @@ -758,6 +778,7 @@ int key_checkrequest(struct ipsecrequest *isr, const struct secasindex *saidx) { + INIT_VNET_IPSEC(curvnet); u_int level; int 
error; @@ -852,6 +873,7 @@ static struct secasvar * key_allocsa_policy(const struct secasindex *saidx) { + INIT_VNET_IPSEC(curvnet); #define N(a) _ARRAYLEN(a) struct secashead *sah; struct secasvar *sav; @@ -859,11 +881,11 @@ const u_int *state_valid; SAHTREE_LOCK(); - LIST_FOREACH(sah, &sahtree, chain) { + LIST_FOREACH(sah, &V_sahtree, chain) { if (sah->state == SADB_SASTATE_DEAD) continue; if (key_cmpsaidx(&sah->saidx, saidx, CMP_MODE_REQID)) { - if (key_preferred_oldsa) { + if (V_key_preferred_oldsa) { state_valid = saorder_state_valid_prefer_old; arraysize = N(saorder_state_valid_prefer_old); } else { @@ -900,6 +922,7 @@ static struct secasvar * key_do_allocsa_policy(struct secashead *sah, u_int state) { + INIT_VNET_IPSEC(curvnet); struct secasvar *sav, *nextsav, *candidate, *d; /* initilize */ @@ -928,7 +951,7 @@ IPSEC_ASSERT(sav->lft_c != NULL, ("null sav lifetime")); /* What the best method is to compare ? */ - if (key_preferred_oldsa) { + if (V_key_preferred_oldsa) { if (candidate->lft_c->addtime > sav->lft_c->addtime) { candidate = sav; @@ -1045,6 +1068,7 @@ u_int32_t spi, const char* where, int tag) { + INIT_VNET_IPSEC(curvnet); struct secashead *sah; struct secasvar *sav; u_int stateidx, arraysize, state; @@ -1062,14 +1086,14 @@ * encrypted so we can't check internal IP header. 
*/ SAHTREE_LOCK(); - if (key_preferred_oldsa) { + if (V_key_preferred_oldsa) { saorder_state_valid = saorder_state_valid_prefer_old; arraysize = _ARRAYLEN(saorder_state_valid_prefer_old); } else { saorder_state_valid = saorder_state_valid_prefer_new; arraysize = _ARRAYLEN(saorder_state_valid_prefer_new); } - LIST_FOREACH(sah, &sahtree, chain) { + LIST_FOREACH(sah, &V_sahtree, chain) { /* search valid state */ for (stateidx = 0; stateidx < arraysize; stateidx++) { state = saorder_state_valid[stateidx]; @@ -1114,6 +1138,7 @@ void _key_freesp(struct secpolicy **spp, const char* where, int tag) { + INIT_VNET_IPSEC(curvnet); struct secpolicy *sp = *spp; IPSEC_ASSERT(sp != NULL, ("null sp")); @@ -1139,6 +1164,7 @@ void key_freeso(struct socket *so) { + INIT_VNET_IPSEC(curvnet); IPSEC_ASSERT(so != NULL, ("null so")); switch (so->so_proto->pr_domain->dom_family) { @@ -1207,6 +1233,7 @@ void key_freesav(struct secasvar **psav, const char* where, int tag) { + INIT_VNET_IPSEC(curvnet); struct secasvar *sav = *psav; IPSEC_ASSERT(sav != NULL, ("null sav")); @@ -1265,12 +1292,13 @@ static struct secpolicy * key_getsp(struct secpolicyindex *spidx) { + INIT_VNET_IPSEC(curvnet); struct secpolicy *sp; IPSEC_ASSERT(spidx != NULL, ("null spidx")); SPTREE_LOCK(); - LIST_FOREACH(sp, &sptree[spidx->dir], chain) { + LIST_FOREACH(sp, &V_sptree[spidx->dir], chain) { if (sp->state == IPSEC_SPSTATE_DEAD) continue; if (key_cmpspidx_exactly(spidx, &sp->spidx)) { @@ -1291,10 +1319,11 @@ static struct secpolicy * key_getspbyid(u_int32_t id) { + INIT_VNET_IPSEC(curvnet); struct secpolicy *sp; SPTREE_LOCK(); - LIST_FOREACH(sp, &sptree[IPSEC_DIR_INBOUND], chain) { + LIST_FOREACH(sp, &V_sptree[IPSEC_DIR_INBOUND], chain) { if (sp->state == IPSEC_SPSTATE_DEAD) continue; if (sp->id == id) { @@ -1303,7 +1332,7 @@ } } - LIST_FOREACH(sp, &sptree[IPSEC_DIR_OUTBOUND], chain) { + LIST_FOREACH(sp, &V_sptree[IPSEC_DIR_OUTBOUND], chain) { if (sp->state == IPSEC_SPSTATE_DEAD) continue; if (sp->id == id) { @@ 
-1320,6 +1349,7 @@ struct secpolicy * key_newsp(const char* where, int tag) { + INIT_VNET_IPSEC(curvnet); struct secpolicy *newsp = NULL; newsp = (struct secpolicy *) @@ -1354,6 +1384,7 @@ size_t len; int *error; { + INIT_VNET_IPSEC(curvnet); struct secpolicy *newsp; IPSEC_ASSERT(xpl0 != NULL, ("null xpl0")); @@ -1751,6 +1782,7 @@ struct mbuf *m; const struct sadb_msghdr *mhp; { + INIT_VNET_IPSEC(curvnet); struct sadb_address *src0, *dst0; struct sadb_x_policy *xpl0, *xpl; struct sadb_lifetime *lft = NULL; @@ -1905,7 +1937,7 @@ newsp->refcnt = 1; /* do not reclaim until I say I do */ newsp->state = IPSEC_SPSTATE_ALIVE; - LIST_INSERT_TAIL(&sptree[newsp->spidx.dir], newsp, secpolicy, chain); + LIST_INSERT_TAIL(&V_sptree[newsp->spidx.dir], newsp, secpolicy, chain); /* delete the entry in spacqtree */ if (mhp->msg->sadb_msg_type == SADB_X_SPDUPDATE) { @@ -1973,13 +2005,14 @@ static u_int32_t key_getnewspid() { + INIT_VNET_IPSEC(curvnet); u_int32_t newid = 0; - int count = key_spi_trycnt; /* XXX */ + int count = V_key_spi_trycnt; /* XXX */ struct secpolicy *sp; /* when requesting to allocate spi ranged */ while (count--) { - newid = (policy_id = (policy_id == ~0 ? 1 : policy_id + 1)); + newid = (V_policy_id = (V_policy_id == ~0 ? 
1 : V_policy_id + 1)); if ((sp = key_getspbyid(newid)) == NULL) break; @@ -2014,6 +2047,7 @@ struct mbuf *m; const struct sadb_msghdr *mhp; { + INIT_VNET_IPSEC(curvnet); struct sadb_address *src0, *dst0; struct sadb_x_policy *xpl0; struct secpolicyindex spidx; @@ -2112,6 +2146,7 @@ struct mbuf *m; const struct sadb_msghdr *mhp; { + INIT_VNET_IPSEC(curvnet); u_int32_t id; struct secpolicy *sp; @@ -2204,6 +2239,7 @@ struct mbuf *m; const struct sadb_msghdr *mhp; { + INIT_VNET_IPSEC(curvnet); u_int32_t id; struct secpolicy *sp; struct mbuf *n; @@ -2255,6 +2291,7 @@ key_spdacquire(sp) struct secpolicy *sp; { + INIT_VNET_IPSEC(curvnet); struct mbuf *result = NULL, *m; struct secspacq *newspacq; @@ -2266,7 +2303,7 @@ /* Get an entry to check whether sent message or not. */ newspacq = key_getspacq(&sp->spidx); if (newspacq != NULL) { - if (key_blockacq_count < newspacq->count) { + if (V_key_blockacq_count < newspacq->count) { /* reset counter and do send message. */ newspacq->count = 0; } else { @@ -2317,6 +2354,8 @@ struct mbuf *m; const struct sadb_msghdr *mhp; { + printf("\n---> key_spdflush()..\n"); + INIT_VNET_IPSEC(curvnet); struct sadb_msg *newmsg; struct secpolicy *sp; u_int dir; @@ -2331,7 +2370,7 @@ for (dir = 0; dir < IPSEC_DIR_MAX; dir++) { SPTREE_LOCK(); - LIST_FOREACH(sp, &sptree[dir], chain) + LIST_FOREACH(sp, &V_sptree[dir], chain) sp->state = IPSEC_SPSTATE_DEAD; SPTREE_UNLOCK(); } @@ -2369,6 +2408,7 @@ struct mbuf *m; const struct sadb_msghdr *mhp; { + INIT_VNET_IPSEC(curvnet); struct secpolicy *sp; int cnt; u_int dir; @@ -2382,7 +2422,7 @@ /* search SPD entry and get buffer size. 
*/ cnt = 0; for (dir = 0; dir < IPSEC_DIR_MAX; dir++) { - LIST_FOREACH(sp, &sptree[dir], chain) { + LIST_FOREACH(sp, &V_sptree[dir], chain) { cnt++; } } @@ -2391,7 +2431,7 @@ return key_senderror(so, m, ENOENT); for (dir = 0; dir < IPSEC_DIR_MAX; dir++) { - LIST_FOREACH(sp, &sptree[dir], chain) { + LIST_FOREACH(sp, &V_sptree[dir], chain) { --cnt; n = key_setdumpsp(sp, SADB_X_SPDDUMP, cnt, mhp->msg->sadb_msg_pid); @@ -2614,6 +2654,7 @@ key_newsah(saidx) struct secasindex *saidx; { + INIT_VNET_IPSEC(curvnet); struct secashead *newsah; IPSEC_ASSERT(saidx != NULL, ("null saidx")); @@ -2629,7 +2670,7 @@ newsah->state = SADB_SASTATE_MATURE; SAHTREE_LOCK(); - LIST_INSERT_HEAD(&sahtree, newsah, chain); + LIST_INSERT_HEAD(&V_sahtree, newsah, chain); SAHTREE_UNLOCK(); } return(newsah); @@ -2642,6 +2683,7 @@ key_delsah(sah) struct secashead *sah; { + INIT_VNET_IPSEC(curvnet); struct secasvar *sav, *nextsav; u_int stateidx; int zombie = 0; @@ -2651,9 +2693,9 @@ /* searching all SA registerd in the secindex. */ for (stateidx = 0; - stateidx < _ARRAYLEN(saorder_state_any); + stateidx < _ARRAYLEN(V_saorder_state_any); stateidx++) { - u_int state = saorder_state_any[stateidx]; + u_int state = V_saorder_state_any[stateidx]; LIST_FOREACH_SAFE(sav, &sah->savtree[state], chain, nextsav) { if (sav->refcnt == 0) { /* sanity check */ @@ -2698,6 +2740,7 @@ const char* where; int tag; { + INIT_VNET_IPSEC(curvnet); struct secasvar *newsav; const struct sadb_sa *xsa; @@ -2721,7 +2764,7 @@ /* sync sequence number */ if (mhp->msg->sadb_msg_seq == 0) newsav->seq = - (acq_seq = (acq_seq == ~0 ? 1 : ++acq_seq)); + (V_acq_seq = (V_acq_seq == ~0 ? 
1 : ++V_acq_seq)); else #endif newsav->seq = mhp->msg->sadb_msg_seq; @@ -2864,10 +2907,11 @@ key_getsah(saidx) struct secasindex *saidx; { + INIT_VNET_IPSEC(curvnet); struct secashead *sah; SAHTREE_LOCK(); - LIST_FOREACH(sah, &sahtree, chain) { + LIST_FOREACH(sah, &V_sahtree, chain) { if (sah->state == SADB_SASTATE_DEAD) continue; if (key_cmpsaidx(&sah->saidx, saidx, CMP_REQID)) @@ -2890,6 +2934,7 @@ struct secasindex *saidx; u_int32_t spi; { + INIT_VNET_IPSEC(curvnet); struct secashead *sah; struct secasvar *sav; @@ -2903,7 +2948,7 @@ sav = NULL; /* check all SAD */ SAHTREE_LOCK(); - LIST_FOREACH(sah, &sahtree, chain) { + LIST_FOREACH(sah, &V_sahtree, chain) { if (!key_ismyaddr((struct sockaddr *)&sah->saidx.dst)) continue; sav = key_getsavbyspi(sah, spi); @@ -2926,6 +2971,7 @@ struct secashead *sah; u_int32_t spi; { + INIT_VNET_IPSEC(curvnet); struct secasvar *sav; u_int stateidx, state; @@ -2933,10 +2979,10 @@ SAHTREE_LOCK_ASSERT(); /* search all status */ for (stateidx = 0; - stateidx < _ARRAYLEN(saorder_state_alive); + stateidx < _ARRAYLEN(V_saorder_state_alive); stateidx++) { - state = saorder_state_alive[stateidx]; + state = V_saorder_state_alive[stateidx]; LIST_FOREACH(sav, &sah->savtree[state], chain) { /* sanity check */ @@ -2969,6 +3015,7 @@ struct mbuf *m; const struct sadb_msghdr *mhp; { + INIT_VNET_IPSEC(curvnet); int error = 0; IPSEC_ASSERT(m != NULL, ("null mbuf")); @@ -3201,6 +3248,7 @@ static int key_mature(struct secasvar *sav) { + INIT_VNET_IPSEC(curvnet); int error; /* check SPI value */ @@ -3631,6 +3679,7 @@ key_dup_keymsg(const struct sadb_key *src, u_int len, struct malloc_type *type) { + INIT_VNET_IPSEC(curvnet); struct seckey *dst; dst = (struct seckey *)malloc(sizeof(struct seckey), type, M_NOWAIT); if (dst != NULL) { @@ -3664,6 +3713,7 @@ key_dup_lifemsg(const struct sadb_lifetime *src, struct malloc_type *type) { + INIT_VNET_IPSEC(curvnet); struct seclifetime *dst = NULL; dst = (struct seclifetime *)malloc(sizeof(struct seclifetime), @@ 
-3689,6 +3739,7 @@ struct sockaddr *sa; { #ifdef INET + INIT_VNET_INET(curvnet); struct sockaddr_in *sin; struct in_ifaddr *ia; #endif @@ -3699,7 +3750,7 @@ #ifdef INET case AF_INET: sin = (struct sockaddr_in *)sa; - for (ia = in_ifaddrhead.tqh_first; ia; + for (ia = V_in_ifaddrhead.tqh_first; ia; ia = ia->ia_link.tqe_next) { if (sin->sin_family == ia->ia_addr.sin_family && @@ -3733,10 +3784,11 @@ key_ismyaddr6(sin6) struct sockaddr_in6 *sin6; { + INIT_VNET_INET6(curvnet); struct in6_ifaddr *ia; struct in6_multi *in6m; - for (ia = in6_ifaddr; ia; ia = ia->ia_next) { + for (ia = V_in6_ifaddr; ia; ia = ia->ia_next) { if (key_sockaddrcmp((struct sockaddr *)&sin6, (struct sockaddr *)&ia->ia_addr, 0) == 0) return 1; @@ -4054,6 +4106,7 @@ static void key_flush_spd(time_t now) { + INIT_VNET_IPSEC(curvnet); static u_int16_t sptree_scangen = 0; u_int16_t gen = sptree_scangen++; struct secpolicy *sp; @@ -4063,7 +4116,7 @@ for (dir = 0; dir < IPSEC_DIR_MAX; dir++) { restart: SPTREE_LOCK(); - LIST_FOREACH(sp, &sptree[dir], chain) { + LIST_FOREACH(sp, &V_sptree[dir], chain) { if (sp->scangen == gen) /* previously handled */ continue; sp->scangen = gen; @@ -4091,12 +4144,13 @@ static void key_flush_sad(time_t now) { + INIT_VNET_IPSEC(curvnet); struct secashead *sah, *nextsah; struct secasvar *sav, *nextsav; /* SAD */ SAHTREE_LOCK(); - LIST_FOREACH_SAFE(sah, &sahtree, chain, nextsah) { + LIST_FOREACH_SAFE(sah, &V_sahtree, chain, nextsah) { /* if sah has been dead, then delete it and process next sah. */ if (sah->state == SADB_SASTATE_DEAD) { key_delsah(sah); @@ -4105,7 +4159,7 @@ /* if LARVAL entry doesn't become MATURE, delete it. 
*/ LIST_FOREACH_SAFE(sav, &sah->savtree[SADB_SASTATE_LARVAL], chain, nextsav) { - if (now - sav->created > key_larval_lifetime) + if (now - sav->created > V_key_larval_lifetime) KEY_FREESAV(&sav); } @@ -4229,13 +4283,14 @@ static void key_flush_acq(time_t now) { + INIT_VNET_IPSEC(curvnet); struct secacq *acq, *nextacq; /* ACQ tree */ ACQ_LOCK(); - for (acq = LIST_FIRST(&acqtree); acq != NULL; acq = nextacq) { + for (acq = LIST_FIRST(&V_acqtree); acq != NULL; acq = nextacq) { nextacq = LIST_NEXT(acq, chain); - if (now - acq->created > key_blockacq_lifetime + if (now - acq->created > V_key_blockacq_lifetime && __LIST_CHAINED(acq)) { LIST_REMOVE(acq, chain); free(acq, M_IPSEC_SAQ); @@ -4247,13 +4302,14 @@ static void key_flush_spacq(time_t now) { + INIT_VNET_IPSEC(curvnet); struct secspacq *acq, *nextacq; /* SP ACQ tree */ SPACQ_LOCK(); - for (acq = LIST_FIRST(&spacqtree); acq != NULL; acq = nextacq) { + for (acq = LIST_FIRST(&V_spacqtree); acq != NULL; acq = nextacq) { nextacq = LIST_NEXT(acq, chain); - if (now - acq->created > key_blockacq_lifetime + if (now - acq->created > V_key_blockacq_lifetime && __LIST_CHAINED(acq)) { LIST_REMOVE(acq, chain); free(acq, M_IPSEC_SAQ); @@ -4272,11 +4328,13 @@ key_timehandler(void) { time_t now = time_second; - + + VNET_ITERLOOP_BEGIN(); key_flush_spd(now); key_flush_sad(now); key_flush_acq(now); key_flush_spacq(now); + VNET_ITERLOOP_END(); #ifndef IPSEC_DEBUG2 /* do exchange to tick time !! 
*/ @@ -4389,6 +4447,7 @@ struct mbuf *m; const struct sadb_msghdr *mhp; { + INIT_VNET_IPSEC(curvnet); struct sadb_address *src0, *dst0; struct secasindex saidx; struct secashead *newsah; @@ -4583,17 +4642,18 @@ struct sadb_spirange *spirange; struct secasindex *saidx; { + INIT_VNET_IPSEC(curvnet); u_int32_t newspi; u_int32_t min, max; - int count = key_spi_trycnt; + int count = V_key_spi_trycnt; /* set spi range to allocate */ if (spirange != NULL) { min = spirange->sadb_spirange_min; max = spirange->sadb_spirange_max; } else { - min = key_spi_minval; - max = key_spi_maxval; + min = V_key_spi_minval; + max = V_key_spi_maxval; } /* IPCOMP needs 2-byte SPI */ if (saidx->proto == IPPROTO_IPCOMP) { @@ -4640,7 +4700,7 @@ /* statistics */ keystat.getspi_count = - (keystat.getspi_count + key_spi_trycnt - count) / 2; + (keystat.getspi_count + V_key_spi_trycnt - count) / 2; return newspi; } @@ -4664,6 +4724,7 @@ struct mbuf *m; const struct sadb_msghdr *mhp; { + INIT_VNET_IPSEC(curvnet); struct sadb_sa *sa0; struct sadb_address *src0, *dst0; struct secasindex saidx; @@ -4862,6 +4923,7 @@ struct mbuf *m; const struct sadb_msghdr *mhp; { + INIT_VNET_IPSEC(curvnet); struct sadb_sa *sa0; struct sadb_address *src0, *dst0; struct secasindex saidx; @@ -4985,6 +5047,7 @@ struct mbuf *m; const struct sadb_msghdr *mhp; { + INIT_VNET_IPSEC(curvnet); const struct sadb_ident *idsrc, *iddst; int idsrclen, iddstlen; @@ -5107,6 +5170,7 @@ struct mbuf *m; const struct sadb_msghdr *mhp; { + INIT_VNET_IPSEC(curvnet); struct sadb_sa *sa0; struct sadb_address *src0, *dst0; struct secasindex saidx; @@ -5163,7 +5227,7 @@ /* get a SA header */ SAHTREE_LOCK(); - LIST_FOREACH(sah, &sahtree, chain) { + LIST_FOREACH(sah, &V_sahtree, chain) { if (sah->state == SADB_SASTATE_DEAD) continue; if (key_cmpsaidx(&sah->saidx, &saidx, CMP_HEAD) == 0) @@ -5218,6 +5282,7 @@ const struct sadb_msghdr *mhp; u_int16_t proto; { + INIT_VNET_IPSEC(curvnet); struct sadb_address *src0, *dst0; struct secasindex saidx; 
struct secashead *sah; @@ -5231,7 +5296,7 @@ KEY_SETSECASIDX(proto, IPSEC_MODE_ANY, 0, src0 + 1, dst0 + 1, &saidx); SAHTREE_LOCK(); - LIST_FOREACH(sah, &sahtree, chain) { + LIST_FOREACH(sah, &V_sahtree, chain) { if (sah->state == SADB_SASTATE_DEAD) continue; if (key_cmpsaidx(&sah->saidx, &saidx, CMP_HEAD) == 0) @@ -5239,9 +5304,9 @@ /* Delete all non-LARVAL SAs. */ for (stateidx = 0; - stateidx < _ARRAYLEN(saorder_state_alive); + stateidx < _ARRAYLEN(V_saorder_state_alive); stateidx++) { - state = saorder_state_alive[stateidx]; + state = V_saorder_state_alive[stateidx]; if (state == SADB_SASTATE_LARVAL) continue; for (sav = LIST_FIRST(&sah->savtree[state]); @@ -5303,6 +5368,7 @@ struct mbuf *m; const struct sadb_msghdr *mhp; { + INIT_VNET_IPSEC(curvnet); struct sadb_sa *sa0; struct sadb_address *src0, *dst0; struct secasindex saidx; @@ -5346,7 +5412,7 @@ /* get a SA header */ SAHTREE_LOCK(); - LIST_FOREACH(sah, &sahtree, chain) { + LIST_FOREACH(sah, &V_sahtree, chain) { if (sah->state == SADB_SASTATE_DEAD) continue; if (key_cmpsaidx(&sah->saidx, &saidx, CMP_HEAD) == 0) @@ -5408,6 +5474,7 @@ static struct mbuf * key_getcomb_esp() { + INIT_VNET_IPSEC(curvnet); struct sadb_comb *comb; struct enc_xform *algo; struct mbuf *result = NULL, *m, *n; @@ -5423,14 +5490,14 @@ continue; /* discard algorithms with key size smaller than system min */ - if (_BITS(algo->maxkey) < ipsec_esp_keymin) + if (_BITS(algo->maxkey) < V_ipsec_esp_keymin) continue; - if (_BITS(algo->minkey) < ipsec_esp_keymin) - encmin = ipsec_esp_keymin; + if (_BITS(algo->minkey) < V_ipsec_esp_keymin) + encmin = V_ipsec_esp_keymin; else encmin = _BITS(algo->minkey); - if (ipsec_esp_auth) + if (V_ipsec_esp_auth) m = key_getcomb_ah(); else { IPSEC_ASSERT(l <= MLEN, @@ -5486,6 +5553,7 @@ u_int16_t* min, u_int16_t* max) { + INIT_VNET_IPSEC(curvnet); *min = *max = ah->keysize; if (ah->keysize == 0) { /* @@ -5510,6 +5578,7 @@ static struct mbuf * key_getcomb_ah() { + INIT_VNET_IPSEC(curvnet); struct sadb_comb 
*comb; struct auth_hash *algo; struct mbuf *m; @@ -5529,7 +5598,7 @@ continue; key_getsizes_ah(algo, i, &minkeysize, &maxkeysize); /* discard algorithms with key size smaller than system min */ - if (_BITS(minkeysize) < ipsec_ah_keymin) + if (_BITS(minkeysize) < V_ipsec_ah_keymin) continue; if (!m) { @@ -5670,6 +5739,7 @@ static int key_acquire(const struct secasindex *saidx, struct secpolicy *sp) { + INIT_VNET_IPSEC(curvnet); struct mbuf *result = NULL, *m; struct secacq *newacq; u_int8_t satype; @@ -5688,7 +5758,7 @@ */ /* Get an entry to check whether sending message or not. */ if ((newacq = key_getacq(saidx)) != NULL) { - if (key_blockacq_count < newacq->count) { + if (V_key_blockacq_count < newacq->count) { /* reset counter and do send message. */ newacq->count = 0; } else { @@ -5835,6 +5905,7 @@ static struct secacq * key_newacq(const struct secasindex *saidx) { + INIT_VNET_IPSEC(curvnet); struct secacq *newacq; /* get new entry */ @@ -5846,13 +5917,13 @@ /* copy secindex */ bcopy(saidx, &newacq->saidx, sizeof(newacq->saidx)); - newacq->seq = (acq_seq == ~0 ? 1 : ++acq_seq); + newacq->seq = (V_acq_seq == ~0 ? 
1 : ++V_acq_seq); newacq->created = time_second; newacq->count = 0; /* add to acqtree */ ACQ_LOCK(); - LIST_INSERT_HEAD(&acqtree, newacq, chain); + LIST_INSERT_HEAD(&V_acqtree, newacq, chain); ACQ_UNLOCK(); return newacq; @@ -5861,10 +5932,11 @@ static struct secacq * key_getacq(const struct secasindex *saidx) { + INIT_VNET_IPSEC(curvnet); struct secacq *acq; ACQ_LOCK(); - LIST_FOREACH(acq, &acqtree, chain) { + LIST_FOREACH(acq, &V_acqtree, chain) { if (key_cmpsaidx(saidx, &acq->saidx, CMP_EXACTLY)) break; } @@ -5877,10 +5949,11 @@ key_getacqbyseq(seq) u_int32_t seq; { + INIT_VNET_IPSEC(curvnet); struct secacq *acq; ACQ_LOCK(); - LIST_FOREACH(acq, &acqtree, chain) { + LIST_FOREACH(acq, &V_acqtree, chain) { if (acq->seq == seq) break; } @@ -5893,6 +5966,7 @@ key_newspacq(spidx) struct secpolicyindex *spidx; { + INIT_VNET_IPSEC(curvnet); struct secspacq *acq; /* get new entry */ @@ -5909,7 +5983,7 @@ /* add to spacqtree */ SPACQ_LOCK(); - LIST_INSERT_HEAD(&spacqtree, acq, chain); + LIST_INSERT_HEAD(&V_spacqtree, acq, chain); SPACQ_UNLOCK(); return acq; @@ -5919,10 +5993,11 @@ key_getspacq(spidx) struct secpolicyindex *spidx; { + INIT_VNET_IPSEC(curvnet); struct secspacq *acq; SPACQ_LOCK(); - LIST_FOREACH(acq, &spacqtree, chain) { + LIST_FOREACH(acq, &V_spacqtree, chain) { if (key_cmpspidx_exactly(spidx, &acq->spidx)) { /* NB: return holding spacq_lock */ return acq; @@ -5953,6 +6028,7 @@ struct mbuf *m; const struct sadb_msghdr *mhp; { + INIT_VNET_IPSEC(curvnet); const struct sadb_address *src0, *dst0; struct secasindex saidx; struct secashead *sah; @@ -6033,7 +6109,7 @@ /* get a SA index */ SAHTREE_LOCK(); - LIST_FOREACH(sah, &sahtree, chain) { + LIST_FOREACH(sah, &V_sahtree, chain) { if (sah->state == SADB_SASTATE_DEAD) continue; if (key_cmpsaidx(&sah->saidx, &saidx, CMP_MODE_REQID)) @@ -6074,6 +6150,7 @@ struct mbuf *m; const struct sadb_msghdr *mhp; { + INIT_VNET_IPSEC(curvnet); struct secreg *reg, *newreg = 0; IPSEC_ASSERT(so != NULL, ("null socket")); @@ 
-6082,7 +6159,7 @@ IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); /* check for invalid register message */ - if (mhp->msg->sadb_msg_satype >= sizeof(regtree)/sizeof(regtree[0])) + if (mhp->msg->sadb_msg_satype >= sizeof(V_regtree)/sizeof(V_regtree[0])) return key_senderror(so, m, EINVAL); /* When SATYPE_UNSPEC is specified, only return sabd_supported. */ @@ -6091,7 +6168,7 @@ /* check whether existing or not */ REGTREE_LOCK(); - LIST_FOREACH(reg, &regtree[mhp->msg->sadb_msg_satype], chain) { + LIST_FOREACH(reg, &V_regtree[mhp->msg->sadb_msg_satype], chain) { if (reg->so == so) { REGTREE_UNLOCK(); ipseclog((LOG_DEBUG, "%s: socket exists already.\n", @@ -6112,7 +6189,7 @@ ((struct keycb *)sotorawcb(so))->kp_registered++; /* add regnode to regtree. */ - LIST_INSERT_HEAD(&regtree[mhp->msg->sadb_msg_satype], newreg, chain); + LIST_INSERT_HEAD(&V_regtree[mhp->msg->sadb_msg_satype], newreg, chain); REGTREE_UNLOCK(); setmsg: @@ -6228,6 +6305,7 @@ void key_freereg(struct socket *so) { + INIT_VNET_IPSEC(curvnet); struct secreg *reg; int i; @@ -6240,7 +6318,7 @@ */ REGTREE_LOCK(); for (i = 0; i <= SADB_SATYPE_MAX; i++) { - LIST_FOREACH(reg, &regtree[i], chain) { + LIST_FOREACH(reg, &V_regtree[i], chain) { if (reg->so == so && __LIST_CHAINED(reg)) { LIST_REMOVE(reg, chain); free(reg, M_IPSEC_SAR); @@ -6394,6 +6472,7 @@ struct mbuf *m; const struct sadb_msghdr *mhp; { + INIT_VNET_IPSEC(curvnet); struct sadb_msg *newmsg; struct secashead *sah, *nextsah; struct secasvar *sav, *nextsav; @@ -6414,7 +6493,7 @@ /* no SATYPE specified, i.e. flushing all SA. 
*/ SAHTREE_LOCK(); - for (sah = LIST_FIRST(&sahtree); + for (sah = LIST_FIRST(&V_sahtree); sah != NULL; sah = nextsah) { nextsah = LIST_NEXT(sah, chain); @@ -6424,9 +6503,9 @@ continue; for (stateidx = 0; - stateidx < _ARRAYLEN(saorder_state_alive); + stateidx < _ARRAYLEN(V_saorder_state_alive); stateidx++) { - state = saorder_state_any[stateidx]; + state = V_saorder_state_any[stateidx]; for (sav = LIST_FIRST(&sah->savtree[state]); sav != NULL; sav = nextsav) { @@ -6477,6 +6556,7 @@ struct mbuf *m; const struct sadb_msghdr *mhp; { + INIT_VNET_IPSEC(curvnet); struct secashead *sah; struct secasvar *sav; u_int16_t proto; @@ -6502,15 +6582,15 @@ /* count sav entries to be sent to the userland. */ cnt = 0; SAHTREE_LOCK(); - LIST_FOREACH(sah, &sahtree, chain) { + LIST_FOREACH(sah, &V_sahtree, chain) { if (mhp->msg->sadb_msg_satype != SADB_SATYPE_UNSPEC && proto != sah->saidx.proto) continue; for (stateidx = 0; - stateidx < _ARRAYLEN(saorder_state_any); + stateidx < _ARRAYLEN(V_saorder_state_any); stateidx++) { - state = saorder_state_any[stateidx]; + state = V_saorder_state_any[stateidx]; LIST_FOREACH(sav, &sah->savtree[state], chain) { cnt++; } @@ -6524,7 +6604,7 @@ /* send this to the userland, one at a time. 
*/ newmsg = NULL; - LIST_FOREACH(sah, &sahtree, chain) { + LIST_FOREACH(sah, &V_sahtree, chain) { if (mhp->msg->sadb_msg_satype != SADB_SATYPE_UNSPEC && proto != sah->saidx.proto) continue; @@ -6538,9 +6618,9 @@ } for (stateidx = 0; - stateidx < _ARRAYLEN(saorder_state_any); + stateidx < _ARRAYLEN(V_saorder_state_any); stateidx++) { - state = saorder_state_any[stateidx]; + state = V_saorder_state_any[stateidx]; LIST_FOREACH(sav, &sah->savtree[state], chain) { n = key_setdumpsa(sav, SADB_DUMP, satype, --cnt, mhp->msg->sadb_msg_pid); @@ -6657,6 +6737,7 @@ struct mbuf *m; struct socket *so; { + INIT_VNET_IPSEC(curvnet); struct sadb_msg *msg; struct sadb_msghdr mh; u_int orglen; @@ -6684,7 +6765,7 @@ if ((m->m_flags & M_PKTHDR) == 0 || m->m_pkthdr.len != m->m_pkthdr.len) { ipseclog((LOG_DEBUG, "%s: invalid message length.\n",__func__)); - pfkeystat.out_invlen++; + V_pfkeystat.out_invlen++; error = EINVAL; goto senderror; } @@ -6692,7 +6773,7 @@ if (msg->sadb_msg_version != PF_KEY_V2) { ipseclog((LOG_DEBUG, "%s: PF_KEY version %u is mismatched.\n", __func__, msg->sadb_msg_version)); - pfkeystat.out_invver++; + V_pfkeystat.out_invver++; error = EINVAL; goto senderror; } @@ -6700,7 +6781,7 @@ if (msg->sadb_msg_type > SADB_MAX) { ipseclog((LOG_DEBUG, "%s: invalid type %u is passed.\n", __func__, msg->sadb_msg_type)); - pfkeystat.out_invmsgtype++; + V_pfkeystat.out_invmsgtype++; error = EINVAL; goto senderror; } @@ -6753,7 +6834,7 @@ ipseclog((LOG_DEBUG, "%s: must specify satype " "when msg type=%u.\n", __func__, msg->sadb_msg_type)); - pfkeystat.out_invsatype++; + V_pfkeystat.out_invsatype++; error = EINVAL; goto senderror; } @@ -6773,7 +6854,7 @@ case SADB_X_SPDDELETE2: ipseclog((LOG_DEBUG, "%s: illegal satype=%u\n", __func__, msg->sadb_msg_type)); - pfkeystat.out_invsatype++; + V_pfkeystat.out_invsatype++; error = EINVAL; goto senderror; } @@ -6784,7 +6865,7 @@ case SADB_SATYPE_MIP: ipseclog((LOG_DEBUG, "%s: type %u isn't supported.\n", __func__, msg->sadb_msg_satype)); 
- pfkeystat.out_invsatype++; + V_pfkeystat.out_invsatype++; error = EOPNOTSUPP; goto senderror; case 1: /* XXX: What does it do? */ @@ -6794,7 +6875,7 @@ default: ipseclog((LOG_DEBUG, "%s: invalid type %u is passed.\n", __func__, msg->sadb_msg_satype)); - pfkeystat.out_invsatype++; + V_pfkeystat.out_invsatype++; error = EINVAL; goto senderror; } @@ -6812,7 +6893,7 @@ if (src0->sadb_address_proto != dst0->sadb_address_proto) { ipseclog((LOG_DEBUG, "%s: upper layer protocol " "mismatched.\n", __func__)); - pfkeystat.out_invaddr++; + V_pfkeystat.out_invaddr++; error = EINVAL; goto senderror; } @@ -6822,7 +6903,7 @@ PFKEY_ADDR_SADDR(dst0)->sa_family) { ipseclog((LOG_DEBUG, "%s: address family mismatched.\n", __func__)); - pfkeystat.out_invaddr++; + V_pfkeystat.out_invaddr++; error = EINVAL; goto senderror; } @@ -6830,7 +6911,7 @@ PFKEY_ADDR_SADDR(dst0)->sa_len) { ipseclog((LOG_DEBUG, "%s: address struct size " "mismatched.\n", __func__)); - pfkeystat.out_invaddr++; + V_pfkeystat.out_invaddr++; error = EINVAL; goto senderror; } @@ -6839,7 +6920,7 @@ case AF_INET: if (PFKEY_ADDR_SADDR(src0)->sa_len != sizeof(struct sockaddr_in)) { - pfkeystat.out_invaddr++; + V_pfkeystat.out_invaddr++; error = EINVAL; goto senderror; } @@ -6847,7 +6928,7 @@ case AF_INET6: if (PFKEY_ADDR_SADDR(src0)->sa_len != sizeof(struct sockaddr_in6)) { - pfkeystat.out_invaddr++; + V_pfkeystat.out_invaddr++; error = EINVAL; goto senderror; } @@ -6855,7 +6936,7 @@ default: ipseclog((LOG_DEBUG, "%s: unsupported address family\n", __func__)); - pfkeystat.out_invaddr++; + V_pfkeystat.out_invaddr++; error = EAFNOSUPPORT; goto senderror; } @@ -6877,7 +6958,7 @@ dst0->sadb_address_prefixlen > plen) { ipseclog((LOG_DEBUG, "%s: illegal prefixlen.\n", __func__)); - pfkeystat.out_invaddr++; + V_pfkeystat.out_invaddr++; error = EINVAL; goto senderror; } @@ -6890,7 +6971,7 @@ if (msg->sadb_msg_type >= sizeof(key_typesw)/sizeof(key_typesw[0]) || key_typesw[msg->sadb_msg_type] == NULL) { - 
pfkeystat.out_invmsgtype++; + V_pfkeystat.out_invmsgtype++; error = EINVAL; goto senderror; } @@ -6928,6 +7009,7 @@ struct mbuf *m; struct sadb_msghdr *mhp; { + INIT_VNET_IPSEC(curvnet); struct mbuf *n; struct sadb_ext *ext; size_t off, end; @@ -6984,7 +7066,7 @@ ipseclog((LOG_DEBUG, "%s: duplicate ext_type " "%u\n", __func__, ext->sadb_ext_type)); m_freem(m); - pfkeystat.out_dupext++; + V_pfkeystat.out_dupext++; return EINVAL; } break; @@ -6992,7 +7074,7 @@ ipseclog((LOG_DEBUG, "%s: invalid ext_type %u\n", __func__, ext->sadb_ext_type)); m_freem(m); - pfkeystat.out_invexttype++; + V_pfkeystat.out_invexttype++; return EINVAL; } @@ -7000,7 +7082,7 @@ if (key_validate_ext(ext, extlen)) { m_freem(m); - pfkeystat.out_invlen++; + V_pfkeystat.out_invlen++; return EINVAL; } @@ -7018,7 +7100,7 @@ if (off != end) { m_freem(m); - pfkeystat.out_invlen++; + V_pfkeystat.out_invlen++; return EINVAL; } @@ -7085,30 +7167,64 @@ } void -key_init() +key_init(void) { + INIT_VNET_IPSEC(curvnet); int i; + V_key_debug_level = 0; + V_key_spi_trycnt = 1000; + V_key_spi_minval = 0x100; + V_key_spi_maxval = 0x0fffffff; /* XXX */ + V_policy_id = 0; + V_key_int_random = 60; /*interval to initialize randseed,1(m)*/ + V_key_larval_lifetime = 30; /* interval to expire acquiring, 30(s)*/ + V_key_blockacq_count = 10; /* counter for blocking SADB_ACQUIRE.*/ + V_key_blockacq_lifetime = 20; /* lifetime for blocking SADB_ACQUIRE.*/ + V_key_preferred_oldsa = 1; /* preferred old sa rather than new sa.*/ + V_acq_seq = 0; + + V_saorder_state_alive[0] = SADB_SASTATE_MATURE; + V_saorder_state_alive[1] = SADB_SASTATE_DYING; + V_saorder_state_alive[2] = SADB_SASTATE_LARVAL; + V_saorder_state_any[0] = SADB_SASTATE_MATURE; + V_saorder_state_any[1] = SADB_SASTATE_DYING; + V_saorder_state_any[2] = SADB_SASTATE_LARVAL; + V_saorder_state_any[3] = SADB_SASTATE_DEAD; + + V_ipsec_esp_keymin = 256; + V_ipsec_esp_auth = 0; + V_ipsec_ah_keymin = 128; +#ifdef VIMAGE + if (IS_DEFAULT_VNET(curvnet)) { +#endif 
SPTREE_LOCK_INIT(); REGTREE_LOCK_INIT(); SAHTREE_LOCK_INIT(); ACQ_LOCK_INIT(); SPACQ_LOCK_INIT(); - +#ifdef VIMAGE + } +#endif for (i = 0; i < IPSEC_DIR_MAX; i++) - LIST_INIT(&sptree[i]); + LIST_INIT(&V_sptree[i]); - LIST_INIT(&sahtree); + LIST_INIT(&V_sahtree); for (i = 0; i <= SADB_SATYPE_MAX; i++) - LIST_INIT(&regtree[i]); + LIST_INIT(&V_regtree[i]); - LIST_INIT(&acqtree); - LIST_INIT(&spacqtree); + LIST_INIT(&V_acqtree); + LIST_INIT(&V_spacqtree); /* system default */ - ip4_def_policy.policy = IPSEC_POLICY_NONE; - ip4_def_policy.refcnt++; /*never reclaim this*/ + V_ip4_def_policy.policy = IPSEC_POLICY_NONE; + V_ip4_def_policy.refcnt++; /*never reclaim this*/ + +#ifdef VIMAGE + if (!IS_DEFAULT_VNET(curvnet)) + return; +#endif #ifndef IPSEC_DEBUG2 timeout((void *)key_timehandler, (void *)0, hz); @@ -7118,9 +7234,76 @@ keystat.getspi_count = 1; printf("Fast IPsec: Initialized Security Association Processing.\n"); +} - return; +#ifdef VIMAGE +/* Unlink and free entries on this vnet's sptree, sahtree, regtree, acqtree and spacqtree lists, taking each list's lock in turn. */ +void key_destroy(void) +{ + INIT_VNET_IPSEC(curvnet); + struct secpolicy *sp, *nextsp; + struct secacq *acq, *nextacq; + struct secspacq *spacq, *nextspacq; + struct secashead *sah, *nextsah; + struct secreg *reg; + int i; + + SPTREE_LOCK(); + for (i = 0; i < IPSEC_DIR_MAX; i++) { + for (sp = LIST_FIRST(&V_sptree[i]); + sp != NULL; sp = nextsp) { + nextsp = LIST_NEXT(sp, chain); + if (__LIST_CHAINED(sp)) { + LIST_REMOVE(sp, chain); + free(sp, M_IPSEC_SP); + } + } + } + SPTREE_UNLOCK(); + + SAHTREE_LOCK(); + for (sah = LIST_FIRST(&V_sahtree); sah != NULL; sah = nextsah) { + nextsah = LIST_NEXT(sah, chain); + if (__LIST_CHAINED(sah)) { + LIST_REMOVE(sah, chain); + free(sah, M_IPSEC_SAH); + } + } + SAHTREE_UNLOCK(); + + REGTREE_LOCK(); + for (i = 0; i <= SADB_SATYPE_MAX; i++) { + LIST_FOREACH(reg, &V_regtree[i], chain) { + if (__LIST_CHAINED(reg)) { + LIST_REMOVE(reg, chain); + free(reg, M_IPSEC_SAR); + break; + } + } + } + REGTREE_UNLOCK(); + + ACQ_LOCK(); + for (acq = LIST_FIRST(&V_acqtree); acq != NULL; acq = nextacq) { + nextacq = LIST_NEXT(acq, chain); + if 
(__LIST_CHAINED(acq)) { + LIST_REMOVE(acq, chain); + free(acq, M_IPSEC_SAQ); + } + } + ACQ_UNLOCK(); + + SPACQ_LOCK(); + for (spacq = LIST_FIRST(&V_spacqtree); spacq != NULL; spacq = nextspacq) { + nextspacq = LIST_NEXT(spacq, chain); + if (__LIST_CHAINED(spacq)) { + LIST_REMOVE(spacq, chain); + free(spacq, M_IPSEC_SAQ); + } + } + SPACQ_UNLOCK(); } +#endif /* * XXX: maybe This function is called after INBOUND IPsec processing. @@ -7192,11 +7375,12 @@ key_sa_routechange(dst) struct sockaddr *dst; { + INIT_VNET_IPSEC(curvnet); struct secashead *sah; struct route *ro; SAHTREE_LOCK(); - LIST_FOREACH(sah, &sahtree, chain) { + LIST_FOREACH(sah, &V_sahtree, chain) { ro = &sah->sa_route; if (ro->ro_rt && dst->sa_len == ro->ro_dst.sa_len && bcmp(dst, &ro->ro_dst, dst->sa_len) == 0) { --- /u/marko/p4/head/src/sys/netipsec/key.h 2007-08-31 03:48:10.000000000 +0200 +++ src/sys/netipsec/key.h 2007-10-05 12:27:26.000000000 +0200 @@ -96,6 +96,9 @@ extern void key_freereg __P((struct socket *)); extern int key_parse __P((struct mbuf *, struct socket *)); extern void key_init __P((void)); +#ifdef VIMAGE +extern void key_destroy(void); +#endif extern void key_sa_recordxfer __P((struct secasvar *, struct mbuf *)); extern void key_sa_routechange __P((struct sockaddr *)); extern void key_sa_stir_iv __P((struct secasvar *)); --- /u/marko/p4/head/src/sys/netipsec/key_debug.h 2007-08-31 03:48:11.000000000 +0200 +++ src/sys/netipsec/key_debug.h 2007-10-05 12:27:26.000000000 +0200 @@ -54,7 +54,7 @@ #define KEYDEBUG_IPSEC_DUMP (KEYDEBUG_IPSEC | KEYDEBUG_DUMP) #define KEYDEBUG(lev,arg) \ - do { if ((key_debug_level & (lev)) == (lev)) { arg; } } while (0) + do { if ((V_key_debug_level & (lev)) == (lev)) { arg; } } while (0) extern u_int32_t key_debug_level; #endif /*_KERNEL*/ --- /u/marko/p4/head/src/sys/netipsec/keysock.c 2007-11-19 16:42:18.000000000 +0100 +++ src/sys/netipsec/keysock.c 2007-12-10 11:26:14.000000000 +0100 @@ -31,6 +31,7 @@ */ #include "opt_ipsec.h" +#include "opt_vimage.h" /* This code has 
derived from sys/net/rtsock.c on FreeBSD2.2.5 */ @@ -50,17 +51,27 @@ #include #include #include +#include #include #include +#include + +#include +#include #include #include #include #include - +#include +#include +#ifdef VIMAGE +#include +#endif #include +#ifndef VIMAGE struct key_cb { int key_count; int any_count; @@ -69,10 +80,13 @@ static struct sockaddr key_dst = { 2, PF_KEY, }; static struct sockaddr key_src = { 2, PF_KEY, }; +#endif static int key_sendup0 __P((struct rawcb *, struct mbuf *, int)); +#ifndef VIMAGE struct pfkeystat pfkeystat; +#endif /* * key_output() @@ -80,25 +94,26 @@ int key_output(struct mbuf *m, struct socket *so) { + INIT_VNET_IPSEC(curvnet); struct sadb_msg *msg; int len, error = 0; if (m == 0) panic("%s: NULL pointer was passed.\n", __func__); - pfkeystat.out_total++; - pfkeystat.out_bytes += m->m_pkthdr.len; + V_pfkeystat.out_total++; + V_pfkeystat.out_bytes += m->m_pkthdr.len; len = m->m_pkthdr.len; if (len < sizeof(struct sadb_msg)) { - pfkeystat.out_tooshort++; + V_pfkeystat.out_tooshort++; error = EINVAL; goto end; } if (m->m_len < sizeof(struct sadb_msg)) { if ((m = m_pullup(m, sizeof(struct sadb_msg))) == 0) { - pfkeystat.out_nomem++; + V_pfkeystat.out_nomem++; error = ENOBUFS; goto end; } @@ -109,9 +124,9 @@ KEYDEBUG(KEYDEBUG_KEY_DUMP, kdebug_mbuf(m)); msg = mtod(m, struct sadb_msg *); - pfkeystat.out_msgtype[msg->sadb_msg_type]++; + V_pfkeystat.out_msgtype[msg->sadb_msg_type]++; if (len != PFKEY_UNUNIT64(msg->sadb_msg_len)) { - pfkeystat.out_invlen++; + V_pfkeystat.out_invlen++; error = EINVAL; goto end; } @@ -133,6 +148,7 @@ struct mbuf *m; int promisc; { + INIT_VNET_IPSEC(curvnet); int error; if (promisc) { @@ -142,7 +158,7 @@ if (m && m->m_len < sizeof(struct sadb_msg)) m = m_pullup(m, sizeof(struct sadb_msg)); if (!m) { - pfkeystat.in_nomem++; + V_pfkeystat.in_nomem++; m_freem(m); return ENOBUFS; } @@ -155,12 +171,12 @@ pmsg->sadb_msg_len = PFKEY_UNIT64(m->m_pkthdr.len); /* pid and seq? 
*/ - pfkeystat.in_msgtype[pmsg->sadb_msg_type]++; + V_pfkeystat.in_msgtype[pmsg->sadb_msg_type]++; } - if (!sbappendaddr(&rp->rcb_socket->so_rcv, (struct sockaddr *)&key_src, + if (!sbappendaddr(&rp->rcb_socket->so_rcv, (struct sockaddr *)&V_key_src, m, NULL)) { - pfkeystat.in_nomem++; + V_pfkeystat.in_nomem++; m_freem(m); error = ENOBUFS; } else @@ -177,6 +193,7 @@ u_int len; int target; /*target of the resulting message*/ { + INIT_VNET_IPSEC(curvnet); struct mbuf *m, *n, *mprev; int tlen; @@ -192,9 +209,9 @@ * we increment statistics here, just in case we have ENOBUFS * in this function. */ - pfkeystat.in_total++; - pfkeystat.in_bytes += len; - pfkeystat.in_msgtype[msg->sadb_msg_type]++; + V_pfkeystat.in_total++; + V_pfkeystat.in_bytes += len; + V_pfkeystat.in_msgtype[msg->sadb_msg_type]++; /* * Get mbuf chain whenever possible (not clusters), @@ -211,14 +228,14 @@ if (tlen == len) { MGETHDR(n, M_DONTWAIT, MT_DATA); if (n == NULL) { - pfkeystat.in_nomem++; + V_pfkeystat.in_nomem++; return ENOBUFS; } n->m_len = MHLEN; } else { MGET(n, M_DONTWAIT, MT_DATA); if (n == NULL) { - pfkeystat.in_nomem++; + V_pfkeystat.in_nomem++; return ENOBUFS; } n->m_len = MLEN; @@ -228,7 +245,7 @@ if ((n->m_flags & M_EXT) == 0) { m_free(n); m_freem(m); - pfkeystat.in_nomem++; + V_pfkeystat.in_nomem++; return ENOBUFS; } n->m_len = MCLBYTES; @@ -251,9 +268,9 @@ m_copyback(m, 0, len, (caddr_t)msg); /* avoid duplicated statistics */ - pfkeystat.in_total--; - pfkeystat.in_bytes -= len; - pfkeystat.in_msgtype[msg->sadb_msg_type]--; + V_pfkeystat.in_total--; + V_pfkeystat.in_bytes -= len; + V_pfkeystat.in_msgtype[msg->sadb_msg_type]--; return key_sendup_mbuf(so, m, target); } @@ -265,6 +282,8 @@ struct mbuf *m; int target; { + INIT_VNET_NET(curvnet); + INIT_VNET_IPSEC(curvnet); struct mbuf *n; struct keycb *kp; int sendup; @@ -276,22 +295,22 @@ if (so == NULL && target == KEY_SENDUP_ONE) panic("%s: NULL pointer was passed.\n", __func__); - pfkeystat.in_total++; - pfkeystat.in_bytes += 
m->m_pkthdr.len; + V_pfkeystat.in_total++; + V_pfkeystat.in_bytes += m->m_pkthdr.len; if (m->m_len < sizeof(struct sadb_msg)) { m = m_pullup(m, sizeof(struct sadb_msg)); if (m == NULL) { - pfkeystat.in_nomem++; + V_pfkeystat.in_nomem++; return ENOBUFS; } } if (m->m_len >= sizeof(struct sadb_msg)) { struct sadb_msg *msg; msg = mtod(m, struct sadb_msg *); - pfkeystat.in_msgtype[msg->sadb_msg_type]++; + V_pfkeystat.in_msgtype[msg->sadb_msg_type]++; } mtx_lock(&rawcb_mtx); - LIST_FOREACH(rp, &rawcb_list, list) + LIST_FOREACH(rp, &V_rawcb_list, list) { if (rp->rcb_proto.sp_family != PF_KEY) continue; @@ -333,14 +352,14 @@ sendup++; break; } - pfkeystat.in_msgtarget[target]++; + V_pfkeystat.in_msgtarget[target]++; if (!sendup) continue; if ((n = m_copy(m, 0, (int)M_COPYALL)) == NULL) { m_freem(m); - pfkeystat.in_nomem++; + V_pfkeystat.in_nomem++; mtx_unlock(&rawcb_mtx); return ENOBUFS; } @@ -382,6 +401,7 @@ static int key_attach(struct socket *so, int proto, struct thread *td) { + INIT_VNET_IPSEC(curvnet); struct keycb *kp; int error; @@ -410,10 +430,10 @@ kp->kp_promisc = kp->kp_registered = 0; if (kp->kp_raw.rcb_proto.sp_protocol == PF_KEY) /* XXX: AF_KEY */ - key_cb.key_count++; - key_cb.any_count++; - kp->kp_raw.rcb_laddr = &key_src; - kp->kp_raw.rcb_faddr = &key_dst; + V_key_cb.key_count++; + V_key_cb.any_count++; + kp->kp_raw.rcb_laddr = &V_key_src; + kp->kp_raw.rcb_faddr = &V_key_dst; soisconnected(so); so->so_options |= SO_USELOOPBACK; @@ -458,13 +478,14 @@ static void key_detach(struct socket *so) { + INIT_VNET_IPSEC(curvnet); struct keycb *kp = (struct keycb *)sotorawcb(so); KASSERT(kp != NULL, ("key_detach: kp == NULL")); if (kp->kp_raw.rcb_proto.sp_protocol == PF_KEY) /* XXX: AF_KEY */ - key_cb.key_count--; - key_cb.any_count--; + V_key_cb.key_count--; + V_key_cb.any_count--; key_freereg(so); raw_usrreqs.pru_detach(so); @@ -560,7 +581,14 @@ static void key_init0(void) { - bzero((caddr_t)&key_cb, sizeof(key_cb)); + INIT_VNET_IPSEC(curvnet); + + 
V_key_dst.sa_len = 2; + V_key_dst.sa_family = PF_KEY; + V_key_src.sa_len = 2; + V_key_src.sa_family = PF_KEY; + + bzero((caddr_t)&V_key_cb, sizeof(V_key_cb)); key_init(); } @@ -568,6 +596,9 @@ .dom_family = PF_KEY, .dom_name = "key", .dom_init = key_init0, +#ifdef VIMAGE + .dom_destroy = key_destroy, +#endif .dom_protosw = keysw, .dom_protoswNPROTOSW = &keysw[sizeof(keysw)/sizeof(keysw[0])] }; --- /u/marko/p4/head/src/sys/netipsec/keysock.h 2007-08-31 03:48:11.000000000 +0200 +++ src/sys/netipsec/keysock.h 2007-10-05 12:27:26.000000000 +0200 @@ -57,7 +57,12 @@ /* others */ u_quad_t sockerr; /* # of socket related errors */ }; - +#ifdef VIMAGE +struct key_cb { + int key_count; + int any_count; +}; +#endif #define KEY_SENDUP_ONE 0 #define KEY_SENDUP_ALL 1 #define KEY_SENDUP_REGISTERED 2 --- /dev/null 2008-02-27 21:11:00.000000000 +0100 +++ src/sys/netipsec/vipsec.h 2007-10-05 12:27:26.000000000 +0200 @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2007 University of Zagreb + * Copyright (c) 2007 FreeBSD Foundation + * + * This software was developed by the University of Zagreb and the + * FreeBSD Foundation under sponsorship by the Stichting NLnet and the + * FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * XXX RCS tag goes here + */ + +#ifndef _NETIPSEC_VIPSEC_H_ +#define _NETIPSEC_VIPSEC_H_ + + +#ifdef VIMAGE +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +struct vnet_ipsec { + int _ipsec_debug; + struct ipsecstat _ipsec4stat; + struct secpolicy _ip4_def_policy; + + int _ip4_esp_trans_deflev; + int _ip4_esp_net_deflev; + int _ip4_ah_trans_deflev; + int _ip4_ah_net_deflev; + int _ip4_ah_offsetmask; + int _ip4_ipsec_dfbit; + int _ip4_ipsec_ecn; + int _ip4_esp_randpad; + + int _ipsec_replay; + int _ipsec_integrity; + int _crypto_support; + + u_int32_t _key_debug_level; + u_int _key_spi_trycnt; + u_int32_t _key_spi_minval; + u_int32_t _key_spi_maxval; + u_int32_t _policy_id; + u_int _key_int_random; + u_int _key_larval_lifetime; + int _key_blockacq_count; + int _key_blockacq_lifetime; + int _key_preferred_oldsa; + u_int32_t _acq_seq; + + u_int _saorder_state_alive[3]; + u_int _saorder_state_any[4]; + int _esp_enable; + struct espstat _espstat; + int _esp_max_ivlen; + int _ipsec_esp_keymin; + int _ipsec_esp_auth; + int _ipsec_ah_keymin; + int _ipip_allow; + struct ipipstat _ipipstat; + + struct ipsecstat _ipsec6stat; + int _ip6_esp_trans_deflev; + int _ip6_esp_net_deflev; + int _ip6_ah_trans_deflev; + int _ip6_ah_net_deflev; + int _ip6_ipsec_ecn; + int _ip6_esp_randpad; + + int _ah_enable; + int _ah_cleartos; + struct ahstat _ahstat; + + int 
_ipcomp_enable; + struct ipcompstat _ipcompstat; + + struct pfkeystat _pfkeystat; + struct key_cb _key_cb; + struct sockaddr _key_dst; + struct sockaddr _key_src; + + LIST_HEAD(, secpolicy) _sptree[IPSEC_DIR_MAX]; + LIST_HEAD(, secashead) _sahtree; + LIST_HEAD(, secreg) _regtree[SADB_SATYPE_MAX + 1]; + LIST_HEAD(, secacq) _acqtree; + LIST_HEAD(, secspacq) _spacqtree; +}; +#endif + +/* + * Symbol translation macros + */ +#define INIT_VNET_IPSEC(vnet) \ + INIT_FROM_VNET(vnet, VNET_MOD_IPSEC, struct vnet_ipsec, vnet_ipsec) + +#define VNET_IPSEC(sym) VSYM(vnet_ipsec, sym) + +#define V_ipsec_debug VNET_IPSEC(ipsec_debug) +#define V_ipsec4stat VNET_IPSEC(ipsec4stat) +#define V_ip4_def_policy VNET_IPSEC(ip4_def_policy) +#define V_ip4_ah_offsetmask VNET_IPSEC(ip4_ah_offsetmask) +#define V_ip4_ipsec_dfbit VNET_IPSEC(ip4_ipsec_dfbit) +#define V_ip4_esp_trans_deflev VNET_IPSEC(ip4_esp_trans_deflev) +#define V_ip4_esp_net_deflev VNET_IPSEC(ip4_esp_net_deflev) +#define V_ip4_ah_trans_deflev VNET_IPSEC(ip4_ah_trans_deflev) +#define V_ip4_ah_net_deflev VNET_IPSEC(ip4_ah_net_deflev) +#define V_ip4_ipsec_ecn VNET_IPSEC(ip4_ipsec_ecn) +#define V_ip4_esp_randpad VNET_IPSEC(ip4_esp_randpad) +#define V_ipsec_replay VNET_IPSEC(ipsec_replay) +#define V_ipsec_integrity VNET_IPSEC(ipsec_integrity) +#define V_crypto_support VNET_IPSEC(crypto_support) +#define V_key_debug_level VNET_IPSEC(key_debug_level) +#define V_key_spi_trycnt VNET_IPSEC(key_spi_trycnt) +#define V_key_spi_minval VNET_IPSEC(key_spi_minval) +#define V_key_spi_maxval VNET_IPSEC(key_spi_maxval) +#define V_policy_id VNET_IPSEC(policy_id) +#define V_key_int_random VNET_IPSEC(key_int_random) +#define V_key_larval_lifetime VNET_IPSEC(key_larval_lifetime) +#define V_key_blockacq_count VNET_IPSEC(key_blockacq_count) +#define V_key_blockacq_lifetime VNET_IPSEC(key_blockacq_lifetime) +#define V_key_preferred_oldsa VNET_IPSEC(key_preferred_oldsa) +#define V_acq_seq VNET_IPSEC(acq_seq) +#define V_saorder_state_alive 
VNET_IPSEC(saorder_state_alive) +#define V_saorder_state_any VNET_IPSEC(saorder_state_any) +#define V_esp_enable VNET_IPSEC(esp_enable) +#define V_espstat VNET_IPSEC(espstat) +#define V_esp_max_ivlen VNET_IPSEC(esp_max_ivlen) +#define V_ipsec_esp_keymin VNET_IPSEC(ipsec_esp_keymin) +#define V_ipsec_esp_auth VNET_IPSEC(ipsec_esp_auth) +#define V_ipsec_ah_keymin VNET_IPSEC(ipsec_ah_keymin) +#define V_ipip_allow VNET_IPSEC(ipip_allow) +#define V_ipipstat VNET_IPSEC(ipipstat) +#define V_ipsec6stat VNET_IPSEC(ipsec6stat) +#define V_ip6_esp_trans_deflev VNET_IPSEC(ip6_esp_trans_deflev) +#define V_ip6_esp_net_deflev VNET_IPSEC(ip6_esp_net_deflev) +#define V_ip6_ah_trans_deflev VNET_IPSEC(ip6_ah_trans_deflev) +#define V_ip6_ah_net_deflev VNET_IPSEC(ip6_ah_net_deflev) +#define V_ip6_ipsec_ecn VNET_IPSEC(ip6_ipsec_ecn) +#define V_ip6_esp_randpad VNET_IPSEC(ip6_esp_randpad) +#define V_ah_enable VNET_IPSEC(ah_enable) +#define V_ah_cleartos VNET_IPSEC(ah_cleartos) +#define V_ahstat VNET_IPSEC(ahstat) +#define V_ipcomp_enable VNET_IPSEC(ipcomp_enable) +#define V_ipcompstat VNET_IPSEC(ipcompstat) +#define V_pfkeystat VNET_IPSEC(pfkeystat) +#define V_key_cb VNET_IPSEC(key_cb) +#define V_key_dst VNET_IPSEC(key_dst) +#define V_key_src VNET_IPSEC(key_src) +#define V_sptree VNET_IPSEC(sptree) +#define V_sahtree VNET_IPSEC(sahtree) +#define V_regtree VNET_IPSEC(regtree) +#define V_acqtree VNET_IPSEC(acqtree) +#define V_spacqtree VNET_IPSEC(spacqtree) +#endif /* !_NETIPSEC_VIPSEC_H_ */ --- /u/marko/p4/head/src/sys/netipsec/xform_ah.c 2007-08-31 03:48:11.000000000 +0200 +++ src/sys/netipsec/xform_ah.c 2007-10-22 18:07:02.000000000 +0200 @@ -38,6 +38,7 @@ */ #include "opt_inet.h" #include "opt_inet6.h" +#include "opt_vimage.h" #include #include @@ -46,6 +47,7 @@ #include #include #include +#include #include @@ -60,6 +62,7 @@ #include #include #include +#include #ifdef INET6 #include @@ -87,17 +90,23 @@ #define AUTHSIZE(sav) \ ((sav->flags & SADB_X_EXT_OLD) ? 
16 : AH_HMAC_HASHLEN) +#ifndef VIMAGE int ah_enable = 1; /* control flow of packets with AH */ int ah_cleartos = 1; /* clear ip_tos when doing AH calc */ struct ahstat ahstat; +#endif SYSCTL_DECL(_net_inet_ah); -SYSCTL_INT(_net_inet_ah, OID_AUTO, - ah_enable, CTLFLAG_RW, &ah_enable, 0, ""); -SYSCTL_INT(_net_inet_ah, OID_AUTO, - ah_cleartos, CTLFLAG_RW, &ah_cleartos, 0, ""); -SYSCTL_STRUCT(_net_inet_ah, IPSECCTL_STATS, - stats, CTLFLAG_RD, &ahstat, ahstat, ""); +SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ah, OID_AUTO, + ah_enable, CTLFLAG_RW, ah_enable, 0, ""); +SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ah, OID_AUTO, + ah_cleartos, CTLFLAG_RW, ah_cleartos, 0, ""); +SYSCTL_V_STRUCT(V_NET, vnet_ipsec, _net_inet_ah, IPSECCTL_STATS, + stats, CTLFLAG_RD, ahstat, ahstat, ""); + +static int ah_iattach(const void *); + +VNET_MOD_DECLARE_STATELESS(AH, ah, ah_iattach, NULL, IPSEC) static unsigned char ipseczeroes[256]; /* larger than an ip6 extension hdr */ @@ -159,6 +168,7 @@ int ah_init0(struct secasvar *sav, struct xformsw *xsp, struct cryptoini *cria) { + INIT_VNET_IPSEC(curvnet); struct auth_hash *thash; int keylen; @@ -213,12 +223,13 @@ static int ah_init(struct secasvar *sav, struct xformsw *xsp) { + INIT_VNET_IPSEC(curvnet); struct cryptoini cria; int error; error = ah_init0(sav, xsp, &cria); return error ? 
error : - crypto_newsession(&sav->tdb_cryptoid, &cria, crypto_support); + crypto_newsession(&sav->tdb_cryptoid, &cria, V_crypto_support); } /* @@ -247,6 +258,7 @@ static int ah_massage_headers(struct mbuf **m0, int proto, int skip, int alg, int out) { + INIT_VNET_IPSEC(curvnet); struct mbuf *m = *m0; unsigned char *ptr; int off, count; @@ -277,7 +289,7 @@ /* Fix the IP header */ ip = mtod(m, struct ip *); - if (ah_cleartos) + if (V_ah_cleartos) ip->ip_tos = 0; ip->ip_ttl = 0; ip->ip_sum = 0; @@ -551,6 +563,7 @@ static int ah_input(struct mbuf *m, struct secasvar *sav, int skip, int protoff) { + INIT_VNET_IPSEC(curvnet); struct auth_hash *ahx; struct tdb_ident *tdbi; struct tdb_crypto *tc; @@ -575,14 +588,14 @@ IP6_EXTHDR_GET(ah, struct newah *, m, skip, rplen); if (ah == NULL) { DPRINTF(("ah_input: cannot pullup header\n")); - ahstat.ahs_hdrops++; /*XXX*/ + V_ahstat.ahs_hdrops++; /*XXX*/ m_freem(m); return ENOBUFS; } /* Check replay window, if applicable. */ if (sav->replay && !ipsec_chkreplay(ntohl(ah->ah_seq), sav)) { - ahstat.ahs_replay++; + V_ahstat.ahs_replay++; DPRINTF(("%s: packet replay failure: %s\n", __func__, ipsec_logsastr(sav))); m_freem(m); @@ -599,17 +612,17 @@ hl, (u_long) (authsize + rplen - sizeof (struct ah)), ipsec_address(&sav->sah->saidx.dst), (u_long) ntohl(sav->spi))); - ahstat.ahs_badauthl++; + V_ahstat.ahs_badauthl++; m_freem(m); return EACCES; } - ahstat.ahs_ibytes += m->m_pkthdr.len - skip - hl; + V_ahstat.ahs_ibytes += m->m_pkthdr.len - skip - hl; /* Get crypto descriptors. 
*/ crp = crypto_getreq(1); if (crp == NULL) { DPRINTF(("%s: failed to acquire crypto descriptor\n",__func__)); - ahstat.ahs_crypto++; + V_ahstat.ahs_crypto++; m_freem(m); return ENOBUFS; } @@ -649,7 +662,7 @@ } if (tc == NULL) { DPRINTF(("%s: failed to allocate tdb_crypto\n", __func__)); - ahstat.ahs_crypto++; + V_ahstat.ahs_crypto++; crypto_freereq(crp); m_freem(m); return ENOBUFS; @@ -673,7 +686,7 @@ skip, ahx->type, 0); if (error != 0) { /* NB: mbuf is free'd by ah_massage_headers */ - ahstat.ahs_hdrops++; + V_ahstat.ahs_hdrops++; free(tc, M_XDATA); crypto_freereq(crp); return error; @@ -722,6 +735,7 @@ static int ah_input_cb(struct cryptop *crp) { + INIT_VNET_IPSEC(curvnet); int rplen, error, skip, protoff; unsigned char calc[AH_ALEN_MAX]; struct mbuf *m; @@ -747,7 +761,7 @@ sav = KEY_ALLOCSA(&tc->tc_dst, tc->tc_proto, tc->tc_spi); if (sav == NULL) { - ahstat.ahs_notdb++; + V_ahstat.ahs_notdb++; DPRINTF(("%s: SA expired while in crypto\n", __func__)); error = ENOBUFS; /*XXX*/ goto bad; @@ -770,19 +784,19 @@ return error; } - ahstat.ahs_noxform++; + V_ahstat.ahs_noxform++; DPRINTF(("%s: crypto error %d\n", __func__, crp->crp_etype)); error = crp->crp_etype; goto bad; } else { - ahstat.ahs_hist[sav->alg_auth]++; + V_ahstat.ahs_hist[sav->alg_auth]++; crypto_freereq(crp); /* No longer needed. */ crp = NULL; } /* Shouldn't happen... 
*/ if (m == NULL) { - ahstat.ahs_crypto++; + V_ahstat.ahs_crypto++; DPRINTF(("%s: bogus returned buffer from crypto\n", __func__)); error = EINVAL; goto bad; @@ -808,7 +822,7 @@ "in SA %s/%08lx\n", __func__, ipsec_address(&saidx->dst), (u_long) ntohl(sav->spi))); - ahstat.ahs_badauth++; + V_ahstat.ahs_badauth++; error = EACCES; goto bad; } @@ -839,7 +853,7 @@ m_copydata(m, skip + offsetof(struct newah, ah_seq), sizeof (seq), (caddr_t) &seq); if (ipsec_updatereplay(ntohl(seq), sav)) { - ahstat.ahs_replay++; + V_ahstat.ahs_replay++; error = ENOBUFS; /*XXX as above*/ goto bad; } @@ -853,7 +867,7 @@ DPRINTF(("%s: mangled mbuf chain for SA %s/%08lx\n", __func__, ipsec_address(&saidx->dst), (u_long) ntohl(sav->spi))); - ahstat.ahs_hdrops++; + V_ahstat.ahs_hdrops++; goto bad; } @@ -884,6 +898,7 @@ int skip, int protoff) { + INIT_VNET_IPSEC(curvnet); struct secasvar *sav; struct auth_hash *ahx; struct cryptodesc *crda; @@ -902,7 +917,7 @@ ahx = sav->tdb_authalgxform; IPSEC_ASSERT(ahx != NULL, ("null authentication xform")); - ahstat.ahs_output++; + V_ahstat.ahs_output++; /* Figure out header size. */ rplen = HDRSIZE(sav); @@ -925,7 +940,7 @@ sav->sah->saidx.dst.sa.sa_family, ipsec_address(&sav->sah->saidx.dst), (u_long) ntohl(sav->spi))); - ahstat.ahs_nopf++; + V_ahstat.ahs_nopf++; error = EPFNOSUPPORT; goto bad; } @@ -936,20 +951,20 @@ ipsec_address(&sav->sah->saidx.dst), (u_long) ntohl(sav->spi), rplen + authsize + m->m_pkthdr.len, maxpacketsize)); - ahstat.ahs_toobig++; + V_ahstat.ahs_toobig++; error = EMSGSIZE; goto bad; } /* Update the counters. 
*/ - ahstat.ahs_obytes += m->m_pkthdr.len - skip; + V_ahstat.ahs_obytes += m->m_pkthdr.len - skip; m = m_unshare(m, M_NOWAIT); if (m == NULL) { DPRINTF(("%s: cannot clone mbuf chain, SA %s/%08lx\n", __func__, ipsec_address(&sav->sah->saidx.dst), (u_long) ntohl(sav->spi))); - ahstat.ahs_hdrops++; + V_ahstat.ahs_hdrops++; error = ENOBUFS; goto bad; } @@ -962,7 +977,7 @@ rplen + authsize, ipsec_address(&sav->sah->saidx.dst), (u_long) ntohl(sav->spi))); - ahstat.ahs_hdrops++; /*XXX differs from openbsd */ + V_ahstat.ahs_hdrops++; /*XXX differs from openbsd */ error = ENOBUFS; goto bad; } @@ -990,7 +1005,7 @@ __func__, ipsec_address(&sav->sah->saidx.dst), (u_long) ntohl(sav->spi))); - ahstat.ahs_wrap++; + V_ahstat.ahs_wrap++; error = EINVAL; goto bad; } @@ -1007,7 +1022,7 @@ if (crp == NULL) { DPRINTF(("%s: failed to acquire crypto descriptors\n", __func__)); - ahstat.ahs_crypto++; + V_ahstat.ahs_crypto++; error = ENOBUFS; goto bad; } @@ -1029,7 +1044,7 @@ if (tc == NULL) { crypto_freereq(crp); DPRINTF(("%s: failed to allocate tdb_crypto\n", __func__)); - ahstat.ahs_crypto++; + V_ahstat.ahs_crypto++; error = ENOBUFS; goto bad; } @@ -1112,6 +1127,7 @@ static int ah_output_cb(struct cryptop *crp) { + INIT_VNET_IPSEC(curvnet); int skip, protoff, error; struct tdb_crypto *tc; struct ipsecrequest *isr; @@ -1131,7 +1147,7 @@ IPSECREQUEST_LOCK(isr); sav = KEY_ALLOCSA(&tc->tc_dst, tc->tc_proto, tc->tc_spi); if (sav == NULL) { - ahstat.ahs_notdb++; + V_ahstat.ahs_notdb++; DPRINTF(("%s: SA expired while in crypto\n", __func__)); error = ENOBUFS; /*XXX*/ goto bad; @@ -1150,7 +1166,7 @@ return error; } - ahstat.ahs_noxform++; + V_ahstat.ahs_noxform++; DPRINTF(("%s: crypto error %d\n", __func__, crp->crp_etype)); error = crp->crp_etype; goto bad; @@ -1158,12 +1174,12 @@ /* Shouldn't happen... 
*/ if (m == NULL) { - ahstat.ahs_crypto++; + V_ahstat.ahs_crypto++; DPRINTF(("%s: bogus returned buffer from crypto\n", __func__)); error = EINVAL; goto bad; } - ahstat.ahs_hist[sav->alg_auth]++; + V_ahstat.ahs_hist[sav->alg_auth]++; /* * Copy original headers (with the new protocol number) back @@ -1210,9 +1226,26 @@ ah_init, ah_zeroize, ah_input, ah_output, }; +static int +ah_iattach(unused) + const void *unused; +{ + INIT_VNET_IPSEC(curvnet); + + V_ah_enable = 1; /* control flow of packets with AH */ + V_ah_cleartos = 1; /* clear ip_tos when doing AH calc */ + + return 0; +} + static void ah_attach(void) { +#ifdef VIMAGE + vnet_mod_register(&vnet_ah_modinfo); +#else + ah_iattach(NULL); +#endif xform_register(&ah_xformsw); } SYSINIT(ah_xform_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ah_attach, NULL); --- /u/marko/p4/head/src/sys/netipsec/xform_esp.c 2007-08-31 03:48:11.000000000 +0200 +++ src/sys/netipsec/xform_esp.c 2007-10-22 18:07:02.000000000 +0200 @@ -37,6 +37,7 @@ */ #include "opt_inet.h" #include "opt_inet6.h" +#include "opt_vimage.h" #include #include @@ -46,6 +47,7 @@ #include #include #include +#include #include @@ -62,6 +64,7 @@ #include #include #include +#include #ifdef INET6 #include @@ -75,20 +78,26 @@ #include #include +#ifndef VIMAGE int esp_enable = 1; struct espstat espstat; +#endif SYSCTL_DECL(_net_inet_esp); -SYSCTL_INT(_net_inet_esp, OID_AUTO, - esp_enable, CTLFLAG_RW, &esp_enable, 0, ""); -SYSCTL_STRUCT(_net_inet_esp, IPSECCTL_STATS, - stats, CTLFLAG_RD, &espstat, espstat, ""); - +SYSCTL_V_INT(V_NET, vnet_ipsec,_net_inet_esp, OID_AUTO, + esp_enable, CTLFLAG_RW, esp_enable, 0, ""); +SYSCTL_V_STRUCT(V_NET, vnet_ipsec, _net_inet_esp, IPSECCTL_STATS, + stats, CTLFLAG_RD, espstat, espstat, ""); +#ifndef VIMAGE static int esp_max_ivlen; /* max iv length over all algorithms */ - +#endif static int esp_input_cb(struct cryptop *op); static int esp_output_cb(struct cryptop *crp); +static int esp_iattach(const void *); + 
+VNET_MOD_DECLARE_STATELESS(ESP, esp, esp_iattach, NULL, IPSEC) + /* * NB: this is public for use by the PF_KEY support. * NB: if you add support here; be sure to add code to esp_attach below! @@ -122,6 +131,7 @@ size_t esp_hdrsiz(struct secasvar *sav) { + INIT_VNET_IPSEC(curvnet); size_t size; if (sav != NULL) { @@ -145,7 +155,7 @@ * + sizeof (next header field) * + max icv supported. */ - size = sizeof (struct newesp) + esp_max_ivlen + 9 + 16; + size = sizeof (struct newesp) + V_esp_max_ivlen + 9 + 16; } return size; } @@ -156,6 +166,7 @@ static int esp_init(struct secasvar *sav, struct xformsw *xsp) { + INIT_VNET_IPSEC(curvnet); struct enc_xform *txform; struct cryptoini cria, crie; int keylen; @@ -224,13 +235,13 @@ /* init both auth & enc */ crie.cri_next = &cria; error = crypto_newsession(&sav->tdb_cryptoid, - &crie, crypto_support); + &crie, V_crypto_support); } else if (sav->tdb_encalgxform) { error = crypto_newsession(&sav->tdb_cryptoid, - &crie, crypto_support); + &crie, V_crypto_support); } else if (sav->tdb_authalgxform) { error = crypto_newsession(&sav->tdb_cryptoid, - &cria, crypto_support); + &cria, V_crypto_support); } else { /* XXX cannot happen? 
*/ DPRINTF(("%s: no encoding OR authentication xform!\n", @@ -266,6 +277,7 @@ static int esp_input(struct mbuf *m, struct secasvar *sav, int skip, int protoff) { + INIT_VNET_IPSEC(curvnet); struct auth_hash *esph; struct enc_xform *espx; struct tdb_ident *tdbi; @@ -314,7 +326,7 @@ plen, espx->blocksize, ipsec_address(&sav->sah->saidx.dst), (u_long) ntohl(sav->spi))); - espstat.esps_badilen++; + V_espstat.esps_badilen++; m_freem(m); return EINVAL; } @@ -325,13 +337,13 @@ if (esph && sav->replay && !ipsec_chkreplay(ntohl(esp->esp_seq), sav)) { DPRINTF(("%s: packet replay check for %s\n", __func__, ipsec_logsastr(sav))); /*XXX*/ - espstat.esps_replay++; + V_espstat.esps_replay++; m_freem(m); return ENOBUFS; /*XXX*/ } /* Update the counters */ - espstat.esps_ibytes += m->m_pkthdr.len - (skip + hlen + alen); + V_espstat.esps_ibytes += m->m_pkthdr.len - (skip + hlen + alen); /* Find out if we've already done crypto */ for (mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_CRYPTO_DONE, NULL); @@ -350,7 +362,7 @@ if (crp == NULL) { DPRINTF(("%s: failed to acquire crypto descriptors\n", __func__)); - espstat.esps_crypto++; + V_espstat.esps_crypto++; m_freem(m); return ENOBUFS; } @@ -365,7 +377,7 @@ if (tc == NULL) { crypto_freereq(crp); DPRINTF(("%s: failed to allocate tdb_crypto\n", __func__)); - espstat.esps_crypto++; + V_espstat.esps_crypto++; m_freem(m); return ENOBUFS; } @@ -450,6 +462,7 @@ static int esp_input_cb(struct cryptop *crp) { + INIT_VNET_IPSEC(curvnet); u_int8_t lastthree[3], aalg[AH_HMAC_HASHLEN]; int hlen, skip, protoff, error; struct mbuf *m; @@ -474,7 +487,7 @@ sav = KEY_ALLOCSA(&tc->tc_dst, tc->tc_proto, tc->tc_spi); if (sav == NULL) { - espstat.esps_notdb++; + V_espstat.esps_notdb++; DPRINTF(("%s: SA gone during crypto (SA %s/%08lx proto %u)\n", __func__, ipsec_address(&tc->tc_dst), (u_long) ntohl(tc->tc_spi), tc->tc_proto)); @@ -502,7 +515,7 @@ return error; } - espstat.esps_noxform++; + V_espstat.esps_noxform++; DPRINTF(("%s: crypto error %d\n", __func__, 
crp->crp_etype)); error = crp->crp_etype; goto bad; @@ -510,12 +523,12 @@ /* Shouldn't happen... */ if (m == NULL) { - espstat.esps_crypto++; + V_espstat.esps_crypto++; DPRINTF(("%s: bogus returned buffer from crypto\n", __func__)); error = EINVAL; goto bad; } - espstat.esps_hist[sav->alg_enc]++; + V_espstat.esps_hist[sav->alg_enc]++; /* If authentication was performed, check now. */ if (esph != NULL) { @@ -524,7 +537,7 @@ * the verification for us. Otherwise we need to * check the authentication calculation. */ - ahstat.ahs_hist[sav->alg_auth]++; + V_ahstat.ahs_hist[sav->alg_auth]++; if (mtag == NULL) { /* Copy the authenticator from the packet */ m_copydata(m, m->m_pkthdr.len - AH_HMAC_HASHLEN, @@ -539,7 +552,7 @@ __func__, ipsec_address(&saidx->dst), (u_long) ntohl(sav->spi))); - espstat.esps_badauth++; + V_espstat.esps_badauth++; error = EACCES; goto bad; } @@ -569,7 +582,7 @@ if (ipsec_updatereplay(ntohl(seq), sav)) { DPRINTF(("%s: packet replay check for %s\n", __func__, ipsec_logsastr(sav))); - espstat.esps_replay++; + V_espstat.esps_replay++; error = ENOBUFS; goto bad; } @@ -584,7 +597,7 @@ /* Remove the ESP header and IV from the mbuf. 
*/ error = m_striphdr(m, skip, hlen); if (error) { - espstat.esps_hdrops++; + V_espstat.esps_hdrops++; DPRINTF(("%s: bad mbuf chain, SA %s/%08lx\n", __func__, ipsec_address(&sav->sah->saidx.dst), (u_long) ntohl(sav->spi))); @@ -596,7 +609,7 @@ /* Verify pad length */ if (lastthree[1] + 2 > m->m_pkthdr.len - skip) { - espstat.esps_badilen++; + V_espstat.esps_badilen++; DPRINTF(("%s: invalid padding length %d for %u byte packet " "in SA %s/%08lx\n", __func__, lastthree[1], m->m_pkthdr.len - skip, @@ -609,7 +622,7 @@ /* Verify correct decryption by checking the last padding bytes */ if ((sav->flags & SADB_X_EXT_PMASK) != SADB_X_EXT_PRAND) { if (lastthree[1] != lastthree[0] && lastthree[1] != 0) { - espstat.esps_badenc++; + V_espstat.esps_badenc++; DPRINTF(("%s: decryption failed for packet in " "SA %s/%08lx\n", __func__, ipsec_address(&sav->sah->saidx.dst), @@ -653,6 +666,7 @@ int protoff ) { + INIT_VNET_IPSEC(curvnet); struct enc_xform *espx; struct auth_hash *esph; int hlen, rlen, plen, padding, blks, alen, i, roff; @@ -696,7 +710,7 @@ else alen = 0; - espstat.esps_output++; + V_espstat.esps_output++; saidx = &sav->sah->saidx; /* Check for maximum packet size violations. */ @@ -716,7 +730,7 @@ "family %d, SA %s/%08lx\n", __func__, saidx->dst.sa.sa_family, ipsec_address(&saidx->dst), (u_long) ntohl(sav->spi))); - espstat.esps_nopf++; + V_espstat.esps_nopf++; error = EPFNOSUPPORT; goto bad; } @@ -725,19 +739,19 @@ "(len %u, max len %u)\n", __func__, ipsec_address(&saidx->dst), (u_long) ntohl(sav->spi), skip + hlen + rlen + padding + alen, maxpacketsize)); - espstat.esps_toobig++; + V_espstat.esps_toobig++; error = EMSGSIZE; goto bad; } /* Update the counters. 
*/ - espstat.esps_obytes += m->m_pkthdr.len - skip; + V_espstat.esps_obytes += m->m_pkthdr.len - skip; m = m_unshare(m, M_NOWAIT); if (m == NULL) { DPRINTF(("%s: cannot clone mbuf chain, SA %s/%08lx\n", __func__, ipsec_address(&saidx->dst), (u_long) ntohl(sav->spi))); - espstat.esps_hdrops++; + V_espstat.esps_hdrops++; error = ENOBUFS; goto bad; } @@ -748,7 +762,7 @@ DPRINTF(("%s: %u byte ESP hdr inject failed for SA %s/%08lx\n", __func__, hlen, ipsec_address(&saidx->dst), (u_long) ntohl(sav->spi))); - espstat.esps_hdrops++; /* XXX diffs from openbsd */ + V_espstat.esps_hdrops++; /* XXX diffs from openbsd */ error = ENOBUFS; goto bad; } @@ -812,7 +826,7 @@ if (crp == NULL) { DPRINTF(("%s: failed to acquire crypto descriptors\n", __func__)); - espstat.esps_crypto++; + V_espstat.esps_crypto++; error = ENOBUFS; goto bad; } @@ -841,7 +855,7 @@ if (tc == NULL) { crypto_freereq(crp); DPRINTF(("%s: failed to allocate tdb_crypto\n", __func__)); - espstat.esps_crypto++; + V_espstat.esps_crypto++; error = ENOBUFS; goto bad; } @@ -885,6 +899,7 @@ static int esp_output_cb(struct cryptop *crp) { + INIT_VNET_IPSEC(curvnet); struct tdb_crypto *tc; struct ipsecrequest *isr; struct secasvar *sav; @@ -899,7 +914,7 @@ IPSECREQUEST_LOCK(isr); sav = KEY_ALLOCSA(&tc->tc_dst, tc->tc_proto, tc->tc_spi); if (sav == NULL) { - espstat.esps_notdb++; + V_espstat.esps_notdb++; DPRINTF(("%s: SA gone during crypto (SA %s/%08lx proto %u)\n", __func__, ipsec_address(&tc->tc_dst), (u_long) ntohl(tc->tc_spi), tc->tc_proto)); @@ -922,7 +937,7 @@ return error; } - espstat.esps_noxform++; + V_espstat.esps_noxform++; DPRINTF(("%s: crypto error %d\n", __func__, crp->crp_etype)); error = crp->crp_etype; goto bad; @@ -930,14 +945,14 @@ /* Shouldn't happen... 
*/ if (m == NULL) { - espstat.esps_crypto++; + V_espstat.esps_crypto++; DPRINTF(("%s: bogus returned buffer from crypto\n", __func__)); error = EINVAL; goto bad; } - espstat.esps_hist[sav->alg_enc]++; + V_espstat.esps_hist[sav->alg_enc]++; if (sav->tdb_authalgxform != NULL) - ahstat.ahs_hist[sav->alg_auth]++; + V_ahstat.ahs_hist[sav->alg_auth]++; /* Release crypto descriptors. */ free(tc, M_XDATA); @@ -983,14 +998,19 @@ esp_output }; -static void -esp_attach(void) +static int +esp_iattach(unused) + const void *unused; { -#define MAXIV(xform) \ - if (xform.blocksize > esp_max_ivlen) \ - esp_max_ivlen = xform.blocksize \ + INIT_VNET_IPSEC(curvnet); - esp_max_ivlen = 0; + V_esp_enable = 1; + V_esp_max_ivlen = 0; + +#define MAXIV(xform) \ + if (xform.blocksize > V_esp_max_ivlen) \ + V_esp_max_ivlen = xform.blocksize \ + MAXIV(enc_xform_des); /* SADB_EALG_DESCBC */ MAXIV(enc_xform_3des); /* SADB_EALG_3DESCBC */ MAXIV(enc_xform_rijndael128); /* SADB_X_EALG_AES */ @@ -1000,7 +1020,19 @@ MAXIV(enc_xform_null); /* SADB_EALG_NULL */ MAXIV(enc_xform_camellia); /* SADB_X_EALG_CAMELLIACBC */ - xform_register(&esp_xformsw); #undef MAXIV + + return 0; +} + +static void +esp_attach(void) +{ +#ifdef VIMAGE + vnet_mod_register(&vnet_esp_modinfo); +#else + esp_iattach(NULL); +#endif + xform_register(&esp_xformsw); } SYSINIT(esp_xform_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, esp_attach, NULL); --- /u/marko/p4/head/src/sys/netipsec/xform_ipcomp.c 2007-08-31 03:48:11.000000000 +0200 +++ src/sys/netipsec/xform_ipcomp.c 2007-10-22 18:07:03.000000000 +0200 @@ -31,6 +31,7 @@ /* IP payload compression protocol (IPComp), see RFC 2393 */ #include "opt_inet.h" #include "opt_inet6.h" +#include "opt_vimage.h" #include #include @@ -41,6 +42,7 @@ #include #include #include +#include #include #include @@ -50,6 +52,7 @@ #include #include #include +#include #ifdef INET6 #include @@ -66,14 +69,20 @@ #include #include +#ifndef VIMAGE int ipcomp_enable = 0; struct ipcompstat ipcompstat; +#endif 
SYSCTL_DECL(_net_inet_ipcomp); -SYSCTL_INT(_net_inet_ipcomp, OID_AUTO, - ipcomp_enable, CTLFLAG_RW, &ipcomp_enable, 0, ""); -SYSCTL_STRUCT(_net_inet_ipcomp, IPSECCTL_STATS, - stats, CTLFLAG_RD, &ipcompstat, ipcompstat, ""); +SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipcomp, OID_AUTO, + ipcomp_enable, CTLFLAG_RW, ipcomp_enable, 0, ""); +SYSCTL_V_STRUCT(V_NET, vnet_ipsec, _net_inet_ipcomp, IPSECCTL_STATS, + stats, CTLFLAG_RD, ipcompstat, ipcompstat, ""); + +static int ipcomp_iattach(const void *); + +VNET_MOD_DECLARE_STATELESS(IPCOMP, ipcomp, ipcomp_iattach, NULL, IPSEC) static int ipcomp_input_cb(struct cryptop *crp); static int ipcomp_output_cb(struct cryptop *crp); @@ -96,6 +105,7 @@ static int ipcomp_init(struct secasvar *sav, struct xformsw *xsp) { + INIT_VNET_IPSEC(curvnet); struct comp_algo *tcomp; struct cryptoini cric; @@ -114,7 +124,7 @@ bzero(&cric, sizeof (cric)); cric.cri_alg = sav->tdb_compalgxform->type; - return crypto_newsession(&sav->tdb_cryptoid, &cric, crypto_support); + return crypto_newsession(&sav->tdb_cryptoid, &cric, V_crypto_support); } /* @@ -136,6 +146,7 @@ static int ipcomp_input(struct mbuf *m, struct secasvar *sav, int skip, int protoff) { + INIT_VNET_IPSEC(curvnet); struct tdb_crypto *tc; struct cryptodesc *crdc; struct cryptop *crp; @@ -148,7 +159,7 @@ if (crp == NULL) { m_freem(m); DPRINTF(("%s: no crypto descriptors\n", __func__)); - ipcompstat.ipcomps_crypto++; + V_ipcompstat.ipcomps_crypto++; return ENOBUFS; } /* Get IPsec-specific opaque pointer */ @@ -157,7 +168,7 @@ m_freem(m); crypto_freereq(crp); DPRINTF(("%s: cannot allocate tdb_crypto\n", __func__)); - ipcompstat.ipcomps_crypto++; + V_ipcompstat.ipcomps_crypto++; return ENOBUFS; } crdc = crp->crp_desc; @@ -208,6 +219,7 @@ static int ipcomp_input_cb(struct cryptop *crp) { + INIT_VNET_IPSEC(curvnet); struct cryptodesc *crd; struct tdb_crypto *tc; int skip, protoff; @@ -230,7 +242,7 @@ sav = KEY_ALLOCSA(&tc->tc_dst, tc->tc_proto, tc->tc_spi); if (sav == NULL) { - 
ipcompstat.ipcomps_notdb++; + V_ipcompstat.ipcomps_notdb++; DPRINTF(("%s: SA expired while in crypto\n", __func__)); error = ENOBUFS; /*XXX*/ goto bad; @@ -253,19 +265,19 @@ return error; } - ipcompstat.ipcomps_noxform++; + V_ipcompstat.ipcomps_noxform++; DPRINTF(("%s: crypto error %d\n", __func__, crp->crp_etype)); error = crp->crp_etype; goto bad; } /* Shouldn't happen... */ if (m == NULL) { - ipcompstat.ipcomps_crypto++; + V_ipcompstat.ipcomps_crypto++; DPRINTF(("%s: null mbuf returned from crypto\n", __func__)); error = EINVAL; goto bad; } - ipcompstat.ipcomps_hist[sav->alg_comp]++; + V_ipcompstat.ipcomps_hist[sav->alg_comp]++; clen = crp->crp_olen; /* Length of data after processing */ @@ -277,7 +289,7 @@ m->m_pkthdr.len = clen + hlen + skip; if (m->m_len < skip + hlen && (m = m_pullup(m, skip + hlen)) == 0) { - ipcompstat.ipcomps_hdrops++; /*XXX*/ + V_ipcompstat.ipcomps_hdrops++; /*XXX*/ DPRINTF(("%s: m_pullup failed\n", __func__)); error = EINVAL; /*XXX*/ goto bad; @@ -290,7 +302,7 @@ /* Remove the IPCOMP header */ error = m_striphdr(m, skip, hlen); if (error) { - ipcompstat.ipcomps_hdrops++; + V_ipcompstat.ipcomps_hdrops++; DPRINTF(("%s: bad mbuf chain, IPCA %s/%08lx\n", __func__, ipsec_address(&sav->sah->saidx.dst), (u_long) ntohl(sav->spi))); @@ -328,6 +340,7 @@ int protoff ) { + INIT_VNET_IPSEC(curvnet); struct secasvar *sav; struct comp_algo *ipcompx; int error, ralen, hlen, maxpacketsize, roff; @@ -348,7 +361,7 @@ ralen = m->m_pkthdr.len - skip; /* Raw payload length before comp. */ hlen = IPCOMP_HLENGTH; - ipcompstat.ipcomps_output++; + V_ipcompstat.ipcomps_output++; /* Check for maximum packet size violations. 
*/ switch (sav->sah->saidx.dst.sa.sa_family) { @@ -363,7 +376,7 @@ break; #endif /* INET6 */ default: - ipcompstat.ipcomps_nopf++; + V_ipcompstat.ipcomps_nopf++; DPRINTF(("%s: unknown/unsupported protocol family %d, " "IPCA %s/%08lx\n", __func__, sav->sah->saidx.dst.sa.sa_family, @@ -373,7 +386,7 @@ goto bad; } if (skip + hlen + ralen > maxpacketsize) { - ipcompstat.ipcomps_toobig++; + V_ipcompstat.ipcomps_toobig++; DPRINTF(("%s: packet in IPCA %s/%08lx got too big " "(len %u, max len %u)\n", __func__, ipsec_address(&sav->sah->saidx.dst), @@ -384,11 +397,11 @@ } /* Update the counters */ - ipcompstat.ipcomps_obytes += m->m_pkthdr.len - skip; + V_ipcompstat.ipcomps_obytes += m->m_pkthdr.len - skip; m = m_unshare(m, M_NOWAIT); if (m == NULL) { - ipcompstat.ipcomps_hdrops++; + V_ipcompstat.ipcomps_hdrops++; DPRINTF(("%s: cannot clone mbuf chain, IPCA %s/%08lx\n", __func__, ipsec_address(&sav->sah->saidx.dst), (u_long) ntohl(sav->spi))); @@ -399,7 +412,7 @@ /* Inject IPCOMP header */ mo = m_makespace(m, skip, hlen, &roff); if (mo == NULL) { - ipcompstat.ipcomps_wrap++; + V_ipcompstat.ipcomps_wrap++; DPRINTF(("%s: IPCOMP header inject failed for IPCA %s/%08lx\n", __func__, ipsec_address(&sav->sah->saidx.dst), (u_long) ntohl(sav->spi))); @@ -434,7 +447,7 @@ /* Get crypto descriptors */ crp = crypto_getreq(1); if (crp == NULL) { - ipcompstat.ipcomps_crypto++; + V_ipcompstat.ipcomps_crypto++; DPRINTF(("%s: failed to acquire crypto descriptor\n",__func__)); error = ENOBUFS; goto bad; @@ -454,7 +467,7 @@ tc = (struct tdb_crypto *) malloc(sizeof(struct tdb_crypto), M_XDATA, M_NOWAIT|M_ZERO); if (tc == NULL) { - ipcompstat.ipcomps_crypto++; + V_ipcompstat.ipcomps_crypto++; DPRINTF(("%s: failed to allocate tdb_crypto\n", __func__)); crypto_freereq(crp); error = ENOBUFS; @@ -488,6 +501,7 @@ static int ipcomp_output_cb(struct cryptop *crp) { + INIT_VNET_IPSEC(curvnet); struct tdb_crypto *tc; struct ipsecrequest *isr; struct secasvar *sav; @@ -504,7 +518,7 @@ 
IPSECREQUEST_LOCK(isr); sav = KEY_ALLOCSA(&tc->tc_dst, tc->tc_proto, tc->tc_spi); if (sav == NULL) { - ipcompstat.ipcomps_notdb++; + V_ipcompstat.ipcomps_notdb++; DPRINTF(("%s: SA expired while in crypto\n", __func__)); error = ENOBUFS; /*XXX*/ goto bad; @@ -523,19 +537,19 @@ error = crypto_dispatch(crp); return error; } - ipcompstat.ipcomps_noxform++; + V_ipcompstat.ipcomps_noxform++; DPRINTF(("%s: crypto error %d\n", __func__, crp->crp_etype)); error = crp->crp_etype; goto bad; } /* Shouldn't happen... */ if (m == NULL) { - ipcompstat.ipcomps_crypto++; + V_ipcompstat.ipcomps_crypto++; DPRINTF(("%s: bogus return buffer from crypto\n", __func__)); error = EINVAL; goto bad; } - ipcompstat.ipcomps_hist[sav->alg_comp]++; + V_ipcompstat.ipcomps_hist[sav->alg_comp]++; if (rlen > crp->crp_olen) { /* Adjust the length in the IP header */ @@ -552,7 +566,7 @@ break; #endif /* INET6 */ default: - ipcompstat.ipcomps_nopf++; + V_ipcompstat.ipcomps_nopf++; DPRINTF(("%s: unknown/unsupported protocol " "family %d, IPCA %s/%08lx\n", __func__, sav->sah->saidx.dst.sa.sa_family, @@ -592,9 +606,25 @@ ipcomp_output }; +static int +ipcomp_iattach(unused) + const void *unused; +{ + INIT_VNET_IPSEC(curvnet); + + V_ipcomp_enable = 0; + + return 0; +} + static void ipcomp_attach(void) { +#ifdef VIMAGE + vnet_mod_register(&vnet_ipcomp_modinfo); +#else + ipcomp_iattach(NULL); +#endif xform_register(&ipcomp_xformsw); } SYSINIT(ipcomp_xform_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ipcomp_attach, NULL); --- /u/marko/p4/head/src/sys/netipsec/xform_ipip.c 2007-11-30 21:34:34.000000000 +0100 +++ src/sys/netipsec/xform_ipip.c 2007-12-10 11:26:14.000000000 +0100 @@ -42,6 +42,7 @@ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_enc.h" +#include "opt_vimage.h" #include #include @@ -50,11 +51,13 @@ #include #include #include +#include #include #include #include #include +#include #include #include @@ -63,8 +66,10 @@ #include #include #include +#include #include +#include #include #include 
@@ -90,20 +95,26 @@ * We can control the acceptance of IP4 packets by altering the sysctl * net.inet.ipip.allow value. Zero means drop them, all else is acceptance. */ +#ifndef VIMAGE int ipip_allow = 0; struct ipipstat ipipstat; +#endif SYSCTL_DECL(_net_inet_ipip); -SYSCTL_INT(_net_inet_ipip, OID_AUTO, - ipip_allow, CTLFLAG_RW, &ipip_allow, 0, ""); -SYSCTL_STRUCT(_net_inet_ipip, IPSECCTL_STATS, - stats, CTLFLAG_RD, &ipipstat, ipipstat, ""); +SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipip, OID_AUTO, + ipip_allow, CTLFLAG_RW, ipip_allow, 0, ""); +SYSCTL_V_STRUCT(V_NET, vnet_ipsec, _net_inet_ipip, IPSECCTL_STATS, + stats, CTLFLAG_RD, ipipstat, ipipstat, ""); /* XXX IPCOMP */ #define M_IPSEC (M_AUTHIPHDR|M_AUTHIPDGM|M_DECRYPTED) static void _ipip_input(struct mbuf *m, int iphlen, struct ifnet *gifp); +static int ipip_iattach(const void *); + +VNET_MOD_DECLARE_STATELESS(IPIP, ipip, ipip_iattach, NULL, IPSEC) + #ifdef INET6 /* * Really only a wrapper for ipip_input(), for use with IPv6. @@ -155,6 +166,8 @@ static void _ipip_input(struct mbuf *m, int iphlen, struct ifnet *gifp) { + INIT_VNET_NET(curvnet); + INIT_VNET_IPSEC(curvnet); register struct sockaddr_in *sin; register struct ifnet *ifp; register struct ifaddr *ifa; @@ -170,7 +183,7 @@ u_int8_t v; int hlen; - ipipstat.ipips_ipackets++; + V_ipipstat.ipips_ipackets++; m_copydata(m, 0, 1, &v); @@ -186,7 +199,7 @@ break; #endif default: - ipipstat.ipips_family++; + V_ipipstat.ipips_family++; m_freem(m); return /* EAFNOSUPPORT */; } @@ -195,7 +208,7 @@ if (m->m_len < hlen) { if ((m = m_pullup(m, hlen)) == NULL) { DPRINTF(("%s: m_pullup (1) failed\n", __func__)); - ipipstat.ipips_hdrops++; + V_ipipstat.ipips_hdrops++; return; } } @@ -232,7 +245,7 @@ /* Sanity check */ if (m->m_pkthdr.len < sizeof(struct ip)) { - ipipstat.ipips_hdrops++; + V_ipipstat.ipips_hdrops++; m_freem(m); return; } @@ -252,7 +265,7 @@ break; #endif default: - ipipstat.ipips_family++; + V_ipipstat.ipips_family++; m_freem(m); return; /* EAFNOSUPPORT 
*/ } @@ -263,7 +276,7 @@ if (m->m_len < hlen) { if ((m = m_pullup(m, hlen)) == NULL) { DPRINTF(("%s: m_pullup (2) failed\n", __func__)); - ipipstat.ipips_hdrops++; + V_ipipstat.ipips_hdrops++; return; } } @@ -280,7 +293,7 @@ case 4: ipo = mtod(m, struct ip *); nxt = ipo->ip_p; - ip_ecn_egress(ip4_ipsec_ecn, &otos, &ipo->ip_tos); + ip_ecn_egress(V_ip4_ipsec_ecn, &otos, &ipo->ip_tos); break; #endif /* INET */ #ifdef INET6 @@ -288,7 +301,7 @@ ip6 = (struct ip6_hdr *) ipo; nxt = ip6->ip6_nxt; itos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; - ip_ecn_egress(ip6_ipsec_ecn, &otos, &itos); + ip_ecn_egress(V_ip6_ipsec_ecn, &otos, &itos); ip6->ip6_flow &= ~htonl(0xff << 20); ip6->ip6_flow |= htonl((u_int32_t) itos << 20); break; @@ -300,9 +313,9 @@ /* Check for local address spoofing. */ if ((m->m_pkthdr.rcvif == NULL || !(m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK)) && - ipip_allow != 2) { + V_ipip_allow != 2) { IFNET_RLOCK(); - TAILQ_FOREACH(ifp, &ifnet, if_link) { + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { #ifdef INET if (ipo) { @@ -314,7 +327,7 @@ if (sin->sin_addr.s_addr == ipo->ip_src.s_addr) { - ipipstat.ipips_spoof++; + V_ipipstat.ipips_spoof++; m_freem(m); IFNET_RUNLOCK(); return; @@ -331,7 +344,7 @@ sin6 = (struct sockaddr_in6 *) ifa->ifa_addr; if (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr, &ip6->ip6_src)) { - ipipstat.ipips_spoof++; + V_ipipstat.ipips_spoof++; m_freem(m); IFNET_RUNLOCK(); return; @@ -345,7 +358,7 @@ } /* Statistics */ - ipipstat.ipips_ibytes += m->m_pkthdr.len - iphlen; + V_ipipstat.ipips_ibytes += m->m_pkthdr.len - iphlen; #ifdef DEV_ENC switch (v >> 4) { @@ -391,7 +404,7 @@ } if (netisr_queue(isr, m)) { /* (0) on success. 
*/ - ipipstat.ipips_qfull++; + V_ipipstat.ipips_qfull++; DPRINTF(("%s: packet dropped because of full queue\n", __func__)); } @@ -406,6 +419,10 @@ int protoff ) { + INIT_VNET_IPSEC(curvnet); +#ifdef INET + INIT_VNET_INET(curvnet); +#endif /* INET */ struct secasvar *sav; u_int8_t tp, otos; struct secasindex *saidx; @@ -440,7 +457,7 @@ "address in SA %s/%08lx\n", __func__, ipsec_address(&saidx->dst), (u_long) ntohl(sav->spi))); - ipipstat.ipips_unspec++; + V_ipipstat.ipips_unspec++; error = EINVAL; goto bad; } @@ -448,7 +465,7 @@ M_PREPEND(m, sizeof(struct ip), M_DONTWAIT); if (m == 0) { DPRINTF(("%s: M_PREPEND failed\n", __func__)); - ipipstat.ipips_hdrops++; + V_ipipstat.ipips_hdrops++; error = ENOBUFS; goto bad; } @@ -458,7 +475,7 @@ ipo->ip_v = IPVERSION; ipo->ip_hl = 5; ipo->ip_len = htons(m->m_pkthdr.len); - ipo->ip_ttl = ip_defttl; + ipo->ip_ttl = V_ip_defttl; ipo->ip_sum = 0; ipo->ip_src = saidx->src.sin.sin_addr; ipo->ip_dst = saidx->dst.sin.sin_addr; @@ -517,7 +534,7 @@ "address in SA %s/%08lx\n", __func__, ipsec_address(&saidx->dst), (u_long) ntohl(sav->spi))); - ipipstat.ipips_unspec++; + V_ipipstat.ipips_unspec++; error = ENOBUFS; goto bad; } @@ -532,7 +549,7 @@ M_PREPEND(m, sizeof(struct ip6_hdr), M_DONTWAIT); if (m == 0) { DPRINTF(("%s: M_PREPEND failed\n", __func__)); - ipipstat.ipips_hdrops++; + V_ipipstat.ipips_hdrops++; error = ENOBUFS; goto bad; } @@ -543,7 +560,7 @@ ip6o->ip6_vfc &= ~IPV6_VERSION_MASK; ip6o->ip6_vfc |= IPV6_VERSION; ip6o->ip6_plen = htons(m->m_pkthdr.len); - ip6o->ip6_hlim = ip_defttl; + ip6o->ip6_hlim = V_ip_defttl; ip6o->ip6_dst = saidx->dst.sin6.sin6_addr; ip6o->ip6_src = saidx->src.sin6.sin6_addr; @@ -582,12 +599,12 @@ nofamily: DPRINTF(("%s: unsupported protocol family %u\n", __func__, saidx->dst.sa.sa_family)); - ipipstat.ipips_family++; + V_ipipstat.ipips_family++; error = EAFNOSUPPORT; /* XXX diffs from openbsd */ goto bad; } - ipipstat.ipips_opackets++; + V_ipipstat.ipips_opackets++; *mp = m; #ifdef INET @@ -597,7 
+614,7 @@ tdb->tdb_cur_bytes += m->m_pkthdr.len - sizeof(struct ip); #endif - ipipstat.ipips_obytes += m->m_pkthdr.len - sizeof(struct ip); + V_ipipstat.ipips_obytes += m->m_pkthdr.len - sizeof(struct ip); } #endif /* INET */ @@ -608,7 +625,7 @@ tdb->tdb_cur_bytes += m->m_pkthdr.len - sizeof(struct ip6_hdr); #endif - ipipstat.ipips_obytes += + V_ipipstat.ipips_obytes += m->m_pkthdr.len - sizeof(struct ip6_hdr); } #endif /* INET6 */ @@ -653,21 +670,25 @@ extern struct domain inetdomain; static struct protosw ipe4_protosw = -{ SOCK_RAW, &inetdomain, IPPROTO_IPV4, PR_ATOMIC|PR_ADDR|PR_LASTHDR, - ip4_input, - 0, 0, rip_ctloutput, - 0, - 0, 0, 0, 0, - &rip_usrreqs +{ + .pr_type = SOCK_RAW, + .pr_domain = &inetdomain, + .pr_protocol = IPPROTO_IPV4, + .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, + .pr_input = ip4_input, + .pr_ctloutput = rip_ctloutput, + .pr_usrreqs = &rip_usrreqs }; #ifdef INET6 static struct ip6protosw ipe6_protosw = -{ SOCK_RAW, &inetdomain, IPPROTO_IPV6, PR_ATOMIC|PR_ADDR|PR_LASTHDR, - ip4_input6, - 0, 0, rip_ctloutput, - 0, - 0, 0, 0, 0, - &rip_usrreqs +{ + .pr_type = SOCK_RAW, + .pr_domain = &inetdomain, + .pr_protocol = IPPROTO_IPV6, + .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, + .pr_input = ip4_input6, + .pr_ctloutput = rip_ctloutput, + .pr_usrreqs = &rip_usrreqs }; #endif @@ -686,17 +707,39 @@ return ((m->m_flags & M_IPSEC) != 0 ? 
1 : 0); } -static void -ipe4_attach(void) +static int +ipip_iattach(unused) + const void *unused; { + INIT_VNET_IPSEC(curvnet); + + V_ipip_allow = 0; + +#ifdef VIMAGE + if (!IS_DEFAULT_VNET(curvnet)) + return 0; +#endif + xform_register(&ipe4_xformsw); /* attach to encapsulation framework */ /* XXX save return cookie for detach on module remove */ (void) encap_attach_func(AF_INET, -1, - ipe4_encapcheck, &ipe4_protosw, NULL); + ipe4_encapcheck, &ipe4_protosw, NULL); #ifdef INET6 (void) encap_attach_func(AF_INET6, -1, - ipe4_encapcheck, (struct protosw *)&ipe6_protosw, NULL); + ipe4_encapcheck, (struct protosw *)&ipe6_protosw, NULL); +#endif + + return 0; +} + +static void +ipe4_attach(void) +{ +#ifdef VIMAGE + vnet_mod_register(&vnet_ipip_modinfo); +#else + ipip_iattach(NULL); #endif } SYSINIT(ipe4_xform_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ipe4_attach, NULL); --- /u/marko/p4/head/src/sys/nfsclient/nfs_diskless.c 2008-02-27 18:29:16.000000000 +0100 +++ src/sys/nfsclient/nfs_diskless.c 2008-02-27 11:50:11.000000000 +0100 @@ -36,14 +36,17 @@ __FBSDID("$FreeBSD: src/sys/nfsclient/nfs_diskless.c,v 1.19 2008/02/11 23:28:35 kris Exp $"); #include "opt_bootp.h" +#include "opt_vimage.h" #include #include #include #include #include - #include +#include + +#include #include #include #include @@ -148,6 +151,7 @@ void nfs_setup_diskless(void) { + INIT_VNET_NET(curvnet); struct nfs_diskless *nd = &nfs_diskless; struct ifnet *ifp; struct ifaddr *ifa; @@ -176,7 +180,7 @@ } ifa = NULL; IFNET_RLOCK(); - TAILQ_FOREACH(ifp, &ifnet, if_link) { + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family == AF_LINK) { sdl = (struct sockaddr_dl *)ifa->ifa_addr; --- /u/marko/p4/head/src/sys/nfsclient/nfs_socket.c 2008-02-27 18:29:17.000000000 +0100 +++ src/sys/nfsclient/nfs_socket.c 2008-02-27 18:05:41.000000000 +0100 @@ -40,6 +40,7 @@ */ #include "opt_inet6.h" +#include "opt_vimage.h" #include #include @@ -58,6 +59,7 @@ 
#include #include #include +#include #include #include @@ -1506,6 +1508,7 @@ mtx_unlock(&nmp->nm_mtx); continue; } + CURVNET_SET(so->so_vnet); /* * If there is enough space and the window allows.. * Resend it @@ -1571,6 +1574,7 @@ mtx_unlock(&rep->r_mtx); mtx_unlock(&nmp->nm_mtx); } + CURVNET_RESTORE(); } mtx_unlock(&nfs_reqq_mtx); callout_reset(&nfs_callout, nfs_ticks, nfs_timer, NULL); --- /u/marko/p4/head/src/sys/nfsclient/nfs_vfsops.c 2008-02-27 18:29:18.000000000 +0100 +++ src/sys/nfsclient/nfs_vfsops.c 2008-02-27 11:50:18.000000000 +0100 @@ -38,6 +38,7 @@ #include "opt_bootp.h" #include "opt_nfsroot.h" +#include "opt_vimage.h" #include #include @@ -57,6 +58,7 @@ #include #include #include +#include #include #include @@ -399,6 +401,7 @@ int nfs_mountroot(struct mount *mp, struct thread *td) { + INIT_VPROCG(TD_TO_VPROCG(td)); struct nfsv3_diskless *nd = &nfsv3_diskless; struct socket *so; struct vnode *vp; @@ -408,14 +411,17 @@ char buf[128]; char *cp; + CURVNET_SET(TD_TO_VNET(td)); #if defined(BOOTP_NFSROOT) && defined(BOOTP) bootpc_init(); /* use bootp to get nfs_diskless filled in */ #elif defined(NFS_ROOT) nfs_setup_diskless(); #endif - if (nfs_diskless_valid == 0) + if (nfs_diskless_valid == 0) { + CURVNET_RESTORE(); return (-1); + } if (nfs_diskless_valid == 1) nfs_convert_diskless(); @@ -497,6 +503,7 @@ printf("NFS ROOT: %s\n", buf); if ((error = nfs_mountdiskless(buf, &nd->root_saddr, &nd->root_args, td, &vp, mp)) != 0) { + CURVNET_RESTORE(); return (error); } @@ -505,12 +512,14 @@ * set hostname here and then let the "/etc/rc.xxx" files * mount the right /var based upon its preset value. 
*/ - bcopy(nd->my_hostnam, hostname, MAXHOSTNAMELEN); - hostname[MAXHOSTNAMELEN - 1] = '\0'; + bcopy(nd->my_hostnam, V_hostname, MAXHOSTNAMELEN); + V_hostname[MAXHOSTNAMELEN - 1] = '\0'; for (i = 0; i < MAXHOSTNAMELEN; i++) - if (hostname[i] == '\0') + if (V_hostname[i] == '\0') break; inittodr(ntohl(nd->root_time)); + + CURVNET_RESTORE(); return (0); } --- /u/marko/p4/head/src/sys/nfsclient/nfs_vnops.c 2008-02-27 18:29:20.000000000 +0100 +++ src/sys/nfsclient/nfs_vnops.c 2008-02-27 11:50:20.000000000 +0100 @@ -40,6 +40,7 @@ */ #include "opt_inet.h" +#include "opt_vimage.h" #include #include @@ -60,6 +61,7 @@ #include #include #include +#include #include #include @@ -80,6 +82,8 @@ #include #include + +#include #include #include @@ -1386,15 +1390,18 @@ if (v3) { tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED); if (fmode & O_EXCL) { + CURVNET_SET(VFSTONFS(dvp->v_mount)->nm_so->so_vnet); *tl = txdr_unsigned(NFSV3CREATE_EXCLUSIVE); tl = nfsm_build(u_int32_t *, NFSX_V3CREATEVERF); #ifdef INET - if (!TAILQ_EMPTY(&in_ifaddrhead)) - *tl++ = IA_SIN(TAILQ_FIRST(&in_ifaddrhead))->sin_addr.s_addr; + INIT_VNET_INET(curvnet); + if (!TAILQ_EMPTY(&V_in_ifaddrhead)) + *tl++ = IA_SIN(TAILQ_FIRST(&V_in_ifaddrhead))->sin_addr.s_addr; else #endif *tl++ = create_verf; *tl = ++create_verf; + CURVNET_RESTORE(); } else { *tl = txdr_unsigned(NFSV3CREATE_UNCHECKED); nfsm_v3attrbuild(vap, FALSE); --- /u/marko/p4/head/src/sys/security/audit/audit_worker.c 2008-02-27 18:29:25.000000000 +0100 +++ src/sys/security/audit/audit_worker.c 2008-02-27 11:51:04.000000000 +0100 @@ -1,6 +1,6 @@ /* * Copyright (c) 1999-2005 Apple Computer, Inc. - * Copyright (c) 2006-2008 Robert N. M. Watson + * Copyright (c) 2006 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -27,7 +27,7 @@ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* - * $FreeBSD: src/sys/security/audit/audit_worker.c,v 1.20 2008/02/27 17:12:22 rwatson Exp $ + * $FreeBSD: src/sys/security/audit/audit_worker.c,v 1.19 2008/01/13 14:44:13 attilio Exp $ */ #include @@ -48,7 +48,6 @@ #include #include #include -#include #include #include #include @@ -76,18 +75,31 @@ static struct proc *audit_thread; /* - * audit_cred and audit_vp are the stored credential and vnode to use for - * active audit trail. They are protected by audit_worker_sx, which will be - * held across all I/O and all rotation to prevent them from being replaced - * (rotated) while in use. The audit_file_rotate_wait flag is set when the - * kernel has delivered a trigger to auditd to rotate the trail, and is - * cleared when the next rotation takes place. It is also protected by - * audit_worker_sx. + * When an audit log is rotated, the actual rotation must be performed by the + * audit worker thread, as it may have outstanding writes on the current + * audit log. audit_replacement_vp holds the vnode replacing the current + * vnode. We can't let more than one replacement occur at a time, so if more + * than one thread requests a replacement, only one can have the replacement + * "in progress" at any given moment. If a thread tries to replace the audit + * vnode and discovers a replacement is already in progress (i.e., + * audit_replacement_flag != 0), then it will sleep on audit_replacement_cv + * waiting its turn to perform a replacement. When a replacement is + * completed, this cv is signalled by the worker thread so a waiting thread + * can start another replacement. We also store a credential to perform + * audit log write operations with. + * + * The current credential and vnode are thread-local to audit_worker. 
*/ -static int audit_file_rotate_wait; -static struct sx audit_worker_sx; -static struct ucred *audit_cred; -static struct vnode *audit_vp; +static struct cv audit_replacement_cv; + +static int audit_replacement_flag; +static struct vnode *audit_replacement_vp; +static struct ucred *audit_replacement_cred; + +/* + * Flags related to Kernel->user-space communication. + */ +static int audit_file_rotate_wait; /* * Write an audit record to a file, performed as the last stage after both @@ -98,8 +110,8 @@ * the audit daemon, since the message is asynchronous anyway. */ static void -audit_record_write(struct vnode *vp, struct ucred *cred, void *data, - size_t len) +audit_record_write(struct vnode *vp, struct ucred *cred, struct thread *td, + void *data, size_t len) { static struct timeval last_lowspace_trigger; static struct timeval last_fail; @@ -110,8 +122,6 @@ struct vattr vattr; long temp; - sx_assert(&audit_worker_sx, SA_LOCKED); /* audit_file_rotate_wait. */ - if (vp == NULL) return; @@ -123,11 +133,11 @@ * that we know how we're doing on space. Consider failure of these * operations to indicate a future inability to write to the file. 
*/ - error = VFS_STATFS(vp->v_mount, mnt_stat, curthread); + error = VFS_STATFS(vp->v_mount, mnt_stat, td); if (error) goto fail; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); - error = VOP_GETATTR(vp, &vattr, cred, curthread); + error = VOP_GETATTR(vp, &vattr, cred, td); VOP_UNLOCK(vp, 0); if (error) goto fail; @@ -190,8 +200,6 @@ */ if ((audit_fstat.af_filesz != 0) && (audit_file_rotate_wait == 0) && (vattr.va_size >= audit_fstat.af_filesz)) { - sx_assert(&audit_worker_sx, SA_XLOCKED); - audit_file_rotate_wait = 1; (void)send_trigger(AUDIT_TRIGGER_ROTATE_KERNEL); } @@ -226,7 +234,7 @@ } error = vn_rdwr(UIO_WRITE, vp, data, len, (off_t)0, UIO_SYSSPACE, - IO_APPEND|IO_UNIT, cred, NULL, NULL, curthread); + IO_APPEND|IO_UNIT, cred, NULL, NULL, td); if (error == ENOSPC) goto fail_enospc; else if (error) @@ -244,7 +252,7 @@ if (audit_in_failure) { if (audit_q_len == 0 && audit_pre_q_len == 0) { VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK); - (void)VOP_FSYNC(vp, MNT_WAIT, curthread); + (void)VOP_FSYNC(vp, MNT_WAIT, td); VOP_UNLOCK(vp, 0); panic("Audit store overflow; record queue drained."); } @@ -261,7 +269,7 @@ */ if (audit_fail_stop) { VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK); - (void)VOP_FSYNC(vp, MNT_WAIT, curthread); + (void)VOP_FSYNC(vp, MNT_WAIT, td); VOP_UNLOCK(vp, 0); panic("Audit log space exhausted and fail-stop set."); } @@ -276,7 +284,7 @@ */ if (audit_panic_on_write_fail) { VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK); - (void)VOP_FSYNC(vp, MNT_WAIT, curthread); + (void)VOP_FSYNC(vp, MNT_WAIT, td); VOP_UNLOCK(vp, 0); panic("audit_worker: write error %d\n", error); } else if (ppsratecheck(&last_fail, &cur_fail, 1)) @@ -285,6 +293,62 @@ } /* + * If an appropriate signal has been received rotate the audit log based on + * the global replacement variables. Signal consumers as needed that the + * rotation has taken place. + * + * The global variables and CVs used to signal the audit_worker to perform a + * rotation are essentially a message queue of depth 1. 
It would be much + * nicer to actually use a message queue. + */ +static void +audit_worker_rotate(struct ucred **audit_credp, struct vnode **audit_vpp, + struct thread *audit_td) +{ + int do_replacement_signal, vfslocked; + struct ucred *old_cred; + struct vnode *old_vp; + + mtx_assert(&audit_mtx, MA_OWNED); + + do_replacement_signal = 0; + while (audit_replacement_flag != 0) { + old_cred = *audit_credp; + old_vp = *audit_vpp; + *audit_credp = audit_replacement_cred; + *audit_vpp = audit_replacement_vp; + audit_replacement_cred = NULL; + audit_replacement_vp = NULL; + audit_replacement_flag = 0; + + audit_enabled = (*audit_vpp != NULL); + + if (old_vp != NULL) { + mtx_unlock(&audit_mtx); + vfslocked = VFS_LOCK_GIANT(old_vp->v_mount); + vn_close(old_vp, AUDIT_CLOSE_FLAGS, old_cred, + audit_td); + VFS_UNLOCK_GIANT(vfslocked); + crfree(old_cred); + mtx_lock(&audit_mtx); + old_cred = NULL; + old_vp = NULL; + } + do_replacement_signal = 1; + } + + /* + * Signal that replacements have occurred to wake up and start any + * other replacements started in parallel. We can continue about our + * business in the meantime. We broadcast so that both new + * replacements can be inserted, but also so that the source(s) of + * replacement can return successfully. + */ + if (do_replacement_signal) + cv_broadcast(&audit_replacement_cv); +} + +/* * Given a kernel audit record, process as required. Kernel audit records * are converted to one, or possibly two, BSM records, depending on whether * there is a user audit record present also. Kernel records need be @@ -292,38 +356,23 @@ * written to disk, and audit pipes.
*/ static void -audit_worker_process_record(struct kaudit_record *ar) +audit_worker_process_record(struct vnode *audit_vp, struct ucred *audit_cred, + struct thread *audit_td, struct kaudit_record *ar) { struct au_record *bsm; au_class_t class; au_event_t event; au_id_t auid; int error, sorf; - int trail_locked; - - /* - * We hold the audit_worker_sx lock over both writes, if there are - * two, so that the two records won't be split across a rotation and - * end up in two different trail files. - */ - if (((ar->k_ar_commit & AR_COMMIT_USER) && - (ar->k_ar_commit & AR_PRESELECT_USER_TRAIL)) || - (ar->k_ar_commit & AR_PRESELECT_TRAIL)) { - sx_xlock(&audit_worker_sx); - trail_locked = 1; - } else - trail_locked = 0; /* * First, handle the user record, if any: commit to the system trail * and audit pipes as selected. */ if ((ar->k_ar_commit & AR_COMMIT_USER) && - (ar->k_ar_commit & AR_PRESELECT_USER_TRAIL)) { - sx_assert(&audit_worker_sx, SA_XLOCKED); - audit_record_write(audit_vp, audit_cred, ar->k_udata, - ar->k_ulen); - } + (ar->k_ar_commit & AR_PRESELECT_USER_TRAIL)) + audit_record_write(audit_vp, audit_cred, audit_td, + ar->k_udata, ar->k_ulen); if ((ar->k_ar_commit & AR_COMMIT_USER) && (ar->k_ar_commit & AR_PRESELECT_USER_PIPE)) @@ -332,7 +381,7 @@ if (!(ar->k_ar_commit & AR_COMMIT_KERNEL) || ((ar->k_ar_commit & AR_PRESELECT_PIPE) == 0 && (ar->k_ar_commit & AR_PRESELECT_TRAIL) == 0)) - goto out; + return; auid = ar->k_ar.ar_subj_auid; event = ar->k_ar.ar_event; @@ -345,11 +394,11 @@ error = kaudit_to_bsm(ar, &bsm); switch (error) { case BSM_NOAUDIT: - goto out; + return; case BSM_FAILURE: printf("audit_worker_process_record: BSM_FAILURE\n"); - goto out; + return; case BSM_SUCCESS: break; @@ -358,10 +407,9 @@ panic("kaudit_to_bsm returned %d", error); } - if (ar->k_ar_commit & AR_PRESELECT_TRAIL) { - sx_assert(&audit_worker_sx, SA_XLOCKED); - audit_record_write(audit_vp, audit_cred, bsm->data, bsm->len); - } + if (ar->k_ar_commit & AR_PRESELECT_TRAIL) + 
audit_record_write(audit_vp, audit_cred, audit_td, bsm->data, + bsm->len); if (ar->k_ar_commit & AR_PRESELECT_PIPE) audit_pipe_submit(auid, event, class, sorf, @@ -369,39 +417,50 @@ bsm->len); kau_free(bsm); -out: - if (trail_locked) - sx_xunlock(&audit_worker_sx); } /* * The audit_worker thread is responsible for watching the event queue, * dequeueing records, converting them to BSM format, and committing them to * disk. In order to minimize lock thrashing, records are dequeued in sets - * to a thread-local work queue. - * - * Note: this means that the effect bound on the size of the pending record - * queue is 2x the length of the global queue. + * to a thread-local work queue. In addition, the audit_worker performs the + * actual exchange of the audit log vnode pointer, as audit_vp is a thread-local + * variable. */ static void audit_worker(void *arg) { struct kaudit_queue ar_worklist; struct kaudit_record *ar; + struct ucred *audit_cred; + struct thread *audit_td; + struct vnode *audit_vp; int lowater_signal; + /* + * These are thread-local variables requiring no synchronization. + */ TAILQ_INIT(&ar_worklist); + audit_cred = NULL; + audit_td = curthread; + audit_vp = NULL; + mtx_lock(&audit_mtx); while (1) { mtx_assert(&audit_mtx, MA_OWNED); /* - * Wait for a record. + * Wait for record or rotation events. */ - while (TAILQ_EMPTY(&audit_q)) + while (!audit_replacement_flag && TAILQ_EMPTY(&audit_q)) cv_wait(&audit_worker_cv, &audit_mtx); /* + * First priority: replace the audit log target if requested. + */ + audit_worker_rotate(&audit_cred, &audit_vp, audit_td); + + /* * If there are records in the global audit record queue, * transfer them to a thread-local queue and process them * one by one.
If we cross the low watermark threshold, @@ -422,7 +481,8 @@ mtx_unlock(&audit_mtx); while ((ar = TAILQ_FIRST(&ar_worklist))) { TAILQ_REMOVE(&ar_worklist, ar, k_q); - audit_worker_process_record(ar); + audit_worker_process_record(audit_vp, audit_cred, + audit_td, ar); audit_free(ar); } mtx_lock(&audit_mtx); @@ -432,45 +492,50 @@ /* * audit_rotate_vnode() is called by a user or kernel thread to configure or * de-configure auditing on a vnode. The arguments are the replacement - * credential (referenced) and vnode (referenced and opened) to substitute - * for the current credential and vnode, if any. If either is set to NULL, - * both should be NULL, and this is used to indicate that audit is being - * disabled. Any previous cred/vnode will be closed and freed. We re-enable - * generating rotation requests to auditd. + * credential and vnode to substitute for the current credential and vnode, + * if any. If either is set to NULL, both should be NULL, and this is used + * to indicate that audit is being disabled. The real work is done in the + * audit_worker thread, but audit_rotate_vnode() waits synchronously for that + * to complete. + * + * The vnode should be referenced and opened by the caller. The credential + * should be referenced. audit_rotate_vnode() will own both references as of + * this call, so the caller should not release either. + * + * XXXAUDIT: Review synchronize communication logic. Really, this is a + * message queue of depth 1. We are essentially acquiring ownership of the + * communications queue, inserting our message, and waiting for an + * acknowledgement. */ void audit_rotate_vnode(struct ucred *cred, struct vnode *vp) { - struct ucred *old_audit_cred; - struct vnode *old_audit_vp; - int vfslocked; - KASSERT((cred != NULL && vp != NULL) || (cred == NULL && vp == NULL), - ("audit_rotate_vnode: cred %p vp %p", cred, vp)); + /* + * If other parallel log replacements have been requested, we wait + * until they've finished before continuing. 
+ */ + mtx_lock(&audit_mtx); + while (audit_replacement_flag != 0) + cv_wait(&audit_replacement_cv, &audit_mtx); + audit_replacement_cred = cred; + audit_replacement_flag = 1; + audit_replacement_vp = vp; /* - * Rotate the vnode/cred, and clear the rotate flag so that we will - * send a rotate trigger if the new file fills. + * Wake up the audit worker to perform the exchange once we release + * the mutex. */ - sx_xlock(&audit_worker_sx); - old_audit_cred = audit_cred; - old_audit_vp = audit_vp; - audit_cred = cred; - audit_vp = vp; - audit_file_rotate_wait = 0; - audit_enabled = (audit_vp != NULL); - sx_xunlock(&audit_worker_sx); + cv_signal(&audit_worker_cv); /* - * If there was an old vnode/credential, close and free. + * Wait for the audit_worker to broadcast that a replacement has + * taken place; we know that once this has happened, our vnode has + * been replaced in, so we can return successfully. */ - if (old_audit_vp != NULL) { - vfslocked = VFS_LOCK_GIANT(old_audit_vp->v_mount); - vn_close(old_audit_vp, AUDIT_CLOSE_FLAGS, old_audit_cred, - curthread); - VFS_UNLOCK_GIANT(vfslocked); - crfree(old_audit_cred); - } + cv_wait(&audit_replacement_cv, &audit_mtx); + audit_file_rotate_wait = 0; /* We can now request another rotation */ + mtx_unlock(&audit_mtx); } void @@ -478,7 +543,7 @@ { int error; - sx_init(&audit_worker_sx, "audit_worker_sx"); + cv_init(&audit_replacement_cv, "audit_replacement_cv"); error = kproc_create(audit_worker, NULL, &audit_thread, RFHIGHPID, 0, "audit"); if (error) --- /u/marko/p4/head/src/sys/sys/domain.h 2007-08-31 03:48:34.000000000 +0200 +++ src/sys/sys/domain.h 2007-10-05 12:27:49.000000000 +0200 @@ -48,6 +48,8 @@ char *dom_name; void (*dom_init) /* initialize domain data structures */ (void); + void (*dom_destroy) /* cleanup structures / state */ + (void); int (*dom_externalize) /* externalize access rights */ (struct mbuf *, struct mbuf **); void (*dom_dispose) /* dispose of internalized rights */ @@ -56,6 +58,8 @@ struct domain 
*dom_next; int (*dom_rtattach) /* initialize routing table */ (void **, int); + int (*dom_rtdetach) /* clean up routing table */ + (void **, int); int dom_rtoffset; /* an arg to rtattach, in bits */ int dom_maxrtkey; /* for routing layer */ void *(*dom_ifattach)(struct ifnet *); --- /u/marko/p4/head/src/sys/sys/kernel.h 2007-12-27 19:33:27.000000000 +0100 +++ src/sys/sys/kernel.h 2008-01-14 19:24:07.000000000 +0100 @@ -57,8 +57,10 @@ /* 1.1 */ extern unsigned long hostid; extern char hostuuid[64]; +#ifndef VIMAGE extern char hostname[MAXHOSTNAMELEN]; extern char domainname[MAXHOSTNAMELEN]; +#endif extern char kernelname[MAXPATHLEN]; extern int tick; /* usec per tick (1000000 / hz) */ @@ -116,6 +118,7 @@ SI_SUB_MAC = 0x2180000, /* TrustedBSD MAC subsystem */ SI_SUB_MAC_POLICY = 0x21C0000, /* TrustedBSD MAC policies */ SI_SUB_MAC_LATE = 0x21D0000, /* TrustedBSD MAC subsystem */ + SI_SUB_VIMAGE = 0x21E0000, /* vimage 0 */ SI_SUB_INTRINSIC = 0x2200000, /* proc 0*/ SI_SUB_VM_CONF = 0x2300000, /* config VM, set limits*/ SI_SUB_DDB_SERVICES = 0x2380000, /* capture, scripting, etc. */ @@ -163,6 +166,7 @@ SI_SUB_KTHREAD_BUF = 0xea00000, /* buffer daemon*/ SI_SUB_KTHREAD_UPDATE = 0xec00000, /* update daemon*/ SI_SUB_KTHREAD_IDLE = 0xee00000, /* idle procs*/ + SI_SUB_VIMAGE_DONE = 0xef00000, /* clear curvnet*/ SI_SUB_SMP = 0xf000000, /* start the APs*/ SI_SUB_RUN_SCHEDULER = 0xfffffff /* scheduler*/ }; --- /u/marko/p4/head/src/sys/sys/mbuf.h 2008-02-03 08:16:02.000000000 +0100 +++ src/sys/sys/mbuf.h 2008-02-27 11:51:30.000000000 +0100 @@ -194,6 +194,7 @@ #define M_VLANTAG 0x10000 /* ether_vtag is valid */ #define M_PROMISC 0x20000 /* packet was not for us */ #define M_NOFREE 0x40000 /* do not free mbuf - it is embedded in the cluster */ +#define M_REMOTE_VNET 0x80000 /* mbuf crossed boundary between two vnets */ /* * External buffer types: identify ext_buf type. 
@@ -216,7 +217,7 @@ #define M_COPYFLAGS (M_PKTHDR|M_EOR|M_RDONLY|M_PROTO1|M_PROTO1|M_PROTO2|\ M_PROTO3|M_PROTO4|M_PROTO5|M_SKIP_FIREWALL|\ M_BCAST|M_MCAST|M_FRAG|M_FIRSTFRAG|M_LASTFRAG|\ - M_VLANTAG|M_PROMISC) + M_VLANTAG|M_PROMISC|M_REMOTE_VNET) /* * Flags to purge when crossing layers. --- /u/marko/p4/head/src/sys/sys/proc.h 2008-02-27 18:29:26.000000000 +0100 +++ src/sys/sys/proc.h 2008-02-27 11:51:37.000000000 +0100 @@ -301,6 +301,9 @@ struct kaudit_record *td_ar; /* (k) Active audit record, if any. */ int td_syscalls; /* per-thread syscall count (used by NFS :)) */ struct lpohead td_lprof[2]; /* (a) lock profiling objects. */ + + struct vnet *td_vnet; /* (*) Effective vnet */ + const char *td_vnet_lpush; /* (*) Debugging vnet push / pop */ }; struct mtx *thread_lock_block(struct thread *); --- /u/marko/p4/head/src/sys/sys/protosw.h 2007-08-31 03:48:35.000000000 +0200 +++ src/sys/sys/protosw.h 2007-10-05 12:27:51.000000000 +0200 @@ -72,6 +72,7 @@ typedef void pr_ctlinput_t (int, struct sockaddr *, void *); typedef int pr_ctloutput_t (struct socket *, struct sockopt *); typedef void pr_init_t (void); +typedef void pr_destroy_t (void); typedef void pr_fasttimo_t (void); typedef void pr_slowtimo_t (void); typedef void pr_drain_t (void); @@ -93,6 +94,7 @@ pr_usrreq_t *pr_ousrreq; /* utility hooks */ pr_init_t *pr_init; + pr_destroy_t *pr_destroy; pr_fasttimo_t *pr_fasttimo; /* fast timeout (200ms) */ pr_slowtimo_t *pr_slowtimo; /* slow timeout (500ms) */ pr_drain_t *pr_drain; /* flush any excess space possible */ --- /u/marko/p4/head/src/sys/sys/resource.h 2007-11-30 21:34:35.000000000 +0100 +++ src/sys/sys/resource.h 2007-12-10 11:26:17.000000000 +0100 @@ -153,7 +153,9 @@ #ifdef _KERNEL +#ifndef VIMAGE extern struct loadavg averunnable; +#endif void read_cpu_time(long *cp_time); /* Writes array of CPUSTATES */ #else --- /u/marko/p4/head/src/sys/sys/sched.h 2007-08-31 03:48:36.000000000 +0200 +++ src/sys/sys/sched.h 2007-10-13 14:57:16.000000000 +0200 @@ -63,6 
+63,9 @@ #define _SCHED_H_ #ifdef _KERNEL + +struct vprocg; + /* * General scheduling info. * @@ -72,7 +75,12 @@ * sched_runnable: * Runnable threads for this processor. */ +#ifdef VIMAGE +int sched_load(struct vprocg *); +void sched_load_reassign(struct vprocg *, struct vprocg *); +#else int sched_load(void); +#endif int sched_rr_interval(void); int sched_runnable(void); --- /u/marko/p4/head/src/sys/sys/socketvar.h 2008-02-27 18:29:26.000000000 +0100 +++ src/sys/sys/socketvar.h 2008-02-27 11:51:43.000000000 +0100 @@ -39,6 +39,8 @@ #include #include +struct vnet; + /* * Kernel structure per socket. * Contains send and receive buffer queues, @@ -66,6 +68,7 @@ short so_state; /* (b) internal state flags SS_* */ int so_qstate; /* (e) internal state flags SQ_* */ void *so_pcb; /* protocol control block */ + struct vnet *so_vnet; /* network stack instance */ struct protosw *so_proto; /* (a) protocol handle */ /* * Variables for connection queuing. --- /u/marko/p4/head/src/sys/sys/sockio.h 2007-08-31 03:48:36.000000000 +0200 +++ src/sys/sys/sockio.h 2007-10-05 12:27:52.000000000 +0200 @@ -108,6 +108,10 @@ #define SIOCGPRIVATE_0 _IOWR('i', 80, struct ifreq) /* device private 0 */ #define SIOCGPRIVATE_1 _IOWR('i', 81, struct ifreq) /* device private 1 */ +#define SIOCSPVIMAGE _IOW('i', 101, struct vi_req) /* set proc vimage */ +#define SIOCGPVIMAGE _IOWR('i', 102, struct vi_req) /* get proc vimage */ +#define SIOCSIFVIMAGE _IOWR('i', 103, struct vi_req) /* set ifc vi/net */ + #define SIOCSDRVSPEC _IOW('i', 123, struct ifdrv) /* set driver-specific parameters */ #define SIOCGDRVSPEC _IOWR('i', 123, struct ifdrv) /* get driver-specific --- /u/marko/p4/head/src/sys/sys/sysctl.h 2008-01-04 13:50:41.000000000 +0100 +++ src/sys/sys/sysctl.h 2008-02-27 18:30:30.000000000 +0100 @@ -115,6 +115,9 @@ #define SYSCTL_HANDLER_ARGS struct sysctl_oid *oidp, void *arg1, int arg2, \ struct sysctl_req *req +#define SYSCTL_HANDLER_V_ARGS struct sysctl_oid *oidp, void *arg1, int arg2, \ + 
struct sysctl_req *req, int subs, int mod + /* definitions for sysctl_req 'lock' member */ #define REQ_UNLOCKED 0 /* not locked and not wired */ #define REQ_LOCKED 1 /* locked and not wired */ @@ -162,6 +165,24 @@ const char *oid_fmt; int oid_refcnt; const char *oid_descr; + short oid_v_subs; + short oid_v_mod; +}; + +struct sysctl_v_oid { + struct sysctl_oid_list *oid_parent; + SLIST_ENTRY(sysctl_oid) oid_link; + int oid_number; + u_int oid_kind; + void *oid_arg1; + int oid_arg2; + const char *oid_name; + int (*oid_handler)(SYSCTL_HANDLER_V_ARGS); + const char *oid_fmt; + int oid_refcnt; + const char *oid_descr; + short oid_v_subs; + short oid_v_mod; }; #define SYSCTL_IN(r, p, l) (r->newfunc)(r, p, l) @@ -175,6 +196,10 @@ int sysctl_handle_string(SYSCTL_HANDLER_ARGS); int sysctl_handle_opaque(SYSCTL_HANDLER_ARGS); +int sysctl_handle_v_int(SYSCTL_HANDLER_V_ARGS); +int sysctl_handle_v_string(SYSCTL_HANDLER_V_ARGS); +int sysctl_handle_v_opaque(SYSCTL_HANDLER_V_ARGS); + /* * These functions are used to add/remove an oid from the mib. */ @@ -215,10 +240,24 @@ /* This constructs a "raw" MIB oid. 
*/ #define SYSCTL_OID(parent, nbr, name, kind, a1, a2, handler, fmt, descr) \ static struct sysctl_oid sysctl__##parent##_##name = { \ - &sysctl_##parent##_children, { 0 }, \ - nbr, kind, a1, a2, #name, handler, fmt, 0, __DESCR(descr) }; \ + &sysctl_##parent##_children, { 0 }, nbr, kind, \ + a1, a2, #name, handler, fmt, 0, __DESCR(descr), 0, 0 }; \ DATA_SET(sysctl_set, sysctl__##parent##_##name) +#ifdef VIMAGE +#define SYSCTL_V_OID(subs, mod, parent, nbr, name, kind, a1, a2, \ + handler, fmt, descr) \ + static struct sysctl_v_oid sysctl__##parent##_##name = { \ + &sysctl_##parent##_children, { 0 }, nbr, kind, \ + (void *) offsetof(struct mod, _##a1), a2, #name, \ + handler, fmt, 0, __DESCR(descr), subs, V_MOD_##mod }; \ + DATA_SET(sysctl_set, sysctl__##parent##_##name) +#else +#define SYSCTL_V_OID(subs, mod, parent, nbr, name, kind, a1, a2, \ + handler, fmt, descr) \ + SYSCTL_OID(parent, nbr, name, kind, &a1, a2, handler, fmt, descr) +#endif + #define SYSCTL_ADD_OID(ctx, parent, nbr, name, kind, a1, a2, handler, fmt, descr) \ sysctl_add_oid(ctx, parent, nbr, name, kind, a1, a2, handler, fmt, __DESCR(descr)) @@ -226,7 +265,7 @@ #define SYSCTL_NODE(parent, nbr, name, access, handler, descr) \ struct sysctl_oid_list SYSCTL_NODE_CHILDREN(parent, name); \ SYSCTL_OID(parent, nbr, name, CTLTYPE_NODE|(access), \ - (void*)&SYSCTL_NODE_CHILDREN(parent, name), 0, handler, \ + (void*)&SYSCTL_NODE_CHILDREN(parent, name), 0, handler, \ "N", descr) #define SYSCTL_ADD_NODE(ctx, parent, nbr, name, access, handler, descr) \ @@ -238,6 +277,16 @@ SYSCTL_OID(parent, nbr, name, CTLTYPE_STRING|(access), \ arg, len, sysctl_handle_string, "A", descr) +#ifdef VIMAGE +#define SYSCTL_V_STRING(subs, mod, parent, nbr, name, access, sym, len, descr) \ + SYSCTL_V_OID(subs, mod, parent, nbr, name, CTLTYPE_STRING|(access), \ + sym, len, sysctl_handle_v_string, "A", descr) +#else +#define SYSCTL_V_STRING(subs, mod, parent, nbr, name, access, sym, len, descr) \ + SYSCTL_OID(parent, nbr, name, 
CTLTYPE_STRING|(access), \ + &sym, len, sysctl_handle_string, "A", descr) +#endif + #define SYSCTL_ADD_STRING(ctx, parent, nbr, name, access, arg, len, descr) \ sysctl_add_oid(ctx, parent, nbr, name, CTLTYPE_STRING|(access), \ arg, len, sysctl_handle_string, "A", __DESCR(descr)) @@ -247,6 +296,16 @@ SYSCTL_OID(parent, nbr, name, CTLTYPE_INT|(access), \ ptr, val, sysctl_handle_int, "I", descr) +#ifdef VIMAGE +#define SYSCTL_V_INT(subs, mod, parent, nbr, name, access, sym, val, descr) \ + SYSCTL_V_OID(subs, mod, parent, nbr, name, CTLTYPE_INT|(access), \ + sym, val, sysctl_handle_v_int, "I", descr) +#else +#define SYSCTL_V_INT(subs, mod, parent, nbr, name, access, sym, val, descr) \ + SYSCTL_OID(parent, nbr, name, CTLTYPE_INT|(access), \ + &sym, val, sysctl_handle_int, "I", descr) +#endif + #define SYSCTL_ADD_INT(ctx, parent, nbr, name, access, ptr, val, descr) \ sysctl_add_oid(ctx, parent, nbr, name, CTLTYPE_INT|(access), \ ptr, val, sysctl_handle_int, "I", __DESCR(descr)) @@ -309,6 +368,20 @@ ptr, sizeof(struct type), sysctl_handle_opaque, \ "S," #type, descr) +#ifdef VIMAGE +#define SYSCTL_V_STRUCT(subs, mod, parent, nbr, name, access, sym, \ + type, descr) \ + SYSCTL_V_OID(subs, mod, parent, nbr, name, CTLTYPE_OPAQUE|(access), \ + sym, sizeof(struct type), sysctl_handle_v_opaque, \ + "S," #type, descr) +#else +#define SYSCTL_V_STRUCT(subs, mod, parent, nbr, name, access, sym, \ + type, descr) \ + SYSCTL_OID(parent, nbr, name, CTLTYPE_OPAQUE|(access), \ + &sym, sizeof(struct type), sysctl_handle_opaque, \ + "S," #type, descr) +#endif + #define SYSCTL_ADD_STRUCT(ctx, parent, nbr, name, access, ptr, type, descr) \ sysctl_add_oid(ctx, parent, nbr, name, CTLTYPE_OPAQUE|(access), \ ptr, sizeof(struct type), sysctl_handle_opaque, "S," #type, __DESCR(descr)) @@ -318,6 +391,18 @@ SYSCTL_OID(parent, nbr, name, (access), \ ptr, arg, handler, fmt, descr) +#ifdef VIMAGE +#define SYSCTL_V_PROC(subs, mod, parent, nbr, name, access, sym, arg, \ + handler, fmt, descr) \ + 
SYSCTL_V_OID(subs, mod, parent, nbr, name, (access), \ + sym, arg, handler, fmt, descr) +#else +#define SYSCTL_V_PROC(subs, mod, parent, nbr, name, access, sym, arg, \ + handler, fmt, descr) \ + SYSCTL_OID(parent, nbr, name, (access), \ + &sym, arg, handler, fmt, descr) +#endif + #define SYSCTL_ADD_PROC(ctx, parent, nbr, name, access, ptr, arg, handler, fmt, descr) \ sysctl_add_oid(ctx, parent, nbr, name, (access), \ ptr, arg, handler, fmt, __DESCR(descr)) @@ -329,6 +414,29 @@ #define FEATURE(name, desc) \ SYSCTL_INT(_kern_features, OID_AUTO, name, CTLFLAG_RD, 0, 1, desc) +/* + * Resolve void *arg1 in a proper virtualization container. + */ +#ifdef VIMAGE +#define SYSCTL_RESOLVE_V_ARG1() do { \ + char *cp; \ + switch (subs) { \ + case V_NET: \ + cp = (char *) TD_TO_VNET(curthread)->mod_data[mod]; \ + break; \ + case V_PROCG: \ + cp = (char *) TD_TO_VPROCG(curthread); \ + break; \ + case V_CPU: \ + cp = (char *) TD_TO_VCPU(curthread); \ + break; \ + default: \ + panic("unsupported module id %d", subs); \ + } \ + arg1 = cp + (size_t) arg1; \ +} while (0) +#endif + #endif /* _KERNEL */ /* --- /u/marko/p4/head/src/sys/sys/ucred.h 2007-12-27 19:33:32.000000000 +0100 +++ src/sys/sys/ucred.h 2008-01-14 19:24:10.000000000 +0100 @@ -35,6 +35,8 @@ #include +struct vimage; + /* * Credentials. * @@ -55,7 +57,9 @@ struct uidinfo *cr_uidinfo; /* per euid resource consumption */ struct uidinfo *cr_ruidinfo; /* per ruid resource consumption */ struct prison *cr_prison; /* jail(2) */ - void *cr_pspare[3]; /* vimage 2; general use 1 */ + struct vimage *cr_vimage; /* effective vimage */ + struct vimage *cr_rvimage; /* real vimage */ + void *cr_pspare[1]; /* vimage 2; general use 1 */ #define cr_endcopy cr_label struct label *cr_label; /* MAC label */ struct auditinfo_addr cr_audit; /* Audit properties. 
*/ --- /dev/null 2008-02-27 21:11:00.000000000 +0100 +++ src/sys/sys/vimage.h 2007-12-10 11:26:17.000000000 +0100 @@ -0,0 +1,502 @@ +/*- + * Copyright (c) 2006 University of Zagreb + * Copyright (c) 2006 FreeBSD Foundation + * + * This software was developed by the University of Zagreb and the + * FreeBSD Foundation under sponsorship by the Stichting NLnet and the + * FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * XXX RCS tag goes here + */ + +#ifndef _NET_VIMAGE_H_ +#define _NET_VIMAGE_H_ + +#include +#include +#include +#include + +#ifdef INVARIANTS +#define VNET_DEBUG +#endif + +struct vimage; +struct vprocg; +struct vnet; +struct vi_req; +struct kld_sym_lookup; + +struct ifnet; /* XXX must go away */ + +#define curvnet curthread->td_vnet + +typedef int vnet_attach_fn(const void *); +typedef int vnet_detach_fn(const void *); + + +#ifdef VIMAGE + +#define V_GLOBAL 0 +#define V_NET 1 +#define V_PROCG 2 +#define V_CPU 3 + +#define VNET_MOD_NONE -1 +/*statefull modules */ +#define VNET_MOD_NET 0 +#define VNET_MOD_NETGRAPH 1 +#define VNET_MOD_INET 2 +#define VNET_MOD_INET6 3 +#define VNET_MOD_IPSEC 4 +#define VNET_MOD_IPFW 5 +#define VNET_MOD_DUMMYNET 6 +#define VNET_MOD_PF 7 +#define VNET_MOD_ALTQ 8 +#define VNET_MOD_IPX 9 +#define VNET_MOD_ATALK 10 +/* stateless modules */ +#define VNET_MOD_NG_WORMHOLE 19 +#define VNET_MOD_NG_ETHER 20 +#define VNET_MOD_NG_IFACE 21 +#define VNET_MOD_NG_EIFACE 22 +#define VNET_MOD_ESP 23 +#define VNET_MOD_IPIP 24 +#define VNET_MOD_AH 25 +#define VNET_MOD_IPCOMP 26 +#define VNET_MOD_GIF 27 +#define VNET_MOD_ARP 28 +#define VNET_MOD_RTABLE 29 +#define VNET_MOD_LOIF 30 +#define VNET_MOD_DOMAIN 31 +#define VNET_MOD_DYNAMIC_START 32 +#define VNET_MOD_MAX 64 + +/* Needed for ugly sysctl virtualization macros */ +#define V_MOD_vnet_net VNET_MOD_NET +#define V_MOD_vnet_netgraph VNET_MOD_NETGRAPH +#define V_MOD_vnet_inet VNET_MOD_INET +#define V_MOD_vnet_inet6 VNET_MOD_INET6 +#define V_MOD_vnet_ipfw VNET_MOD_IPFW +#define V_MOD_vnet_pf VNET_MOD_PF +#define V_MOD_vnet_gif VNET_MOD_GIF +#define V_MOD_vnet_ipsec VNET_MOD_IPSEC + +#define V_MOD_vprocg 0 +#define V_MOD_vcpu 0 + +struct vnet { + void *mod_data[VNET_MOD_MAX]; + + u_int vnet_ref; /* reference count */ + LIST_ENTRY(vnet) vnet_le; /* all vnets list */ + u_int vnet_id; /* ID num */ + + u_int ifccnt; + u_int sockcnt; + + u_int vnet_magic_n; +}; + +struct vnet_symmap { + char *name; + 
size_t offset; + size_t size; +}; + +struct vnet_modinfo { + u_int vmi_id; + u_int vmi_dependson; + char *vmi_name; + vnet_attach_fn *vmi_iattach; + vnet_detach_fn *vmi_idetach; + size_t vmi_struct_size; + struct vnet_symmap *vmi_symmap; +}; + +struct vnet_modlink { + TAILQ_ENTRY(vnet_modlink) vml_mod_le; + const struct vnet_modinfo *vml_modinfo; + const void *vml_iarg; + const char *vml_iname; +}; + +#define VNET_MOD_DECLARE(m_name_uc, m_name_lc, m_iattach, m_idetach, \ + m_dependson, m_symmap) \ + static const struct vnet_modinfo vnet_##m_name_lc##_modinfo = { \ + .vmi_id = VNET_MOD_##m_name_uc, \ + .vmi_dependson = VNET_MOD_##m_dependson, \ + .vmi_name = #m_name_lc, \ + .vmi_iattach = m_iattach, \ + .vmi_idetach = m_idetach, \ + .vmi_struct_size = \ + sizeof(struct vnet_##m_name_lc), \ + .vmi_symmap = m_symmap \ +}; + +#define VNET_MOD_DECLARE_STATELESS(m_name_uc, m_name_lc, m_iattach, m_idetach, \ + m_dependson) \ + static const struct vnet_modinfo vnet_##m_name_lc##_modinfo = { \ + .vmi_id = VNET_MOD_##m_name_uc, \ + .vmi_dependson = VNET_MOD_##m_dependson, \ + .vmi_name = #m_name_lc, \ + .vmi_iattach = m_iattach, \ + .vmi_idetach = m_idetach \ +}; + +#define VSYM(base, sym) ((base)->_##sym) + +#define VNET_SYMMAP(mod, name) \ + { #name, offsetof(struct vnet_##mod, _##name), \ + sizeof(((struct vnet_##mod *) curthread)->_##name) } + +#define VNET_SYMMAP_END \ + { NULL, 0 } + +#define VNET_MAGIC_N 0x3e0d8f29 + + +#ifdef VNET_DEBUG + +#define VNET_ASSERT(condition) \ + if (!(condition)) { \ + printf("VNET_ASSERT @ %s:%d %s():\n", \ + __FILE__, __LINE__, __FUNCTION__); \ + panic(#condition); \ + } + +#define CURVNET_SET_QUIET(arg) \ + VNET_ASSERT((arg)->vnet_magic_n == VNET_MAGIC_N); \ + struct vnet *saved_vnet = curvnet; \ + const char *saved_vnet_lpush = curthread->td_vnet_lpush; \ + curvnet = arg; \ + curthread->td_vnet_lpush = __FUNCTION__; + +#define CURVNET_SET_VERBOSE(arg) \ + CURVNET_SET_QUIET(arg) \ + if (saved_vnet) \ + printf("curvnet_set(%p) in %s() 
on cpu %d, prev %p in %s()\n", \ + curvnet, curthread->td_vnet_lpush, curcpu, \ + saved_vnet, saved_vnet_lpush); + +#define CURVNET_SET(arg) CURVNET_SET_VERBOSE(arg) + +#define CURVNET_RESTORE() \ + VNET_ASSERT(saved_vnet == NULL || \ + saved_vnet->vnet_magic_n == VNET_MAGIC_N); \ + curvnet = saved_vnet; \ + curthread->td_vnet_lpush = saved_vnet_lpush; + +#define INIT_FROM_VNET(vnet, modindex, modtype, sym) \ + if (vnet != curvnet) \ + panic("in %s:%d %s()\n vnet=%p curvnet=%p", \ + __FILE__, __LINE__, __FUNCTION__, \ + vnet, curvnet); \ + modtype *sym = (vnet)->mod_data[modindex]; + +#define VNET_ITERLOOP_BEGIN() \ + struct vnet *vnet_iter; \ + VNET_LIST_REF(); \ + LIST_FOREACH(vnet_iter, &vnet_head, vnet_le) { \ + CURVNET_SET(vnet_iter); + +#define VNET_ITERLOOP_BEGIN_QUIET() \ + struct vnet *vnet_iter; \ + VNET_LIST_REF(); \ + LIST_FOREACH(vnet_iter, &vnet_head, vnet_le) { \ + CURVNET_SET_QUIET(vnet_iter); + +#define VNET_ITERLOOP_END() \ + CURVNET_RESTORE(); \ + } \ + VNET_LIST_UNREF(); + +#else /* !VNET_DEBUG */ + +#define VNET_ASSERT(condition) + +#define CURVNET_SET(arg) \ + struct vnet *saved_vnet = curvnet; \ + curvnet = arg; + +#define CURVNET_SET_VERBOSE(arg) CURVNET_SET(arg) +#define CURVNET_SET_QUIET(arg) CURVNET_SET(arg) + +#define CURVNET_RESTORE() \ + curvnet = saved_vnet; + +#define INIT_FROM_VNET(vnet, modindex, modtype, sym) \ + modtype *sym = (vnet)->mod_data[modindex]; + +#define VNET_ITERLOOP_BEGIN() \ + struct vnet *vnet_iter; \ + VNET_LIST_REF(); \ + LIST_FOREACH(vnet_iter, &vnet_head, vnet_le) { \ + CURVNET_SET(vnet_iter); + +#define VNET_ITERLOOP_BEGIN_QUIET() VNET_ITERLOOP_BEGIN() + +#define VNET_ITERLOOP_END() \ + CURVNET_RESTORE(); \ + } \ + VNET_LIST_UNREF(); + +#endif /* !VNET_DEBUG */ + +#define INIT_VPROCG(arg) struct vprocg *vprocg = (arg); + +#define VPROCG_ITERLOOP_BEGIN() \ + struct vprocg *vprocg_iter; \ + LIST_FOREACH(vprocg_iter, &vprocg_head, vprocg_le) { \ + +#define VPROCG_ITERLOOP_END() \ + } \ + +#define INIT_VCPU(arg) 
struct vcpu *vcpu = (arg); + +#define TD_TO_VIMAGE(td) (td)->td_ucred->cr_vimage +#define TD_TO_VNET(td) (td)->td_ucred->cr_vimage->v_net +#define TD_TO_VPROCG(td) (td)->td_ucred->cr_vimage->v_procg +#define TD_TO_VCPU(td) (td)->td_ucred->cr_vimage->v_cpu +#define P_TO_VIMAGE(p) (p)->p_ucred->cr_vimage +#define P_TO_VNET(p) (p)->p_ucred->cr_vimage->v_net +#define P_TO_VPROCG(p) (p)->p_ucred->cr_vimage->v_procg +#define P_TO_VCPU(p) (p)->p_ucred->cr_vimage->v_cpu + +#else /* !VIMAGE */ + +/* Non-VIMAGE null-macros */ +#define VNET_MOD_DECLARE(m_name_uc, m_name_lc, m_iattach, m_idetach, \ + m_dependson, m_symmap) +#define VNET_MOD_DECLARE_STATELESS(m_name_uc, m_name_lc, m_iattach, m_idetach, \ + m_dependson) +#define CURVNET_SET(arg) +#define CURVNET_SET_QUIET(arg) +#define CURVNET_RESTORE() +#define VNET_ASSERT(condition) +#define VSYM(base, sym) (sym) +#define INIT_FROM_VNET(vnet, modindex, modtype, sym) +#define VNET_ITERLOOP_BEGIN() +#define VNET_ITERLOOP_BEGIN_QUIET() +#define VNET_ITERLOOP_END() +#define INIT_VPROCG(arg) +#define VPROCG_ITERLOOP_BEGIN() +#define VPROCG_ITERLOOP_END() +#define INIT_VCPU(arg) +#define TD_TO_VIMAGE(td) +#define TD_TO_VNET(td) +#define TD_TO_VPROCG(td) +#define TD_TO_VCPU(td) +#define P_TO_VIMAGE(p) +#define P_TO_VNET(p) +#define P_TO_VPROCG(p) +#define P_TO_VCPU(p) + +#endif /* !VIMAGE */ + +/* XXX those defines bellow should probably go into vprocg.h and vcpu.h */ +#define VPROCG(sym) VSYM(vprocg, sym) +#define VCPU(sym) VSYM(vcpu, sym) + +#define V_cp_time VPROCG(cp_time) +#define V_hostname VPROCG(hostname) +#define V_domainname VPROCG(domainname) +#define V_morphing_symlinks VPROCG(morphing_symlinks) +#define V_averunnable VPROCG(averunnable) +#define V_sched_tdcnt VPROCG(sched_tdcnt) +#define V_tdq_sysload VPROCG(tdq_sysload) + +#define V_acc_statcalls VCPU(acc_statcalls) +#define V_avg1_fixp VCPU(avg1_fixp) +#define V_avg2_fixp VCPU(avg2_fixp) + +#ifdef VIMAGE +void vnet_mod_register(const struct vnet_modinfo *); +void 
vnet_mod_deregister(const struct vnet_modinfo *); +void vnet_mod_register_multi(const struct vnet_modinfo *, const void *, + const char *); +void vnet_mod_deregister_multi(const struct vnet_modinfo *, const void *, + const char *); + +void printcpuinfo(struct vprocg *); +void vi_cpu_acct(void *); +int vi_td_ioctl(u_long, struct vi_req *, struct thread *); +int vi_if_move(struct vi_req *, struct ifnet *, struct vimage *); +void if_reassign_common(struct ifnet *, struct vnet *, const char *); + +int vi_symlookup(struct kld_sym_lookup *, char *); +struct vimage *vnet2vimage(struct vnet *); +struct vimage *vimage_by_name(struct vimage *, char *); +char *vnet_name(struct vnet *); +int vi_child_of(struct vimage *, struct vimage *); + +LIST_HEAD(vimage_list_head, vimage); +extern struct vimage_list_head vimage_head; + +LIST_HEAD(vprocg_list_head, vprocg); +extern struct vprocg_list_head vprocg_head; + +LIST_HEAD(vcpu_list_head, vcpu); +extern struct vcpu_list_head vcpu_head; + +LIST_HEAD(vnet_list_head, vnet); +extern struct vnet_list_head vnet_head; +extern int vnet_list_refc; +extern struct mtx vnet_list_refc_mtx; +extern struct cv vnet_list_condvar; +extern struct mtx vcpu_list_mtx; + +#define VNET_LIST_REF() \ + mtx_lock(&vnet_list_refc_mtx); \ + vnet_list_refc++; \ + mtx_unlock(&vnet_list_refc_mtx); + +#define VNET_LIST_UNREF() \ + mtx_lock(&vnet_list_refc_mtx); \ + vnet_list_refc--; \ + mtx_unlock(&vnet_list_refc_mtx); \ + cv_signal(&vnet_list_condvar); + +#define IS_DEFAULT_VIMAGE(arg) ((arg)->vi_id == 0) +#define IS_DEFAULT_VNET(arg) ((arg)->vnet_id == 0) + +struct vimage { + LIST_ENTRY(vimage) vi_le; /* all vimage list */ + LIST_ENTRY(vimage) vi_sibling; /* vimages with same parent */ + LIST_HEAD(, vimage) vi_child_head; /* direct offspring list */ + struct vimage *vi_parent; /* ptr to parent vimage */ + u_int vi_id; /* ID num */ + u_int vi_ucredrefc; /* refc of ucreds pointing to us */ + + char vi_name[MAXHOSTNAMELEN]; /* assigned by parent */ + + struct vprocg 
*v_procg; + struct vcpu *v_cpu; + struct vnet *v_net; +}; + +struct vprocg { + LIST_ENTRY(vprocg) vprocg_le; + u_int vprocg_ref; /* reference count */ + u_int vprocg_id; /* ID num */ + + u_int nprocs; + + long _cp_time[CPUSTATES]; + + char _hostname[MAXHOSTNAMELEN]; + char _domainname[MAXHOSTNAMELEN]; + + int _morphing_symlinks; + + struct loadavg _averunnable; /* from kern/kern_synch.c */ + int _sched_tdcnt; /* from kern/sched_4bsd.c */ + int _tdq_sysload[32]; /* XXX MAXCPUS from kern/sched_ule.c (SMP) */ + +#if 0 + u_int proc_limit; /* max. number of processes */ + + struct msgbuf *msgbufp; + int msgbuftrigger; + int msg_seqn; + + pid_t log_open_pid; + int log_sc_state; + struct selinfo log_sc_selp; + struct sigio *log_sc_sigio; + struct callout log_sc_callout; + + struct timeval boottime; + long boottdelta_sec; + + char chroot[MAXPATHLEN]; /* assigned/inherited from parent */ + + int big_brother; /* manage procs in all child vprocgs */ +#endif +}; + +struct vcpu { + LIST_ENTRY(vcpu) vcpu_le; + u_int vcpu_ref; /* reference count */ + u_int vcpu_id; /* ID num */ + + u_int _acc_statcalls; /* statclocks since last avg update*/ + u_int _avg1_fixp; /* "fast" avg in 16:16 bit fixedpoint */ + u_int _avg2_fixp; /* "slow" avg in 16:16 bit fixedpoint */ + +#if 0 + u_int cpu_min; /* Guaranteed CPU share */ + u_int cpu_max; /* Maximum average CPU usage */ + u_int intr_limit; /* Limit on CPU usage in intr ctx */ + u_int cpu_weight; /* Prop. share scheduling priority */ + u_int cpu_elimit; /* Dynamic soft CPU usage limit */ + u_int schedstamp; /* Prop. share scheduler tmp var */ + u_int forcedmin; /* Prop. share scheduler tmp var */ +#endif +}; + +#endif /* VIMAGE */ + +struct vi_req { + int req_action; /* What to do with this reqest? */ + u_int vi_cpu_min; /* Guaranteed CPU share */ + u_int vi_cpu_max; /* Maximum average CPU usage */ + u_int vi_cpu_weight; /* Prop. 
share scheduling priority */ + int vi_intr_limit; /* Limit on CPU usage in intr ctx */ + int vi_maxsockets; + u_short vi_proc_limit; /* max. number of processes */ + u_short vi_proc_count; /* current number of processes */ + u_short vi_child_limit; /* max. number of child vnets */ + u_short vi_child_count; /* current number of child vnets */ + int vi_if_count; /* current number network interfaces */ + int vi_sock_count; + char vi_name[MAXPATHLEN]; + char vi_chroot[MAXPATHLEN]; + char vi_if_xname[MAXPATHLEN]; /* XXX should be IFNAMSIZ */ + u_int cp_time_avg; + struct loadavg averunnable; +}; + +#define VI_CREATE 0x00000001 +#define VI_DESTROY 0x00000002 +#define VI_MODIFY 0x00000004 +#define VI_SWITCHTO 0x00000008 +#define VI_IFACE 0x00000010 + +#define VI_GET 0x00000100 +#define VI_GETNEXT 0x00000200 +#define VI_GETNEXT_RECURSE 0x00000300 + +#define VI_SET_CPU_MIN 0x00001000 +#define VI_SET_CPU_MAX 0x00002000 +#define VI_SET_CPU_WEIGHT 0x00004000 +#define VI_SET_INTR_LIMIT 0x00008000 +#define VI_SET_PROC_LIMIT 0x00010000 +#define VI_SET_CHILD_LIMIT 0x00020000 +#define VI_SET_SOCK_LIMIT 0x00040000 +#define VI_SET_NAME 0x00100000 +#define VI_SET_CHROOT 0x00200000 + + +#endif /* _NET_VIMAGE_H_ */ --- /u/marko/p4/head/src/sys/vm/vm_meter.c 2007-08-31 03:48:40.000000000 +0200 +++ src/sys/vm/vm_meter.c 2007-10-22 18:07:07.000000000 +0200 @@ -32,6 +32,8 @@ #include __FBSDID("$FreeBSD: src/sys/vm/vm_meter.c,v 1.96 2007/07/27 20:01:21 alc Exp $"); +#include "opt_vimage.h" + #include #include #include @@ -51,6 +53,7 @@ #include #include #include +#include struct vmmeter cnt; @@ -76,18 +79,20 @@ static int sysctl_vm_loadavg(SYSCTL_HANDLER_ARGS) { + INIT_VPROCG(TD_TO_VPROCG(curthread)); + #ifdef SCTL_MASK32 u_int32_t la[4]; if (req->flags & SCTL_MASK32) { - la[0] = averunnable.ldavg[0]; - la[1] = averunnable.ldavg[1]; - la[2] = averunnable.ldavg[2]; - la[3] = averunnable.fscale; + la[0] = V_averunnable.ldavg[0]; + la[1] = V_averunnable.ldavg[1]; + la[2] = 
V_averunnable.ldavg[2]; + la[3] = V_averunnable.fscale; return SYSCTL_OUT(req, la, sizeof(la)); } else #endif - return SYSCTL_OUT(req, &averunnable, sizeof(averunnable)); + return SYSCTL_OUT(req, &V_averunnable, sizeof(V_averunnable)); } SYSCTL_PROC(_vm, VM_LOADAVG, loadavg, CTLTYPE_STRUCT|CTLFLAG_RD, NULL, 0, sysctl_vm_loadavg, "S,loadavg", "Machine loadaverage history"); --- /dev/null 2008-02-27 21:11:00.000000000 +0100 +++ src/usr.sbin/vimage/Makefile 2007-10-05 12:28:05.000000000 +0200 @@ -0,0 +1,13 @@ +# RCS ID + +PROG= vimage +CFLAGS+= -I/sys + +WARNS?= 2 + +MAN= vimage.8 + +.include + +BINDIR?= /sbin +NOSHARED?= YES --- /dev/null 2008-02-27 21:11:00.000000000 +0100 +++ src/usr.sbin/vimage/vimage.8 2007-10-05 12:28:05.000000000 +0200 @@ -0,0 +1,252 @@ +.\" Copyright (c) 2002, 2003 Marko Zec +.\" +.\" All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" +.\" @(#)vimage.8 1.01 (M. Zec) 2003/09/06 +.\" +.Dd November 3, 2003 +.Dt VIMAGE 8 +.Os +.Sh NAME +.Nm vimage +.Nd manage the FreeBSD virtual image facility +.Sh SYNOPSIS +.Nm +.Nm +.Ar vi_name +.Op command +.Nm +.Brq Fl c | m +.Ar vi_name +.Op options +.Nm +.Fl d +.Ar vi_name +.Nm +.Fl l +.Op Ar vi_name +.Nm +.Fl i +.Ar vi_name interface +.Op target_interface +.Sh DESCRIPTION +.Nm +command is the user interface for controlling the virtual image facility +in FreeBSD. +.Ss Overview +Each virtual image presents an isolated operating environment with its own +private view of vital system resources, most notably user processes, +CPU time share and an independent network stack instance. +Accordingly, every process and every network interface present +in the system is always assigned +to a single and unique virtual image. During the system bootup sequence +the +.So default +.Sc virtual image is created to which all the configured +interfaces and user processes are initially assigned. +Assuming that enough system resources +and per virtual image privileges are provided, the super-user can create and +manage a hierarchy of subordinated virtual images. The +.Nm +command allows +creation, deletion, modification and monitoring of virtual images, as well as +execution of arbitrary processes in the target virtual image. 
+.Ss Invocation +With no arguments provided, the +.Nm +command returns the name of the current virtual image +on the standard output and exits. +.Pp +If invoked with no modifiers, the +.Nm +command spawns a new process in virtual +image +.Ar vi_name . +If provided, the optional arguments following the virtual image name +.Ar vi_name +are executed as a standard command line, otherwise an interactive +shell is started in the target virtual image. +.Pp +The following parameters are available: +.Bl -tag -width indent +.It Fl c +Create a new virtual image named +.So +.Ar vi_name +.Sc . +If additional arguments are present following the +.Ar vi_name +parameter, they are interpreted as custom options (see below). +.It Fl m +Modify the custom options of the existing virtual image +.Ar vi_name , +in accordance with the additional arguments following the +.Ar vi_name +parameter (see below). +.It Fl d +Delete the virtual image +.Ar vi_name . +No processes should exist in the target virtual image, in order for +deletion to succeed. Non-loopback interfaces residing in the target +virtual image will be reassigned to the virtual image's parent. +.It Fl l +List the properties, custom parameters and statistics for virtual +images below the current one in the hierarchy. If an optional argument +.Ar vi_name +is provided, only the information regarding the target virtual image +.Ar vi_name +is displayed. +.It Fl i +Move the interface +.Ar interface +to the target virtual image +.Ar vi_name . +If the value of +.Ar vi_name +argument is +.So - +.Sc , +the interface is returned to the parent of the current virtual image. +.El +.Pp +The following options to +.Fl c +and +.Fl m +modifiers are available: +.Bl -tag -width indent +.It Cm cpumin +Set the minimum guaranteed average CPU share for the target virtual image. +The parameter is specified as a percentage in the range between 0 and 90. +The guaranteed CPU share for the +.So default +.Sc virtual image cannot be set below 10%.
+Note that the system does not enforce strict global budgeting on guaranteed +CPU time shares. Therefore it is the sole responsibility of the system +administrator whether he/she will allow for guaranteed CPU shares to be +oversubscribed or not. By default no virtual image is granted a guaranteed +CPU share, except the +.So default +.Sc virtual image, which normally runs with +.Cm cpumin +level of 10%. +.It Cm cpumax +Set the upper limit to average total CPU usage for the target virtual image. +The limit is specified as a percentage (1-100%). However, the limit cannot +be raised above the current upper CPU limit of the parent virtual image. +By default there is no CPU usage limit (100%). +.It Cm cpuweight +If the current average CPU usage of a virtual image is above the +.Cm cpumin +level, but below the +.Cm cpumax , +the virtual image becomes subject to a proportional share CPU scheduler. +The +.Cm cpuweight +parameter determines how the virtual image will compete for the available +CPU time. The higher the +.Cm cpuweight , +the less often the virtual image will be allocated a CPU time slice. +Valid parameter values range from 1 (default) to 10. +.It Cm proc +Set the maximum number of processes that are allowed to exist simultaneously +in the target virtual image. The default is 0, which means no process limit. +.It Cm chroot +Set the chroot directory for the virtual image. All new processes spawned +into the target virtual image using the +.Nm +command will be initially chrooted to that directory. This parameter can +be changed only when no processes are running within the target virtual +image. Note that it is not required to have a chrooted environment for +a virtual image to operate, which is also the default behavior. +.It Cm child +Limit the number of children the target virtual image is allowed to create. +The limit cannot be raised above the lowest child limit of all the ancestors +of the target virtual image.
By default all created virtual images are +prohibited from creating new virtual images, except the +.So default +.Sc virtual image. +.El +.Sh EXAMPLES +Create a new virtual image named +.So v1 +.Sc with average CPU usage limited to 20%: +.Pp +.Dl vimage -c v1 cpumax 20% +.Pp +Execute the +.So ifconfig +.Sc command in the virtual image +.So v1 +.Sc : +.Pp +.Dl vimage v1 ifconfig +.Pp +Move the interface +.So vlan0 +.Sc to the virtual image +.So v1 +.Sc : +.Pp +.Dl vimage -i v1 vlan0 +.Pp +Show the status information for virtual image +.So v1 +.Sc : +.Pp +.Dl vimage -l v1 +.Sh DIAGNOSTICS +The +.Nm +command exits 0 on success, and >0 if an error occurs. +.Sh SEE ALSO +.Xr jail 8 +.Sh BUGS +If memory allocation failure occurs during the vimage creation, it will remain +undetected/ignored in the current implementation, thus latently scheduling +an almost imminent system crash in the future. +.Pp +The current (experimental) implementation provides support for only IPv4 +protocol, though many features are not included, such as IPSEC or IPF. +IPv6, IPX, AppleTalk, XNS and OSI/ISO protocols are not yet supported. +.Pp +.Xr netgraph 4 +naming has to be extended to reflect virtual image association of netgraph +nodes and interfaces. +.Pp +No testing has been performed on SMP systems. There is absolutely no guarantee +that the kernel will even compile with SMP options enabled. +.Pp +At the time of writing this document the code is still in highly experimental +phase, so one should expect to encounter numerous undocumented problems. +The author will welcome and appreciate all (decently documented) bugreports. +You can check for updated versions of the vimage framework at +http://www.tel.fer.hr/zec/BSD/vimage/ +.Sh AUTHOR +.An "Marko Zec" Aq zec@tel.fer.hr +.Sh HISTORY +The +.Nm +facility first appeared as a FreeBSD 4.7-RELEASE patch. 
--- /dev/null 2008-02-27 21:11:00.000000000 +0100 +++ src/usr.sbin/vimage/vimage.c 2007-11-01 22:50:12.000000000 +0100 @@ -0,0 +1,300 @@ +/* + * Copyright (c) 2002, 2003, 2004 Marko Zec + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + + +int main __P((int, char *[])); +void vi_print(struct vi_req *); + + +void vi_print(struct vi_req *vi_req) +{ + double lf = 1.0/vi_req->averunnable.fscale; + + printf("\"%s\":\n", vi_req->vi_name); + printf(" Processes (cur/max): %d/%d;", + vi_req->vi_proc_count, vi_req->vi_proc_limit); + printf(" load averages: %3.2f, %3.2f, %3.2f\n", + lf * vi_req->averunnable.ldavg[0], + lf * vi_req->averunnable.ldavg[1], + lf * vi_req->averunnable.ldavg[2]); + + printf(" CPU usage: %3.2f%%\n", vi_req->cp_time_avg / 655.04); + + printf(" Sockets (cur/max): %d/%d;", vi_req->vi_sock_count, + vi_req->vi_maxsockets); + printf(" %d network interfaces\n", vi_req->vi_if_count); + +#if 0 + printf(" CPU limits: min %3.2f%%, ", 0.0001 * vi_req->vi_cpu_min); + if (vi_req->vi_cpu_max == 0) + vi_req->vi_cpu_max = 1000000; + printf("max %3.2f%%, ", 0.0001 * vi_req->vi_cpu_max); + printf("weight %d, ", vi_req->vi_cpu_weight); + if (vi_req->vi_intr_limit) + printf("intr limit: %3.2f%%\n", + 0.0001 * vi_req->vi_intr_limit); + else + printf("no intr limit\n"); + + if (vi_req->vi_child_limit) + printf(" child limit: %d\n", vi_req->vi_child_limit); + if (vi_req->vi_child_count) + printf(" %d child vimages\n", vi_req->vi_child_count); + if (vi_req->vi_chroot[0]) + printf(" Chroot dir: %s\n", vi_req->vi_chroot); +#endif +} + + +/* + * The command syntax and argument parser are both uggly, as they have been + * "stiched" together on the fly, but they fullfil their current experimental + * purpose. The whole code should be rewritten properly one day... 
+ */
+
+/*
+ * Copy a command-line argument into a fixed-size vi_req field.
+ *
+ * dst/dstsize describe the destination buffer, src is the argument and
+ * what is a short description used in the diagnostic.  Exits with status 1
+ * instead of overflowing or silently truncating when src (plus its
+ * terminating NUL) does not fit, so the request handed to the ioctl always
+ * carries NUL-terminated strings.
+ */
+static void
+vi_copy_arg(char *dst, size_t dstsize, const char *src, const char *what)
+{
+
+	if (strlcpy(dst, src, dstsize) >= dstsize) {
+		fprintf(stderr, "error: %s too long\n", what);
+		exit(1);
+	}
+}
+
+/*
+ * Entry point: decode the command line into a struct vi_req and submit it
+ * through the vimage ioctls on a throw-away datagram socket.
+ *
+ *   (no args)                      print the name of the current vimage
+ *   -l | -lr                       list vimages starting at the current one
+ *   -l name                        show one vimage
+ *   -c | -m name [option value]... create / modify a vimage
+ *   -d name                        destroy a vimage
+ *   -i name ifc [target_ifc]       move an interface
+ *   name [command [args]]          switch to a vimage and exec a command or
+ *                                  a shell
+ *
+ * Exits 0 on success; on failure reports the error and exits 1.
+ */
+int
+main(int argc, char *argv[])
+{
+	int s, i;
+	char *shell;
+	int cmd = VI_SWITCHTO;
+	struct vi_req vi_req;
+
+	s = socket(AF_INET, SOCK_DGRAM, 0);
+	if (s == -1)
+		goto abort;
+
+	bzero(&vi_req, sizeof(vi_req));
+	if (argc == 1) {
+		strcpy(vi_req.vi_name, ".");
+		cmd = VI_GET;
+	}
+
+	if (argc == 2 && strcmp(argv[1], "-l") == 0) {
+		strcpy(vi_req.vi_name, ".");
+		cmd = VI_GETNEXT;
+	}
+
+	if (argc == 2 && strcmp(argv[1], "-lr") == 0) {
+		strcpy(vi_req.vi_name, ".");
+		cmd = VI_GETNEXT_RECURSE;
+	}
+
+	if (argc >= 3) {
+		/* -l and -d take exactly one argument: the vimage name */
+		if (argc == 3 && strcmp(argv[1], "-l") == 0)
+			cmd = VI_GET;
+		if (argc == 3 && strcmp(argv[1], "-d") == 0)
+			cmd = VI_DESTROY;
+		if (strcmp(argv[1], "-c") == 0)
+			cmd = VI_CREATE;
+		if (strcmp(argv[1], "-m") == 0)
+			cmd = VI_MODIFY;
+		if (strcmp(argv[1], "-i") == 0)
+			cmd = VI_IFACE;
+		/*
+		 * argv[2] names the target vimage only when a flag was
+		 * recognized.  Otherwise it is the command to execute and
+		 * vi_name is taken from argv[1] in the VI_SWITCHTO case.
+		 */
+		if (cmd != VI_SWITCHTO)
+			vi_copy_arg(vi_req.vi_name, sizeof(vi_req.vi_name),
+			    argv[2], "vimage name");
+	}
+
+	vi_req.req_action = cmd;
+	switch (cmd) {
+
+	case VI_GET:
+		if (ioctl(s, SIOCGPVIMAGE, (caddr_t)&vi_req) < 0)
+			goto abort;
+		if (argc == 1)
+			printf("%s\n", vi_req.vi_name);
+		else
+			vi_print(&vi_req);
+		exit(0);
+
+	case VI_GETNEXT:
+	case VI_GETNEXT_RECURSE:
+		vi_req.req_action = VI_GET;
+		if (ioctl(s, SIOCGPVIMAGE, (caddr_t)&vi_req) < 0)
+			goto abort;
+		vi_print(&vi_req);
+		/*
+		 * The first step always uses VI_GETNEXT_RECURSE; every
+		 * following step uses the action requested on the command
+		 * line.
+		 */
+		vi_req.req_action = VI_GETNEXT_RECURSE;
+		while (ioctl(s, SIOCGPVIMAGE, (caddr_t)&vi_req) == 0) {
+			vi_print(&vi_req);
+			vi_req.req_action = cmd;
+		}
+		exit(0);
+
+	case VI_IFACE:
+		/* argv[3] is mandatory: without it there is no interface */
+		if (argc < 4) {
+			fprintf(stderr,
+			    "usage: %s -i vi_name interface [target_interface]\n",
+			    argv[0]);
+			exit(1);
+		}
+		/* here vi_chroot stores the current ifc name */
+		vi_copy_arg(vi_req.vi_chroot, sizeof(vi_req.vi_chroot),
+		    argv[3], "interface name");
+		if (argc >= 5)
+			vi_copy_arg(vi_req.vi_if_xname,
+			    sizeof(vi_req.vi_if_xname), argv[4],
+			    "target interface name");
+		else
+			vi_req.vi_if_xname[0] = 0;
+		if (ioctl(s, SIOCSIFVIMAGE, (caddr_t)&vi_req) < 0)
+			goto abort;
+		printf("%s@%s\n", vi_req.vi_chroot, vi_req.vi_name);
+		exit(0);
+
+	case VI_CREATE:
+	case VI_MODIFY:
+		/* options come as "keyword value" pairs after the name */
+		for (i = 3; i < argc-1; i += 2) {
+			if (strcmp(argv[i], "maxsockets") == 0) {
+				vi_req.req_action |= VI_SET_SOCK_LIMIT;
+				vi_req.vi_maxsockets = strtod(argv[i+1], NULL);
+			}
+			/* CPU shares are passed as percent * 10000 */
+			if (strcmp(argv[i], "cpumin") == 0) {
+				vi_req.req_action |= VI_SET_CPU_MIN;
+				vi_req.vi_cpu_min =
+				    strtod(argv[i+1], NULL) * 10000;
+				if (vi_req.vi_cpu_min > 900000) {
+					fprintf(stderr, "error: cpumin must be between 0 and 90\n");
+					exit(1);
+				}
+			}
+			if (strcmp(argv[i], "cpumax") == 0) {
+				vi_req.req_action |= VI_SET_CPU_MAX;
+				vi_req.vi_cpu_max =
+				    strtod(argv[i+1], NULL) * 10000;
+				if (vi_req.vi_cpu_max < 10000 ||
+				    vi_req.vi_cpu_max > 1000000) {
+					fprintf(stderr, "error: cpumax must be between 1 and 100\n");
+					exit(1);
+				}
+			}
+			if (strcmp(argv[i], "cpuweight") == 0) {
+				vi_req.req_action |= VI_SET_CPU_WEIGHT;
+				vi_req.vi_cpu_weight = strtod(argv[i+1], NULL);
+				if (vi_req.vi_cpu_weight < 1 ||
+				    vi_req.vi_cpu_weight > 10) {
+					fprintf(stderr, "error: cpuweight must be between 1 and 10\n");
+					exit(1);
+				}
+			}
+			if (strcmp(argv[i], "intr") == 0) {
+				vi_req.req_action |= VI_SET_INTR_LIMIT;
+				vi_req.vi_intr_limit =
+				    strtod(argv[i+1], NULL) * 10000;
+				if (vi_req.vi_intr_limit < 10000 ||
+				    vi_req.vi_intr_limit > 1000000) {
+					fprintf(stderr, "error: intr limit must be between 1 and 100\n");
+					exit(1);
+				}
+			}
+			if (strcmp(argv[i], "child") == 0) {
+				vi_req.req_action |= VI_SET_CHILD_LIMIT;
+				vi_req.vi_child_limit = atoi(argv[i+1]);
+			}
+			if (strcmp(argv[i], "proc") == 0) {
+				vi_req.req_action |= VI_SET_PROC_LIMIT;
+				vi_req.vi_proc_limit = atoi(argv[i+1]);
+			}
+			if (strcmp(argv[i], "chroot") == 0) {
+				vi_req.req_action |= VI_SET_CHROOT;
+				vi_copy_arg(vi_req.vi_chroot,
+				    sizeof(vi_req.vi_chroot), argv[i+1],
+				    "chroot directory");
+			}
+		}
+		if (ioctl(s, SIOCSPVIMAGE, (caddr_t)&vi_req) < 0)
+			goto abort;
+		exit(0);
+
+	case VI_SWITCHTO:
+		/* argc >= 2 here, so argv[1] is the target vimage name */
+		vi_copy_arg(vi_req.vi_name, sizeof(vi_req.vi_name), argv[1],
+		    "vimage name");
+		if (ioctl(s, SIOCSPVIMAGE, (caddr_t)&vi_req) < 0)
+			goto abort;
+
+		/* re-read our (new) vimage to learn its chroot directory */
+		vi_req.req_action = VI_GET;
+		strcpy(vi_req.vi_name, ".");
+		if (ioctl(s, SIOCGPVIMAGE, (caddr_t)&vi_req) < 0) {
+			printf("XXX this should have not happened!\n");
+			goto abort;
+		}
+
+		if (strlen(vi_req.vi_chroot) && (chdir(vi_req.vi_chroot) ||
+		    chroot(vi_req.vi_chroot)))
+			goto abort;
+		close(s);
+
+		if (argc == 2) {
+			printf("Switched to vimage %s\n", argv[1]);
+			if ((shell=getenv("SHELL")) == NULL)
+				execlp("/bin/sh", argv[0], (char *)NULL);
+			else
+				execlp(shell, argv[0], (char *)NULL);
+		} else
+			execvp(argv[2], &argv[2]);
+		/* only reached when exec failed; report errno below */
+		break;
+
+	case VI_DESTROY:
+		if (ioctl(s, SIOCSPVIMAGE, (caddr_t)&vi_req) < 0)
+			goto abort;
+		exit(0);
+
+	default:
+		fprintf(stderr, "usage: %s bla bla\n", argv[0]);
+		exit(1);
+	}
+
+abort:
+	perror("Error");
+	exit(1);
+}