--- /u/marko/p4/head/src/sys/amd64/amd64/dump_machdep.c 2008-02-27 18:27:04.000000000 +0100 +++ src/sys/amd64/amd64/dump_machdep.c 2008-02-27 11:38:49.000000000 +0100 @@ -27,6 +27,8 @@ #include __FBSDID("$FreeBSD: src/sys/amd64/amd64/dump_machdep.c,v 1.14 2008/02/15 06:26:25 scottl Exp $"); +#include "opt_vimage.h" + #include #include #include @@ -34,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -109,6 +112,7 @@ mkdumpheader(struct kerneldumpheader *kdh, uint32_t archver, uint64_t dumplen, uint32_t blksz) { + INIT_VPROCG(TD_TO_VPROCG(curthread)); /* XXX */ bzero(kdh, sizeof(*kdh)); strncpy(kdh->magic, KERNELDUMPMAGIC, sizeof(kdh->magic)); @@ -118,7 +122,7 @@ kdh->dumplength = htod64(dumplen); kdh->dumptime = htod64(time_second); kdh->blocksize = htod32(blksz); - strncpy(kdh->hostname, hostname, sizeof(kdh->hostname)); + strncpy(kdh->hostname, V_hostname, sizeof(kdh->hostname)); strncpy(kdh->versionstring, version, sizeof(kdh->versionstring)); if (panicstr != NULL) strncpy(kdh->panicstring, panicstr, sizeof(kdh->panicstring)); --- /u/marko/p4/head/src/sys/amd64/amd64/minidump_machdep.c 2008-02-27 18:27:04.000000000 +0100 +++ src/sys/amd64/amd64/minidump_machdep.c 2008-02-27 11:38:52.000000000 +0100 @@ -27,6 +27,8 @@ #include __FBSDID("$FreeBSD: src/sys/amd64/amd64/minidump_machdep.c,v 1.4 2008/02/15 06:26:25 scottl Exp $"); +#include "opt_vimage.h" + #include #include #include @@ -34,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -85,6 +88,7 @@ mkdumpheader(struct kerneldumpheader *kdh, uint32_t archver, uint64_t dumplen, uint32_t blksz) { + INIT_VPROCG(TD_TO_VPROCG(curthread)); /* XXX */ bzero(kdh, sizeof(*kdh)); strncpy(kdh->magic, KERNELDUMPMAGIC, sizeof(kdh->magic)); @@ -94,7 +98,7 @@ kdh->dumplength = htod64(dumplen); kdh->dumptime = htod64(time_second); kdh->blocksize = htod32(blksz); - strncpy(kdh->hostname, hostname, sizeof(kdh->hostname)); + strncpy(kdh->hostname, V_hostname, 
sizeof(kdh->hostname)); strncpy(kdh->versionstring, version, sizeof(kdh->versionstring)); if (panicstr != NULL) strncpy(kdh->panicstring, panicstr, sizeof(kdh->panicstring)); --- /u/marko/p4/head/src/sys/compat/linprocfs/linprocfs.c 2008-01-15 17:45:36.000000000 +0100 +++ src/sys/compat/linprocfs/linprocfs.c 2008-02-27 11:39:51.000000000 +0100 @@ -42,6 +42,9 @@ #include __FBSDID("$FreeBSD: src/sys/compat/linprocfs/linprocfs.c,v 1.118 2008/01/10 01:10:41 attilio Exp $"); +#include "opt_compat.h" +#include "opt_vimage.h" + #include #include #include @@ -70,7 +73,9 @@ #include #include #include +#include +#include #include #include @@ -87,7 +92,6 @@ #include #endif /* __i386__ || __amd64__ */ -#include "opt_compat.h" #ifdef COMPAT_LINUX32 /* XXX */ #include #else @@ -507,15 +511,16 @@ static int linprocfs_doloadavg(PFS_FILL_ARGS) { + INIT_VPROCG(TD_TO_VPROCG(curthread)); sbuf_printf(sb, "%d.%02d %d.%02d %d.%02d %d/%d %d\n", - (int)(averunnable.ldavg[0] / averunnable.fscale), - (int)(averunnable.ldavg[0] * 100 / averunnable.fscale % 100), - (int)(averunnable.ldavg[1] / averunnable.fscale), - (int)(averunnable.ldavg[1] * 100 / averunnable.fscale % 100), - (int)(averunnable.ldavg[2] / averunnable.fscale), - (int)(averunnable.ldavg[2] * 100 / averunnable.fscale % 100), + (int)(V_averunnable.ldavg[0] / V_averunnable.fscale), + (int)(V_averunnable.ldavg[0] * 100 / V_averunnable.fscale % 100), + (int)(V_averunnable.ldavg[1] / V_averunnable.fscale), + (int)(V_averunnable.ldavg[1] * 100 / V_averunnable.fscale % 100), + (int)(V_averunnable.ldavg[2] / V_averunnable.fscale), + (int)(V_averunnable.ldavg[2] * 100 / V_averunnable.fscale % 100), 1, /* number of running tasks */ nprocs, /* number of tasks */ lastpid /* the last pid */ @@ -998,6 +1003,7 @@ static int linprocfs_donetdev(PFS_FILL_ARGS) { + INIT_VNET_NET(TD_TO_VNET(curthread)); char ifname[16]; /* XXX LINUX_IFNAMSIZ */ struct ifnet *ifp; @@ -1007,7 +1013,7 @@ "bytes packets errs drop fifo frame compressed"); 
IFNET_RLOCK(); - TAILQ_FOREACH(ifp, &ifnet, if_link) { + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { linux_ifname(ifp, ifname, sizeof ifname); sbuf_printf(sb, "%6.6s:", ifname); sbuf_printf(sb, "%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu ", --- /u/marko/p4/head/src/sys/compat/linux/linux_ioctl.c 2007-11-13 02:48:15.000000000 +0100 +++ src/sys/compat/linux/linux_ioctl.c 2007-12-10 11:25:48.000000000 +0100 @@ -29,6 +29,9 @@ #include __FBSDID("$FreeBSD: src/sys/compat/linux/linux_ioctl.c,v 1.139 2007/11/07 16:42:52 kib Exp $"); +#include "opt_vimage.h" +#include "opt_compat.h" + #include #include #include @@ -56,12 +59,13 @@ #include #include #include +#include + +#include #include #include #include -#include "opt_compat.h" - #ifdef COMPAT_LINUX32 #include #include @@ -2037,6 +2041,7 @@ int linux_ifname(struct ifnet *ifp, char *buffer, size_t buflen) { + INIT_VNET_NET(ifp->if_vnet); struct ifnet *ifscan; int ethno; @@ -2047,7 +2052,7 @@ /* Determine the (relative) unit number for ethernet interfaces */ ethno = 0; IFNET_RLOCK(); - TAILQ_FOREACH(ifscan, &ifnet, if_link) { + TAILQ_FOREACH(ifscan, &V_ifnet, if_link) { if (ifscan == ifp) { IFNET_RUNLOCK(); return (snprintf(buffer, buflen, "eth%d", ethno)); @@ -2070,6 +2075,7 @@ static struct ifnet * ifname_linux_to_bsd(const char *lxname, char *bsdname) { + INIT_VNET_NET(TD_TO_VNET(curthread)); struct ifnet *ifp; int len, unit; char *ep; @@ -2086,7 +2092,7 @@ index = 0; is_eth = (len == 3 && !strncmp(lxname, "eth", len)) ? 1 : 0; IFNET_RLOCK(); - TAILQ_FOREACH(ifp, &ifnet, if_link) { + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { /* * Allow Linux programs to use FreeBSD names. 
Don't presume * we never have an interface named "eth", so don't make @@ -2110,6 +2116,7 @@ static int linux_ifconf(struct thread *td, struct ifconf *uifc) { + INIT_VNET_NET(TD_TO_VNET(td)); #ifdef COMPAT_LINUX32 struct l_ifconf ifc; #else @@ -2130,7 +2137,7 @@ /* handle the 'request buffer size' case */ if (ifc.ifc_buf == PTROUT(NULL)) { ifc.ifc_len = 0; - TAILQ_FOREACH(ifp, &ifnet, if_link) { + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa = ifa->ifa_addr; if (sa->sa_family == AF_INET) @@ -2157,7 +2164,7 @@ /* Return all AF_INET addresses of all interfaces */ IFNET_RLOCK(); /* could sleep XXX */ - TAILQ_FOREACH(ifp, &ifnet, if_link) { + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { int addrs = 0; bzero(&ifr, sizeof(ifr)); --- /u/marko/p4/head/src/sys/compat/linux/linux_misc.c 2008-02-27 18:27:10.000000000 +0100 +++ src/sys/compat/linux/linux_misc.c 2008-02-27 11:39:56.000000000 +0100 @@ -32,6 +32,7 @@ #include "opt_compat.h" #include "opt_mac.h" +#include "opt_vimage.h" #include #include @@ -63,6 +64,7 @@ #include #include #include +#include #include @@ -123,6 +125,7 @@ int linux_sysinfo(struct thread *td, struct linux_sysinfo_args *args) { + INIT_VPROCG(TD_TO_VPROCG(td)); struct l_sysinfo sysinfo; vm_object_t object; int i, j; @@ -135,8 +138,8 @@ /* Use the information from the mib to get our load averages */ for (i = 0; i < 3; i++) - sysinfo.loads[i] = averunnable.ldavg[i] * - LINUX_SYSINFO_LOADS_SCALE / averunnable.fscale; + sysinfo.loads[i] = V_averunnable.ldavg[i] * + LINUX_SYSINFO_LOADS_SCALE / V_averunnable.fscale; sysinfo.totalram = physmem * PAGE_SIZE; sysinfo.freeram = sysinfo.totalram - cnt.v_wire_count * PAGE_SIZE; @@ -709,6 +712,7 @@ int linux_newuname(struct thread *td, struct linux_newuname_args *args) { + INIT_VPROCG(TD_TO_VPROCG(td)); struct l_new_utsname utsname; char osname[LINUX_MAX_UTSNAME]; char osrelease[LINUX_MAX_UTSNAME]; @@ -760,7 +764,7 @@ #else /* something other than i386 or 
amd64 - assume we and Linux agree */ strlcpy(utsname.machine, machine, LINUX_MAX_UTSNAME); #endif /* __i386__ */ - strlcpy(utsname.domainname, domainname, LINUX_MAX_UTSNAME); + strlcpy(utsname.domainname, V_domainname, LINUX_MAX_UTSNAME); return (copyout(&utsname, args->buf, sizeof(utsname))); } --- /u/marko/p4/head/src/sys/conf/files 2008-02-27 18:27:14.000000000 +0100 +++ src/sys/conf/files 2008-02-27 11:40:17.000000000 +0100 @@ -1475,6 +1475,7 @@ kern/kern_timeout.c standard kern/kern_umtx.c standard kern/kern_uuid.c standard +kern/kern_vimage.c optional vimage kern/kern_xxx.c standard kern/link_elf.c standard kern/linker_if.m standard @@ -1838,6 +1839,7 @@ netgraph/ng_nat.c optional netgraph_nat netgraph/ng_one2many.c optional netgraph_one2many netgraph/ng_parse.c optional netgraph +netgraph/ng_pipe.c optional netgraph_pipe netgraph/ng_ppp.c optional netgraph_ppp netgraph/ng_pppoe.c optional netgraph_pppoe netgraph/ng_pptpgre.c optional netgraph_pptpgre @@ -1851,6 +1853,7 @@ netgraph/ng_tee.c optional netgraph_tee netgraph/ng_tty.c optional netgraph_tty netgraph/ng_vjc.c optional netgraph_vjc +netgraph/ng_wormhole.c optional netgraph_wormhole vimage netinet/accf_data.c optional accept_filter_data netinet/accf_http.c optional accept_filter_http netinet/if_atm.c optional atm --- /u/marko/p4/head/src/sys/conf/options 2008-02-27 18:27:14.000000000 +0100 +++ src/sys/conf/options 2008-02-27 11:40:25.000000000 +0100 @@ -457,6 +457,7 @@ NETGRAPH_NAT opt_netgraph.h NETGRAPH_NETFLOW opt_netgraph.h NETGRAPH_ONE2MANY opt_netgraph.h +NETGRAPH_PIPE opt_netgraph.h NETGRAPH_PPP opt_netgraph.h NETGRAPH_PPPOE opt_netgraph.h NETGRAPH_PPTPGRE opt_netgraph.h @@ -471,6 +472,7 @@ NETGRAPH_TTY opt_netgraph.h NETGRAPH_UI opt_netgraph.h NETGRAPH_VJC opt_netgraph.h +NETGRAPH_WORMHOLE opt_netgraph.h # NgATM options NGATM_ATM opt_netgraph.h @@ -760,3 +762,6 @@ #Disable code to dispatch tcp offloading TCP_OFFLOAD_DISABLE opt_inet.h + +# Virtualize the network stack +VIMAGE opt_vimage.h --- 
/u/marko/p4/head/src/sys/contrib/pf/net/pfvar.h 2007-08-31 03:45:08.000000000 +0200 +++ src/sys/contrib/pf/net/pfvar.h 2007-10-05 12:23:37.000000000 +0200 @@ -1848,5 +1848,22 @@ struct pf_os_fingerprint * pf_osfp_validate(void); +/* + * Stack virtualization support. + */ +#ifdef VIMAGE +struct vnet_pf { + struct vnet *parent_vnet; + +}; +#endif + +/* + * Symbol translation macros + */ +#define INIT_VNET_PF(vnet) \ + INIT_FROM_VNET(vnet, VNET_MOD_PF, struct vnet_pf, vnet_pf) + +#define VNET_PF(sym) VSYM(vnet_pf, sym) #endif /* _NET_PFVAR_H_ */ --- /u/marko/p4/head/src/sys/ddb/db_command.c 2007-12-27 19:27:46.000000000 +0100 +++ src/sys/ddb/db_command.c 2008-01-14 19:22:48.000000000 +0100 @@ -268,24 +268,40 @@ return (result); } +/* + * Print out a sorted command table. + */ static void db_cmd_list(table) struct command_table *table; { - register struct command *cmd; - register struct command **aux_cmdp; + struct command *cmd; + struct command **aux_cmdp; + char *last; + char *next = ""; - for (cmd = table->table; cmd->name != 0; cmd++) { - db_printf("%-12s", cmd->name); - db_end_line(12); - } - if (table->aux_tablep == NULL) - return; - for (aux_cmdp = table->aux_tablep; aux_cmdp < table->aux_tablep_end; - aux_cmdp++) { - db_printf("%-12s", (*aux_cmdp)->name); - db_end_line(12); - } + do { + last = next; + for (cmd = table->table; cmd->name != 0; cmd++) { + if (strcmp(cmd->name, last) > 0 && + (last == next || strcmp(cmd->name, next) < 0)) + next = cmd->name; + } + if (table->aux_tablep != NULL) { + for (aux_cmdp = table->aux_tablep; + aux_cmdp < table->aux_tablep_end; aux_cmdp++) { + cmd = *aux_cmdp; + if (strcmp(cmd->name, last) > 0 && + (last == next || + strcmp(cmd->name, next) < 0)) + next = cmd->name; + } + } + if (next != last) { + db_printf("%-12s", next); + db_end_line(12); + } + } while (next != last); } static void --- /u/marko/p4/head/src/sys/ddb/db_textdump.c 2008-02-03 08:15:52.000000000 +0100 +++ src/sys/ddb/db_textdump.c 2008-02-27 11:40:43.000000000 
+0100 @@ -60,6 +60,7 @@ __FBSDID("$FreeBSD: src/sys/ddb/db_textdump.c,v 1.3 2008/01/31 16:22:14 rwatson Exp $"); #include "opt_config.h" +#include "opt_vimage.h" #include #include @@ -68,6 +69,7 @@ #include #include #include +#include #include #include @@ -183,6 +185,7 @@ mkdumpheader(struct kerneldumpheader *kdh, uint32_t archver, uint64_t dumplen, uint32_t blksz) { + INIT_VPROCG(TD_TO_VPROCG(&thread0)); bzero(kdh, sizeof(*kdh)); strncpy(kdh->magic, TEXTDUMPMAGIC, sizeof(kdh->magic)); @@ -192,7 +195,7 @@ kdh->dumplength = htod64(dumplen); kdh->dumptime = htod64(time_second); kdh->blocksize = htod32(blksz); - strncpy(kdh->hostname, hostname, sizeof(kdh->hostname)); + strncpy(kdh->hostname, V_hostname, sizeof(kdh->hostname)); strncpy(kdh->versionstring, version, sizeof(kdh->versionstring)); if (panicstr != NULL) strncpy(kdh->panicstring, panicstr, sizeof(kdh->panicstring)); --- /u/marko/p4/head/src/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c 2008-02-27 18:27:46.000000000 +0100 +++ src/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c 2008-02-27 11:41:57.000000000 +0100 @@ -136,10 +136,6 @@ #define VALIDATE_SOCK(so) #define DEBUG_WR 0 -extern int tcp_do_autorcvbuf; -extern int tcp_do_autosndbuf; -extern int tcp_autorcvbuf_max; -extern int tcp_autosndbuf_max; static void t3_send_reset(struct toepcb *toep); static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status); --- /u/marko/p4/head/src/sys/dev/firewire/firewire.c 2007-10-21 13:56:14.000000000 +0200 +++ src/sys/dev/firewire/firewire.c 2007-12-10 11:25:55.000000000 +0100 @@ -35,6 +35,8 @@ * */ +#include "opt_vimage.h" + #include #include #include @@ -45,6 +47,7 @@ #include #include #include +#include #include @@ -674,6 +677,7 @@ static void fw_reset_crom(struct firewire_comm *fc) { + INIT_VPROCG(TD_TO_VPROCG(curthread)); /* XXX */ struct crom_src_buf *buf; struct crom_src *src; struct crom_chunk *root; @@ -699,7 +703,7 @@ crom_add_simple_text(src, root, &buf->vendor, "FreeBSD Project"); crom_add_entry(root, CSRKEY_HW, 
__FreeBSD_version); #endif - crom_add_simple_text(src, root, &buf->hw, hostname); + crom_add_simple_text(src, root, &buf->hw, V_hostname); } /* --- /u/marko/p4/head/src/sys/dev/iwi/if_iwi.c 2007-12-27 19:30:53.000000000 +0100 +++ src/sys/dev/iwi/if_iwi.c 2008-01-14 19:23:18.000000000 +0100 @@ -35,6 +35,8 @@ * http://www.intel.com/network/connectivity/products/wireless/prowireless_mobile.htm */ +#include "opt_vimage.h" + #include #include #include @@ -55,6 +57,7 @@ #include #include #include +#include #include #include @@ -189,6 +192,9 @@ static void iwi_sysctlattach(struct iwi_softc *); static void iwi_led_event(struct iwi_softc *, int); static void iwi_ledattach(struct iwi_softc *); +#ifdef VIMAGE +static void iwi_reassign(struct ifnet *, struct vnet *, char *); +#endif static int iwi_probe(device_t); static int iwi_attach(device_t); @@ -407,6 +413,9 @@ ieee80211_ifattach(ic); ic->ic_bmissthreshold = 10; /* override default */ /* override default methods */ +#ifdef VIMAGE + ifp->if_reassign = iwi_reassign; +#endif ic->ic_node_alloc = iwi_node_alloc; sc->sc_node_free = ic->ic_node_free; ic->ic_node_free = iwi_node_free; @@ -505,6 +514,28 @@ return 0; } +#ifdef VIMAGE +static void +iwi_reassign(struct ifnet *ifp, struct vnet *vnet, char *dname) +{ + struct iwi_softc *sc = ifp->if_softc; + struct ieee80211com *ic = &sc->sc_ic; + IWI_LOCK_DECL; + + IWI_LOCK(sc); + bpfdetach(ifp); + sc->sc_drvbpf = NULL; + ieee80211_reassign(ic, vnet, dname); + + CURVNET_SET_QUIET(vnet); + bpfattach2(ifp, DLT_IEEE802_11_RADIO, + sizeof (struct ieee80211_frame) + sizeof (sc->sc_txtap), + &sc->sc_drvbpf); + CURVNET_RESTORE(); + IWI_UNLOCK(sc); +} +#endif + static void iwi_dma_map_addr(void *arg, bus_dma_segment_t *segs, int nseg, int error) { --- /u/marko/p4/head/src/sys/fs/cd9660/cd9660_rrip.c 2007-08-31 03:46:55.000000000 +0200 +++ src/sys/fs/cd9660/cd9660_rrip.c 2007-10-22 18:06:27.000000000 +0200 @@ -34,6 +34,8 @@ * @(#)cd9660_rrip.c 8.6 (Berkeley) 12/5/94 */ +#include "opt_vimage.h" 
+ #include __FBSDID("$FreeBSD: src/sys/fs/cd9660/cd9660_rrip.c,v 1.30 2007/02/11 13:54:25 rodrigc Exp $"); @@ -44,6 +46,7 @@ #include #include #include +#include #include #include @@ -113,6 +116,7 @@ ISO_RRIP_SLINK *p; ISO_RRIP_ANALYZE *ana; { + INIT_VPROCG(TD_TO_VPROCG(curthread)); ISO_RRIP_SLINK_COMPONENT *pcomp; ISO_RRIP_SLINK_COMPONENT *pcompe; int len, wlen, cont; @@ -171,8 +175,8 @@ case ISO_SUSP_CFLAG_HOST: /* Inserting hostname i.e. "kurt.tools.de" */ - inbuf = hostname; - wlen = strlen(hostname); + inbuf = V_hostname; + wlen = strlen(V_hostname); break; case ISO_SUSP_CFLAG_CONTINUE: @@ -222,6 +226,7 @@ ISO_RRIP_ALTNAME *p; ISO_RRIP_ANALYZE *ana; { + INIT_VPROCG(TD_TO_VPROCG(curthread)); char *inbuf; int wlen; int cont; @@ -243,8 +248,8 @@ case ISO_SUSP_CFLAG_HOST: /* Inserting hostname i.e. "kurt.tools.de" */ - inbuf = hostname; - wlen = strlen(hostname); + inbuf = V_hostname; + wlen = strlen(V_hostname); break; case ISO_SUSP_CFLAG_CONTINUE: --- /u/marko/p4/head/src/sys/i386/conf/.cvsignore 2007-08-31 03:47:17.000000000 +0200 +++ src/sys/i386/conf/.cvsignore 2007-10-05 12:26:12.000000000 +0200 @@ -1 +0,0 @@ -[A-Za-z0-9]* --- /u/marko/p4/head/src/sys/i386/conf/NOTES 2008-02-03 08:16:00.000000000 +0100 +++ src/sys/i386/conf/NOTES 2008-02-27 11:46:14.000000000 +0100 @@ -248,8 +248,14 @@ # # Not all device drivers support this mode of operation at the time of # this writing. See polling(4) for more details. +# +# VIMAGE adds support for maintaining multiple independent network stack +# state instances in the kernel. This feature is still in an early +# experimental phase, and needs more thought, testing, and documentation. +# options DEVICE_POLLING +options VIMAGE ##################################################################### --- /dev/null 2008-02-27 21:11:00.000000000 +0100 +++ src/sys/i386/conf/VIMAGE 2007-10-05 12:26:12.000000000 +0200 @@ -0,0 +1,16 @@ +# +# VIMAGE - sample kernel configuration file with a virtualized network stack +# configured. 
+# +# $FreeBSD$ +# +include GENERIC +ident VIMAGE + +options VIMAGE + +# +# Some kernel subsystems and functions don't yet compile with VIMAGE. Remove +# from the configuration for now. +# +nooptions SCTP --- /dev/null 2008-02-27 21:11:00.000000000 +0100 +++ src/sys/i386/conf/TPX32V 2007-12-01 12:34:54.000000000 +0100 @@ -0,0 +1,22 @@ +include TPX32C +ident TPX32V + +options VIMAGE + +options NETGRAPH +options NETGRAPH_PIPE +options NETGRAPH_ETHER +options NETGRAPH_EIFACE +options NETGRAPH_IFACE +options NETGRAPH_BRIDGE +options NETGRAPH_SOCKET +options NETGRAPH_KSOCKET + +device wlan # 802.11 support +device wlan_wep # 802.11 WEP support +device wlan_ccmp # 802.11 CCMP support +device wlan_tkip # 802.11 TKIP support +device wlan_amrr # AMRR transmit rate control algorithm +device wlan_scan_ap # 802.11 AP mode scanning +device wlan_scan_sta # 802.11 STA mode scanning +device iwi --- /dev/null 2008-02-27 21:11:00.000000000 +0100 +++ src/sys/i386/conf/TPX32V_NODEBUG 2007-10-05 12:30:14.000000000 +0200 @@ -0,0 +1,4 @@ +include TPX32C_NODEBUG +ident TPX32V_NODEBUG + +options VIMAGE --- /dev/null 2008-02-27 21:11:00.000000000 +0100 +++ src/sys/i386/conf/TPX32C_NODEBUG 2008-01-04 14:27:24.000000000 +0100 @@ -0,0 +1,117 @@ +cpu I686_CPU +ident TPX32C + +# To statically compile in device wiring instead of /boot/device.hints +#hints "GENERIC.hints" # Default places to look for devices. 
+ +makeoptions DEBUG=-g # Build kernel with gdb(1) debug symbols + +options SCHED_4BSD # 4BSD scheduler +options PREEMPTION # Enable kernel thread preemption +options INET # InterNETworking +options INET6 # IPv6 communications protocols +options SCTP # Stream Transmission Control Protocol +options FFS # Berkeley Fast Filesystem +options SOFTUPDATES # Enable FFS soft updates support +options UFS_ACL # Support for access control lists +options UFS_DIRHASH # Improve performance on big directories +options UFS_GJOURNAL # Enable gjournal-based UFS journaling +options MSDOSFS # MSDOS Filesystem +options CD9660 # ISO 9660 Filesystem +options PROCFS # Process filesystem (requires PSEUDOFS) +options PSEUDOFS # Pseudo-filesystem framework +options GEOM_PART_GPT # GUID Partition Tables. +options GEOM_LABEL # Provides labelization +options COMPAT_43TTY # BSD 4.3 TTY compat [KEEP THIS!] +options COMPAT_FREEBSD4 # Compatible with FreeBSD4 +options COMPAT_FREEBSD5 # Compatible with FreeBSD5 +options COMPAT_FREEBSD6 # Compatible with FreeBSD6 +options KTRACE # ktrace(1) support +options SYSVSHM # SYSV-style shared memory +options SYSVMSG # SYSV-style message queues +options SYSVSEM # SYSV-style semaphores +options _KPOSIX_PRIORITY_SCHEDULING # POSIX P1003_1B real-time extensions +options KBD_INSTALL_CDEV # install a CDEV entry in /dev +options STOP_NMI # Stop CPUS using NMI instead of IPI +options AUDIT # Security event auditing + +# Debugging for use in -current +#options KDB # Enable kernel debugger support. +#options DDB # Support DDB. +#options INVARIANTS # Enable calls of extra sanity checking +#options INVARIANT_SUPPORT # Extra sanity checks of internal structures, required by INVARIANTS +#options WITNESS # Enable checks to detect deadlocks and cycles +#options WITNESS_SKIPSPIN # Don't run witness on spinlocks for speed + +# Bus support. 
+device pci + +# ATA and ATAPI devices +device ata +device atadisk # ATA disk drives +device atapicd # ATAPI CDROM drives +options ATA_STATIC_ID # Static device numbering + +# SCSI peripherals +device scbus # SCSI bus (required for SCSI) +device da # Direct Access (disks) +device pass # Passthrough device (direct SCSI access) + +# atkbdc0 controls both the keyboard and the PS/2 mouse +device atkbdc # AT keyboard controller +device atkbd # AT keyboard +device psm # PS/2 mouse + +device kbdmux # keyboard multiplexer + +device vga # VGA video card driver + +# syscons is the default console driver, resembling an SCO console +device sc + +device agp # support several AGP chipsets + +# Power management support (see NOTES for more options) +device apm +# Add suspend/resume support for the i8254. +device pmtimer + +# PCCARD (PCMCIA) support +# PCMCIA and cardbus bridge support +device cbb # cardbus (yenta) bridge +device pccard # PC Card (16-bit) bus +device cardbus # CardBus (32-bit) bus + +# Serial (COM) ports +device sio # 8250, 16[45]50 based serial ports +device uart # Generic UART driver + +# Parallel port +device ppc +device ppbus # Parallel port bus (required) +device lpt # Printer +device ppi # Parallel port interface device + +# Pseudo devices. +device loop # Network loopback +device random # Entropy device +device ether # Ethernet support +device tun # Packet tunnel. +device pty # Pseudo-ttys (telnet etc) +device md # Memory "disks" +device firmware # firmware assist module + +# The `bpf' device enables the Berkeley Packet Filter. +# Be aware of the administrative consequences of enabling this! +# Note that 'bpf' is required for DHCP. 
+device bpf # Berkeley packet filter + +options HZ=200 + +nooptions SCTP + +options IPSEC +device enc +device crypto + +options ALT_BREAK_TO_DEBUGGER --- /dev/null 2008-02-27 21:11:00.000000000 +0100 +++ src/sys/i386/conf/TPX32C 2007-12-10 10:26:48.000000000 +0100 @@ -0,0 +1,118 @@ +cpu I686_CPU +ident TPX32C + +# To statically compile in device wiring instead of /boot/device.hints +#hints "GENERIC.hints" # Default places to look for devices. + +makeoptions DEBUG=-g # Build kernel with gdb(1) debug symbols + +options SCHED_ULE +options PREEMPTION # Enable kernel thread preemption +options INET # InterNETworking +options INET6 # IPv6 communications protocols +options SCTP # Stream Transmission Control Protocol +options FFS # Berkeley Fast Filesystem +options SOFTUPDATES # Enable FFS soft updates support +options UFS_ACL # Support for access control lists +options UFS_DIRHASH # Improve performance on big directories +options UFS_GJOURNAL # Enable gjournal-based UFS journaling +options MSDOSFS # MSDOS Filesystem +options CD9660 # ISO 9660 Filesystem +options PROCFS # Process filesystem (requires PSEUDOFS) +options PSEUDOFS # Pseudo-filesystem framework +options NFSCLIENT # Network Filesystem Client +options NFS_ROOT # NFS usable as /, requires NFSCLIENT +options MD_ROOT # MD is a potential root device +options GEOM_PART_GPT # GUID Partition Tables. +options GEOM_LABEL # Provides labelization +options COMPAT_43TTY # BSD 4.3 TTY compat [KEEP THIS!] 
+options COMPAT_FREEBSD4 # Compatible with FreeBSD4 +options COMPAT_FREEBSD5 # Compatible with FreeBSD5 +options COMPAT_FREEBSD6 # Compatible with FreeBSD6 +options KTRACE # ktrace(1) support +options SYSVSHM # SYSV-style shared memory +options SYSVMSG # SYSV-style message queues +options SYSVSEM # SYSV-style semaphores +options _KPOSIX_PRIORITY_SCHEDULING # POSIX P1003_1B real-time extensions +options KBD_INSTALL_CDEV # install a CDEV entry in /dev +options STOP_NMI # Stop CPUS using NMI instead of IPI +options AUDIT # Security event auditing + +# Debugging for use in -current +options KDB # Enable kernel debugger support. +options DDB # Support DDB. +options INVARIANTS # Enable calls of extra sanity checking +options INVARIANT_SUPPORT # Extra sanity checks of internal structures, required by INVARIANTS +options WITNESS # Enable checks to detect deadlocks and cycles +options WITNESS_SKIPSPIN # Don't run witness on spinlocks for speed + +# Bus support. +device pci + +# ATA and ATAPI devices +device ata +device atadisk # ATA disk drives +device atapicd # ATAPI CDROM drives +options ATA_STATIC_ID # Static device numbering + +# SCSI peripherals +device scbus # SCSI bus (required for SCSI) +device da # Direct Access (disks) +device pass # Passthrough device (direct SCSI access) + +# atkbdc0 controls both the keyboard and the PS/2 mouse +device atkbdc # AT keyboard controller +device atkbd # AT keyboard +device psm # PS/2 mouse + +device kbdmux # keyboard multiplexer + +device vga # VGA video card driver + +# syscons is the default console driver, resembling an SCO console +device sc + +device agp # support several AGP chipsets + +# Power management support (see NOTES for more options) +device apm +# Add suspend/resume support for the i8254. 
+device pmtimer + +# PCCARD (PCMCIA) support +# PCMCIA and cardbus bridge support +device cbb # cardbus (yenta) bridge +device pccard # PC Card (16-bit) bus +device cardbus # CardBus (32-bit) bus + +# Serial (COM) ports +device sio # 8250, 16[45]50 based serial ports +device uart # Generic UART driver + +# Parallel port +device ppc +device ppbus # Parallel port bus (required) +device lpt # Printer +device ppi # Parallel port interface device + +# Pseudo devices. +device loop # Network loopback +device random # Entropy device +device ether # Ethernet support +device tun # Packet tunnel. +device pty # Pseudo-ttys (telnet etc) +device md # Memory "disks" +device firmware # firmware assist module + +# The `bpf' device enables the Berkeley Packet Filter. +# Be aware of the administrative consequences of enabling this! +# Note that 'bpf' is required for DHCP. +device bpf # Berkeley packet filter + +options CONSPEED=115200 # Speed for serial console +options HZ=200 +options MAC + +nooptions SCTP +options IPSEC +device crypto --- /u/marko/p4/head/src/sys/i386/i386/dump_machdep.c 2008-02-27 18:28:47.000000000 +0100 +++ src/sys/i386/i386/dump_machdep.c 2008-02-27 11:46:17.000000000 +0100 @@ -24,6 +24,8 @@ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +#include "opt_vimage.h" + #include __FBSDID("$FreeBSD: src/sys/i386/i386/dump_machdep.c,v 1.14 2008/02/15 06:26:25 scottl Exp $"); @@ -34,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -109,6 +112,7 @@ mkdumpheader(struct kerneldumpheader *kdh, uint32_t archver, uint64_t dumplen, uint32_t blksz) { + INIT_VPROCG(TD_TO_VPROCG(curthread)); /* XXX */ bzero(kdh, sizeof(*kdh)); strncpy(kdh->magic, KERNELDUMPMAGIC, sizeof(kdh->magic)); @@ -118,7 +122,7 @@ kdh->dumplength = htod64(dumplen); kdh->dumptime = htod64(time_second); kdh->blocksize = htod32(blksz); - strncpy(kdh->hostname, hostname, sizeof(kdh->hostname)); + strncpy(kdh->hostname, V_hostname, sizeof(kdh->hostname)); strncpy(kdh->versionstring, version, sizeof(kdh->versionstring)); if (panicstr != NULL) strncpy(kdh->panicstring, panicstr, sizeof(kdh->panicstring)); --- /u/marko/p4/head/src/sys/i386/i386/minidump_machdep.c 2008-02-27 18:28:48.000000000 +0100 +++ src/sys/i386/i386/minidump_machdep.c 2008-02-27 11:46:23.000000000 +0100 @@ -24,6 +24,8 @@ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +#include "opt_vimage.h" + #include __FBSDID("$FreeBSD: src/sys/i386/i386/minidump_machdep.c,v 1.5 2008/02/15 06:26:25 scottl Exp $"); @@ -34,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -83,6 +86,7 @@ mkdumpheader(struct kerneldumpheader *kdh, uint32_t archver, uint64_t dumplen, uint32_t blksz) { + INIT_VPROCG(TD_TO_VPROCG(curthread)); bzero(kdh, sizeof(*kdh)); strncpy(kdh->magic, KERNELDUMPMAGIC, sizeof(kdh->magic)); @@ -92,7 +96,7 @@ kdh->dumplength = htod64(dumplen); kdh->dumptime = htod64(time_second); kdh->blocksize = htod32(blksz); - strncpy(kdh->hostname, hostname, sizeof(kdh->hostname)); + strncpy(kdh->hostname, V_hostname, sizeof(kdh->hostname)); strncpy(kdh->versionstring, version, sizeof(kdh->versionstring)); if (panicstr != NULL) strncpy(kdh->panicstring, panicstr, sizeof(kdh->panicstring)); --- /u/marko/p4/head/src/sys/kern/init_main.c 2008-01-15 17:58:03.000000000 +0100 +++ src/sys/kern/init_main.c 2008-02-27 11:46:50.000000000 +0100 @@ -47,6 +47,7 @@ #include "opt_ddb.h" #include "opt_init_path.h" #include "opt_mac.h" +#include "opt_vimage.h" #include #include @@ -66,6 +67,7 @@ #include #include #include +#include #include #include #include @@ -73,6 +75,7 @@ #include #include #include +#include #include @@ -447,6 +450,11 @@ p->p_ucred->cr_uidinfo = uifind(0); p->p_ucred->cr_ruidinfo = uifind(0); p->p_ucred->cr_prison = NULL; /* Don't jail it. 
*/ +#ifdef VIMAGE + P_TO_VIMAGE(p) = LIST_FIRST(&vimage_head); + refcount_acquire(&P_TO_VIMAGE(p)->vi_ucredrefc); + LIST_FIRST(&vprocg_head)->nprocs++; +#endif #ifdef AUDIT audit_cred_kproc0(p->p_ucred); #endif --- /u/marko/p4/head/src/sys/kern/kern_clock.c 2007-12-27 19:31:49.000000000 +0100 +++ src/sys/kern/kern_clock.c 2008-01-14 19:23:33.000000000 +0100 @@ -42,6 +42,7 @@ #include "opt_hwpmc_hooks.h" #include "opt_ntp.h" #include "opt_watchdog.h" +#include "opt_vimage.h" #include #include @@ -65,6 +66,7 @@ #include #include #include +#include #ifdef GPROF #include @@ -87,6 +89,8 @@ static int sysctl_kern_cp_time(SYSCTL_HANDLER_ARGS) { + INIT_VPROCG(TD_TO_VPROCG(curthread)); + int error; long cp_time[CPUSTATES]; #ifdef SCTL_MASK32 @@ -100,14 +104,14 @@ if (!req->oldptr) return SYSCTL_OUT(req, 0, sizeof(cp_time32)); for (i = 0; i < CPUSTATES; i++) - cp_time32[i] = (unsigned int)cp_time[i]; + cp_time32[i] = (unsigned int)V_cp_time[i]; error = SYSCTL_OUT(req, cp_time32, sizeof(cp_time32)); } else #endif { if (!req->oldptr) - return SYSCTL_OUT(req, 0, sizeof(cp_time)); - error = SYSCTL_OUT(req, cp_time, sizeof(cp_time)); + return SYSCTL_OUT(req, 0, sizeof(V_cp_time)); + error = SYSCTL_OUT(req, V_cp_time, sizeof(V_cp_time)); } return error; } @@ -223,6 +227,11 @@ int ticks; int psratio; +#ifdef VIMAGE +u_int tot_acc_statcalls; +int last_acc_ticks; +#endif + /* * Initialize clock frequencies and start both clocks running. 
*/ @@ -468,9 +477,15 @@ struct proc *p; long rss; long *cp_time; + int sel; td = curthread; p = td->td_proc; +#ifdef VIMAGE + INIT_VPROCG(TD_TO_VPROCG(td)); + INIT_VCPU(TD_TO_VCPU(td)); + struct vprocg *vprocg_iter; +#endif cp_time = (long *)PCPU_PTR(cp_time); if (usermode) { @@ -483,9 +498,9 @@ #endif td->td_uticks++; if (p->p_nice > NZERO) - cp_time[CP_NICE]++; + sel = CP_NICE; else - cp_time[CP_USER]++; + sel = CP_USER; } else { /* * Came from kernel mode, so we were: @@ -502,7 +517,7 @@ if ((td->td_pflags & TDP_ITHREAD) || td->td_intr_nesting_level >= 2) { td->td_iticks++; - cp_time[CP_INTR]++; + sel = CP_INTR; } else { #ifdef KSE if (p->p_flag & P_SA) @@ -511,11 +526,50 @@ td->td_pticks++; td->td_sticks++; if (!TD_IS_IDLETHREAD(td)) - cp_time[CP_SYS]++; + sel = CP_SYS; else - cp_time[CP_IDLE]++; + sel = CP_IDLE; + } + } + atomic_add_long(&V_cp_time[sel], 1); /* XXX remove atomic! */ +#ifdef VIMAGE + if (sel != CP_INTR) + sel = CP_IDLE; + /* XXX list locking? */ + LIST_FOREACH(vprocg_iter, &vprocg_head, vprocg_le) + if (vprocg != vprocg_iter) + atomic_add_long(&vprocg_iter->_cp_time[sel], 1); + + /* Per-vcpu average accounting */ + mtx_lock_spin(&vcpu_list_mtx); + tot_acc_statcalls++; + if (!TD_IS_IDLETHREAD(td)) + V_acc_statcalls++; + + /* Decay processing every 1/16 second */ + if (last_acc_ticks + (hz >> 4) <= ticks) { + u_int weight_fixp; + u_int avg0; + + last_acc_ticks = ticks; + /* + * avg0, avg1 and avg2 are stored in 16.16 fixed point format. + * weight_fixp is in 1.31 format for better accuracy. + * + * avg1 loses half of its value in roughly 150 ms. + * avg2 loses half of its value in roughly 1350 ms. 
+ */ + weight_fixp = 0x80000000 / tot_acc_statcalls; + LIST_FOREACH(vcpu, &vcpu_head, vcpu_le) { + avg0 = (weight_fixp * V_acc_statcalls) >> 15; + V_avg1_fixp = (3 * V_avg1_fixp + avg0) >> 2; + V_avg2_fixp = (31 * V_avg2_fixp + avg0) >> 5; + V_acc_statcalls = 0; } + tot_acc_statcalls = 0; } + mtx_unlock_spin(&vcpu_list_mtx); +#endif /* Update resource usage integrals and maximums. */ MPASS(p->p_vmspace != NULL); --- /u/marko/p4/head/src/sys/kern/kern_exit.c 2008-02-27 18:28:48.000000000 +0100 +++ src/sys/kern/kern_exit.c 2008-02-27 11:47:03.000000000 +0100 @@ -40,6 +40,8 @@ #include "opt_compat.h" #include "opt_ktrace.h" #include "opt_mac.h" +#include "opt_sched.h" +#include "opt_vimage.h" #include #include @@ -67,6 +69,7 @@ #include #include #include +#include #ifdef KTRACE #include #endif @@ -165,6 +168,7 @@ } KASSERT(p->p_numthreads == 1, ("exit1: proc %p exiting with %d threads", p, p->p_numthreads)); + /* * Wakeup anyone in procfs' PIOCWAIT. They should have a hold * on our vmspace, so we should block below until they have @@ -404,6 +408,10 @@ LIST_REMOVE(p, p_list); LIST_INSERT_HEAD(&zombproc, p, p_list); LIST_REMOVE(p, p_hash); +#if defined(VIMAGE) && defined(SCHED_4BSD) + if (P_TO_VPROCG(p) != P_TO_VPROCG(p->p_pptr)) + sched_load_reassign(P_TO_VPROCG(p), P_TO_VPROCG(p->p_pptr)); +#endif sx_xunlock(&allproc_lock); /* @@ -661,6 +669,7 @@ AUDIT_ARG(pid, pid); q = td->td_proc; + if (pid == 0) { PROC_LOCK(q); pid = -q->p_pgid; @@ -705,6 +714,7 @@ nfound++; PROC_SLOCK(p); if (p->p_state == PRS_ZOMBIE) { + INIT_VPROCG(P_TO_VPROCG(p)); if (rusage) { *rusage = p->p_ru; calcru(p, &rusage->ru_utime, &rusage->ru_stime); @@ -792,6 +802,9 @@ uma_zfree(proc_zone, p); sx_xlock(&allproc_lock); nprocs--; +#ifdef VIMAGE + vprocg->nprocs--; +#endif sx_xunlock(&allproc_lock); return (0); } --- /u/marko/p4/head/src/sys/kern/kern_fork.c 2007-11-16 18:15:01.000000000 +0100 +++ src/sys/kern/kern_fork.c 2007-12-10 11:26:04.000000000 +0100 @@ -39,6 +39,7 @@ #include "opt_ktrace.h" 
#include "opt_mac.h" +#include "opt_vimage.h" #include #include @@ -65,6 +66,7 @@ #include #include #include +#include #include #include @@ -75,7 +77,6 @@ #include #include - #ifndef _SYS_SYSPROTO_H_ struct fork_args { int dummy; @@ -332,6 +333,9 @@ * are hard-limits as to the number of processes that can run. */ nprocs++; +#ifdef VIMAGE + P_TO_VPROCG(p1)->nprocs++; +#endif /* * Find an unused process ID. We remember a range of unused IDs @@ -500,6 +504,9 @@ td2->td_sigmask = td->td_sigmask; td2->td_flags = TDF_INMEM; + td2->td_vnet = NULL; /* XXX */ + td2->td_vnet_lpush = NULL; /* XXX */ + /* * Duplicate sub-structures as needed. * Increase reference counts on shared objects. --- /u/marko/p4/head/src/sys/kern/kern_jail.c 2008-01-28 23:53:46.000000000 +0100 +++ src/sys/kern/kern_jail.c 2008-02-27 11:47:04.000000000 +0100 @@ -11,6 +11,7 @@ __FBSDID("$FreeBSD: src/sys/kern/kern_jail.c,v 1.75 2008/01/24 08:25:58 bz Exp $"); #include "opt_mac.h" +#include "opt_vimage.h" #include #include @@ -33,6 +34,8 @@ #include #include #include +#include + #include #include @@ -449,6 +452,10 @@ if (cred2->cr_prison != cred1->cr_prison) return (ESRCH); } +#ifdef VIMAGE + if (cred2->cr_vimage->v_procg != cred1->cr_vimage->v_procg) + return (ESRCH); +#endif return (0); } @@ -469,13 +476,14 @@ void getcredhostname(struct ucred *cred, char *buf, size_t size) { + INIT_VPROCG(cred->cr_vimage->v_procg); if (jailed(cred)) { mtx_lock(&cred->cr_prison->pr_mtx); strlcpy(buf, cred->cr_prison->pr_host, size); mtx_unlock(&cred->cr_prison->pr_mtx); } else - strlcpy(buf, hostname, size); + strlcpy(buf, V_hostname, size); } /* --- /u/marko/p4/head/src/sys/kern/kern_kse.c 2007-11-16 18:15:01.000000000 +0100 +++ src/sys/kern/kern_kse.c 2007-12-10 11:26:04.000000000 +0100 @@ -1043,7 +1043,10 @@ */ bcopy(&td->td_startcopy, &td2->td_startcopy, __rangeof(struct thread, td_startcopy, td_endcopy)); + td2->td_vnet = NULL; + td2->td_vnet_lpush = NULL; sched_fork_thread(td, td2); + thread_link(td2, 
ku->ku_proc); bcopy(ku->ku_proc->p_comm, td2->td_name, sizeof(td2->td_name)); /* inherit parts of blocked thread's context as a good template */ --- /u/marko/p4/head/src/sys/kern/kern_linker.c 2008-01-15 18:00:08.000000000 +0100 +++ src/sys/kern/kern_linker.c 2008-02-27 11:47:07.000000000 +0100 @@ -30,6 +30,7 @@ #include "opt_ddb.h" #include "opt_hwpmc_hooks.h" #include "opt_mac.h" +#include "opt_vimage.h" #include #include @@ -51,6 +52,9 @@ #include #include #include +#include + +#include #include @@ -950,6 +954,13 @@ if ((error = priv_check(td, PRIV_KLD_LOAD)) != 0) return (error); +#ifdef VIMAGE + if (!IS_DEFAULT_VIMAGE(TD_TO_VIMAGE(td))) + return (EPERM); + + CURVNET_SET(TD_TO_VNET(td)); +#endif + /* * If file does not contain a qualified name or any dot in it * (kldname.ko, or kldname.ver.ko) treat it as an interface @@ -977,6 +988,7 @@ *fileid = lf->id; unlock: KLD_UNLOCK(); + CURVNET_RESTORE(); return (error); } @@ -1014,6 +1026,11 @@ if ((error = priv_check(td, PRIV_KLD_UNLOAD)) != 0) return (error); + /* XXX should suser catch this for us? 
*/ + VNET_ASSERT(IS_DEFAULT_VIMAGE(TD_TO_VIMAGE(td))); + + CURVNET_SET(TD_TO_VNET(td)); + KLD_LOCK(); lf = linker_find_file_by_id(fileid); if (lf) { @@ -1050,6 +1067,7 @@ PMC_CALL_HOOK(td, PMC_FN_KLD_UNLOAD, (void *) &pkm); #endif KLD_UNLOCK(); + CURVNET_RESTORE(); return (error); } @@ -1267,12 +1285,24 @@ lookup.symvalue = (uintptr_t)symval.value; lookup.symsize = symval.size; error = copyout(&lookup, uap->data, - sizeof(lookup)); + sizeof(lookup)); break; } } +#ifdef VIMAGE + if (lf == NULL) { + CURVNET_SET(TD_TO_VNET(td)); + error = vi_symlookup(&lookup, symstr); + CURVNET_RESTORE(); + if (error == 0) { + error = copyout(&lookup, uap->data, + sizeof(lookup)); + } + } +#else if (lf == NULL) error = ENOENT; +#endif } KLD_UNLOCK(); out: --- /u/marko/p4/head/src/sys/kern/kern_mib.c 2008-02-27 18:28:48.000000000 +0100 +++ src/sys/kern/kern_mib.c 2008-02-27 11:47:10.000000000 +0100 @@ -41,6 +41,7 @@ #include "opt_compat.h" #include "opt_posix.h" #include "opt_config.h" +#include "opt_vimage.h" #include #include @@ -53,6 +54,7 @@ #include #include #include +#include SYSCTL_NODE(, 0, sysctl, CTLFLAG_RW, 0, "Sysctl internal magic"); @@ -206,11 +208,14 @@ SYSCTL_STRING(_hw, HW_MACHINE_ARCH, machine_arch, CTLFLAG_RD, machine_arch, 0, "System architecture"); +#ifndef VIMAGE char hostname[MAXHOSTNAMELEN]; +#endif static int sysctl_hostname(SYSCTL_HANDLER_ARGS) { + INIT_VPROCG(TD_TO_VPROCG(req->td)); struct prison *pr; char tmphostname[MAXHOSTNAMELEN]; int error; @@ -242,7 +247,7 @@ } } else error = sysctl_handle_string(oidp, - hostname, sizeof hostname, req); + V_hostname, sizeof V_hostname, req); return (error); } @@ -328,9 +333,12 @@ 0, 0, sysctl_kern_config, "", "Kernel configuration file"); #endif +#ifndef VIMAGE char domainname[MAXHOSTNAMELEN]; -SYSCTL_STRING(_kern, KERN_NISDOMAINNAME, domainname, CTLFLAG_RW, - &domainname, sizeof(domainname), "Name of the current YP/NIS domain"); +#endif +SYSCTL_V_STRING(V_PROCG, vprocg, _kern, KERN_NISDOMAINNAME, domainname, + 
CTLFLAG_RW, domainname, MAXHOSTNAMELEN, + "Name of the current YP/NIS domain"); u_long hostid; SYSCTL_ULONG(_kern, KERN_HOSTID, hostid, CTLFLAG_RW, &hostid, 0, "Host ID"); --- /u/marko/p4/head/src/sys/kern/kern_prot.c 2007-10-29 17:17:39.000000000 +0100 +++ src/sys/kern/kern_prot.c 2007-12-10 11:26:04.000000000 +0100 @@ -46,6 +46,7 @@ #include "opt_compat.h" #include "opt_mac.h" +#include "opt_vimage.h" #include #include @@ -67,6 +68,7 @@ #include #include #include +#include #include #include @@ -1720,6 +1722,9 @@ KASSERT(td == curthread, ("%s: td not curthread", __func__)); PROC_LOCK_ASSERT(p, MA_OWNED); +#ifdef VIMAGE + if (!vi_child_of(TD_TO_VIMAGE(td), P_TO_VIMAGE(p))) +#endif if ((error = prison_check(td->td_ucred, p->p_ucred))) return (error); #ifdef MAC @@ -1789,6 +1794,10 @@ */ if (jailed(cr)) prison_free(cr->cr_prison); +#ifdef VIMAGE + if (cr->cr_vimage != NULL) + refcount_release(&cr->cr_vimage->vi_ucredrefc); +#endif #ifdef AUDIT audit_cred_destroy(cr); #endif @@ -1824,6 +1833,10 @@ uihold(dest->cr_ruidinfo); if (jailed(dest)) prison_hold(dest->cr_prison); +#ifdef VIMAGE + KASSERT(src->cr_vimage != NULL, ("cr_vimage == NULL")); + refcount_acquire(&dest->cr_vimage->vi_ucredrefc); +#endif #ifdef AUDIT audit_cred_copy(src, dest); #endif --- /u/marko/p4/head/src/sys/kern/kern_switch.c 2007-11-14 19:35:22.000000000 +0100 +++ src/sys/kern/kern_switch.c 2007-12-10 11:26:05.000000000 +0100 @@ -529,6 +529,7 @@ return (NULL); } + /* * Remove the thread from the queue specified by its priority, and clear the * corresponding status bit if the queue becomes empty. 
--- /u/marko/p4/head/src/sys/kern/kern_synch.c 2008-01-15 18:00:09.000000000 +0100 +++ src/sys/kern/kern_synch.c 2008-02-27 11:47:20.000000000 +0100 @@ -38,6 +38,7 @@ __FBSDID("$FreeBSD: src/sys/kern/kern_synch.c,v 1.305 2008/01/10 22:11:20 rwatson Exp $"); #include "opt_ktrace.h" +#include "opt_vimage.h" #include #include @@ -61,9 +62,12 @@ #include #include #endif +#include #include +#include + static void synch_setup(void *dummy); SYSINIT(synch_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, synch_setup, NULL) @@ -74,8 +78,11 @@ static struct callout loadav_callout; static struct callout lbolt_callout; +#ifndef VIMAGE struct loadavg averunnable = { {0, 0, 0}, FSCALE }; /* load average, of runnable procs */ +#endif + /* * Constants for averages over 1, 5, and 15 minutes * when sampling at 5 second intervals. @@ -513,12 +520,19 @@ int i, nrun; struct loadavg *avg; + VPROCG_ITERLOOP_BEGIN(); + INIT_VPROCG(vprocg_iter); +#ifdef VIMAGE + nrun = sched_load(vprocg_iter); +#else nrun = sched_load(); - avg = &averunnable; +#endif + avg = &V_averunnable; for (i = 0; i < 3; i++) avg->ldavg[i] = (cexp[i] * avg->ldavg[i] + nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT; + VPROCG_ITERLOOP_END(); /* * Schedule the next update to occur after 5 seconds, but add a --- /u/marko/p4/head/src/sys/kern/kern_sysctl.c 2007-12-03 11:00:00.000000000 +0100 +++ src/sys/kern/kern_sysctl.c 2007-12-10 11:26:05.000000000 +0100 @@ -40,6 +40,7 @@ #include "opt_compat.h" #include "opt_mac.h" +#include "opt_vimage.h" #include #include @@ -52,6 +53,7 @@ #include #include #include +#include #include @@ -845,6 +847,32 @@ } +#ifdef VIMAGE +int +sysctl_handle_v_int(SYSCTL_HANDLER_V_ARGS) +{ + int tmpout, error = 0; + + SYSCTL_RESOLVE_V_ARG1(); + + /* + * Attempt to get a coherent snapshot by making a copy of the data. 
+ */ + tmpout = *(int *)arg1; + error = SYSCTL_OUT(req, &tmpout, sizeof(int)); + + if (error || !req->newptr) + return (error); + + if (!arg1) + error = EPERM; + else + error = SYSCTL_IN(req, arg1, sizeof(int)); + return (error); +} +#endif + + /* * Based on on sysctl_handle_int() convert milliseconds into ticks. */ @@ -979,6 +1007,48 @@ return (error); } +#ifdef VIMAGE +int +sysctl_handle_v_string(SYSCTL_HANDLER_V_ARGS) +{ + int error=0; + char *tmparg; + size_t outlen; + + SYSCTL_RESOLVE_V_ARG1(); + + /* + * Attempt to get a coherent snapshot by copying to a + * temporary kernel buffer. + */ +retry: + outlen = strlen((char *)arg1)+1; + tmparg = malloc(outlen, M_SYSCTLTMP, M_WAITOK); + + if (strlcpy(tmparg, (char *)arg1, outlen) >= outlen) { + free(tmparg, M_SYSCTLTMP); + goto retry; + } + + error = SYSCTL_OUT(req, tmparg, outlen); + free(tmparg, M_SYSCTLTMP); + + if (error || !req->newptr) + return (error); + + if ((req->newlen - req->newidx) >= arg2) { + error = EINVAL; + } else { + arg2 = (req->newlen - req->newidx); + error = SYSCTL_IN(req, arg1, arg2); + ((char *)arg1)[arg2] = '\0'; + } + + return (error); +} +#endif + + /* * Handle any kind of opaque data. * arg1 points to it, arg2 is the size. @@ -1016,6 +1086,35 @@ return (error); } +#ifdef VIMAGE +int +sysctl_handle_v_opaque(SYSCTL_HANDLER_V_ARGS) +{ + int error, tries; + u_int generation; + struct sysctl_req req2; + + SYSCTL_RESOLVE_V_ARG1(); + + tries = 0; + req2 = *req; +retry: + generation = curthread->td_generation; + error = SYSCTL_OUT(req, arg1, arg2); + if (error) + return (error); + tries++; + if (generation != curthread->td_generation && tries < 3) { + *req = req2; + goto retry; + } + + error = SYSCTL_IN(req, arg1, arg2); + + return (error); +} +#endif + /* * Transfer functions to/from kernel space. 
* XXX: rather untested at this point @@ -1322,7 +1421,17 @@ if (error != 0) return (error); #endif +#ifndef VIMAGE error = oid->oid_handler(oid, arg1, arg2, req); +#else + if (oid->oid_v_subs) { + struct sysctl_v_oid *v_oid = (struct sysctl_v_oid *) oid; + error = v_oid->oid_handler(oid, arg1, arg2, + req, oid->oid_v_subs, + oid->oid_v_mod); + } else + error = oid->oid_handler(oid, arg1, arg2, req); +#endif return (error); } @@ -1413,6 +1522,7 @@ req.lock = REQ_LOCKED; SYSCTL_LOCK(); + CURVNET_SET(TD_TO_VNET(curthread)); do { req.oldidx = 0; @@ -1423,6 +1533,7 @@ if (req.lock == REQ_WIRED && req.validlen > 0) vsunlock(req.oldptr, req.validlen); + CURVNET_RESTORE(); SYSCTL_UNLOCK(); if (error && error != ENOMEM) --- /u/marko/p4/head/src/sys/kern/kern_thread.c 2007-12-27 19:31:56.000000000 +0100 +++ src/sys/kern/kern_thread.c 2008-01-14 19:23:37.000000000 +0100 @@ -26,6 +26,8 @@ * DAMAGE. */ +#include "opt_vimage.h" + #include __FBSDID("$FreeBSD: src/sys/kern/kern_thread.c,v 1.265 2007/12/22 04:56:48 julian Exp $"); @@ -44,6 +46,7 @@ #include #include #include +#include #include --- /u/marko/p4/head/src/sys/kern/kern_timeout.c 2008-02-27 18:28:49.000000000 +0100 +++ src/sys/kern/kern_timeout.c 2008-02-27 11:47:24.000000000 +0100 @@ -73,6 +73,9 @@ struct callout_tailq *callwheel; int softticks; /* Like ticks, but for softclock(). */ struct mtx callout_lock; +#ifdef INVARIANTS +static int callwheel_initialized = 0; +#endif static struct callout *nextsoftcheck; /* Next callout to be checked. */ @@ -143,6 +146,9 @@ TAILQ_INIT(&callwheel[i]); } mtx_init(&callout_lock, "callout", NULL, MTX_SPIN | MTX_RECURSE); +#ifdef INVARIANTS + callwheel_initialized = 1; +#endif } /* @@ -612,11 +618,36 @@ return (1); } +#ifdef INVARIANTS +/* + * Examine the entire callwheel before initializing a new handle, + * and panic if the handle was already linked in.
+ */ +#define CALLWHEEL_CHECK(c) \ + if (callwheel_initialized) { \ + int callwheel_iter; \ + struct callout *c_iter; \ + \ + mtx_lock_spin(&callout_lock); \ + for (callwheel_iter = 0; callwheel_iter <= callwheelmask; \ + callwheel_iter++) \ + TAILQ_FOREACH(c_iter, &callwheel[callwheel_iter], \ + c_links.tqe) \ + if (c_iter == c) \ + panic("%s() for active handle!", \ + __FUNCTION__); \ + mtx_unlock_spin(&callout_lock); \ + } +#else +#define CALLWHEEL_CHECK(c) +#endif /* INVARIANTS */ + void callout_init(c, mpsafe) struct callout *c; int mpsafe; { + CALLWHEEL_CHECK(c); bzero(c, sizeof *c); if (mpsafe) { c->c_lock = NULL; @@ -633,6 +664,7 @@ struct lock_object *lock; int flags; { + CALLWHEEL_CHECK(c); bzero(c, sizeof *c); c->c_lock = lock; KASSERT((flags & ~(CALLOUT_RETURNUNLOCKED | CALLOUT_SHAREDLOCK)) == 0, --- /u/marko/p4/head/src/sys/kern/kern_uuid.c 2007-08-31 03:47:34.000000000 +0200 +++ src/sys/kern/kern_uuid.c 2007-10-22 18:06:31.000000000 +0200 @@ -27,6 +27,8 @@ #include __FBSDID("$FreeBSD: src/sys/kern/kern_uuid.c,v 1.13 2007/04/23 12:53:00 pjd Exp $"); +#include "opt_vimage.h" + #include #include #include @@ -37,7 +39,9 @@ #include #include #include +#include +#include #include #include #include @@ -87,13 +91,14 @@ static void uuid_node(uint16_t *node) { + INIT_VNET_NET(curvnet); struct ifnet *ifp; struct ifaddr *ifa; struct sockaddr_dl *sdl; int i; IFNET_RLOCK(); - TAILQ_FOREACH(ifp, &ifnet, if_link) { + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { /* Walk the address list */ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sdl = (struct sockaddr_dl*)ifa->ifa_addr; --- /dev/null 2008-02-27 21:11:00.000000000 +0100 +++ src/sys/kern/kern_vimage.c 2008-02-27 18:30:30.000000000 +0100 @@ -0,0 +1,1011 @@ +/*- + * Copyright (c) 2004, 2005, 2006 University of Zagreb + * Copyright (c) 2006 FreeBSD Foundation + * + * This software was developed by the University of Zagreb and the + * FreeBSD Foundation under sponsorship by the Stichting NLnet and the + * FreeBSD 
Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * XXX RCS tag goes here + */ + +#include "opt_ddb.h" +#include "opt_vimage.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef DDB +#include +#endif + +#include +#include +#include +#include +#include + +//#define DEBUG_ORDERING + +MALLOC_DEFINE(M_VIMAGE, "vimage", "virtual image resource container"); +MALLOC_DEFINE(M_VNET, "vnet", "network stack control block"); +MALLOC_DEFINE(M_VPROCG, "vprocg", "process group control block"); +MALLOC_DEFINE(M_VCPU, "vcpu", "cpu resource control block"); + +static struct vimage *vi_alloc(struct vimage *, char *); +static int vi_destroy(struct vimage *); +static void vnet_mod_complete_registration(struct vnet_modlink *); +static int vnet_mod_constructor(struct vnet_modlink *); +static int vnet_mod_destructor(struct vnet_modlink *); + +#ifdef VI_PREALLOC_SIZE +/* + * A private memory allocator can be enabled by setting VI_PREALLOC_SIZE + * to amount of memory (in bytes) to be reserved for the allocator at + * boot time. This pool is guaranteed to reside on a 4M superpage(s) on + * i386 and amd64, thus potentially reducing TLB trashing. + * + * So far I couldn't observe any significant performance impact of using + * this allocator vs. the standard malloc(), whereas in FreeBSD 4.11 + * days I recall using "uninitialized data" storage vs. malloc() would + * be an instant win... Is it possible that these days all malloc'ed + * kernel storage is automagically placed on 4M superpages, so that this + * effort is redundant? Who knows... Therefore this code is disabled by + * default, so vi_alloc() and vi_free() simply resolve to standard + * malloc() and free(). 
+ */ + +static void *vi_malloc(unsigned long, struct malloc_type *, int); +static void vi_free(void *, struct malloc_type *); + +struct vi_mtrack { + LIST_ENTRY(vi_mtrack) vmt_le; + char *vmt_addr; + size_t vmt_size; + int vmt_flags; +}; + +static char vi_mpool[VI_PREALLOC_SIZE]; +static struct uma_zone *vi_mtrack_zone; +static LIST_HEAD(, vi_mtrack) vi_mem_free_head; +static LIST_HEAD(, vi_mtrack) vi_mem_alloc_head; +static int vi_mpool_fail_cnt = 0; +#else +#define vi_malloc(addr, type, flags) malloc((addr), (type), (flags)) +#define vi_free(addr, type) free((addr), (type)) +#endif /* VI_PREALLOC_SIZE */ + +struct vimage_list_head vimage_head; +struct vnet_list_head vnet_head; +struct vprocg_list_head vprocg_head; +struct vcpu_list_head vcpu_head; + +struct cv vnet_list_condvar; +struct mtx vnet_list_refc_mtx; +int vnet_list_refc = 0; + +struct mtx vcpu_list_mtx; + +#define VNET_LIST_LOCK() \ + mtx_lock(&vnet_list_refc_mtx); \ + while (vnet_list_refc != 0) \ + cv_wait(&vnet_list_condvar, &vnet_list_refc_mtx); + +#define VNET_LIST_UNLOCK() \ + mtx_unlock(&vnet_list_refc_mtx); + +static u_int last_vi_id = 0; +static u_int last_vnet_id = 0; +static u_int last_vprocg_id = 0; +static u_int last_vcpu_id = 0; + +static TAILQ_HEAD(vnet_modlink_head, vnet_modlink) vnet_modlink_head; +static TAILQ_HEAD(vnet_modpending_head, vnet_modlink) vnet_modpending_head; + +void vnet_mod_register(vmi) + const struct vnet_modinfo *vmi; +{ + vnet_mod_register_multi(vmi, NULL, NULL); +} + +void vnet_mod_register_multi(vmi, iarg, iname) + const struct vnet_modinfo *vmi; + const void *iarg; + const char *iname; +{ + struct vnet_modlink *vml, *vml_iter; + + /* Do not register the same module instance more than once */ + TAILQ_FOREACH(vml_iter, &vnet_modlink_head, vml_mod_le) + if (vml_iter->vml_modinfo == vmi && vml_iter->vml_iarg == iarg) + break; + if (vml_iter != NULL) + panic("attempt to register an already registered vnet module"); + vml = vi_malloc(sizeof(struct vnet_modlink), 
M_VIMAGE, M_NOWAIT); + + /* + * XXX we support only statically assigned module IDs at the time. + * In principle modules should be able to get a dynamically + * assigned ID at registration time. + * + * NOTE(review): the first assertion below joins its two range tests + * with ||, so for any positive VNET_MOD_MAX it holds for every ID and + * can never fire; && was presumably intended - confirm the valid ID + * range before tightening it. + */ + VNET_ASSERT(vmi->vmi_id > 0 || vmi->vmi_id < VNET_MOD_MAX); + VNET_ASSERT(!((iarg == NULL) ^ (iname == NULL))); + + vml->vml_modinfo = vmi; + vml->vml_iarg = iarg; + vml->vml_iname = iname; + + /* Check whether the module we depend on is already registered */ + if (vmi->vmi_dependson != VNET_MOD_NONE) { + TAILQ_FOREACH(vml_iter, &vnet_modlink_head, vml_mod_le) + if (vml_iter->vml_modinfo->vmi_id == + vmi->vmi_dependson) + break; /* Dependency found, we are done */ + if (vml_iter == NULL) { +#ifdef DEBUG_ORDERING + printf("dependency %d missing for vnet mod %s," + "postponing registration\n", + vmi->vmi_dependson, vmi->vmi_name); +#endif /* DEBUG_ORDERING */ + TAILQ_INSERT_TAIL(&vnet_modpending_head, vml, + vml_mod_le); + return; + } + } + + vnet_mod_complete_registration(vml); +} + +void vnet_mod_complete_registration(vml) +struct vnet_modlink *vml; +{ + struct vnet_modlink *vml_iter; + + TAILQ_INSERT_TAIL(&vnet_modlink_head, vml, vml_mod_le); + + VNET_ITERLOOP_BEGIN_QUIET(); + vnet_mod_constructor(vml); + VNET_ITERLOOP_END(); + + /* Check for pending modules depending on us */ + do { + TAILQ_FOREACH(vml_iter, &vnet_modpending_head, vml_mod_le) + if (vml_iter->vml_modinfo->vmi_dependson == + vml->vml_modinfo->vmi_id) + break; + if (vml_iter != NULL) { +#ifdef DEBUG_ORDERING + printf("vnet mod %s now registering," + "dependency %d loaded\n", + vml_iter->vml_modinfo->vmi_name, + vml->vml_modinfo->vmi_id); +#endif /* DEBUG_ORDERING */ + TAILQ_REMOVE(&vnet_modpending_head, vml_iter, + vml_mod_le); + vnet_mod_complete_registration(vml_iter); + } + } while (vml_iter != NULL); +} + +void vnet_mod_deregister(vmi) + const struct vnet_modinfo *vmi; +{ + vnet_mod_deregister_multi(vmi, NULL, NULL); +} + +void vnet_mod_deregister_multi(vmi, iarg, iname) + const struct
vnet_modinfo *vmi; + const void *iarg; + const char *iname; +{ + struct vnet_modlink *vml; + + TAILQ_FOREACH(vml, &vnet_modlink_head, vml_mod_le) + if (vml->vml_modinfo == vmi && vml->vml_iarg == iarg) + break; + if (vml == NULL) + panic("cannot deregister unregistered vnet module %s", + vmi->vmi_name); + + VNET_ITERLOOP_BEGIN_QUIET(); + vnet_mod_destructor(vml); + VNET_ITERLOOP_END(); + + TAILQ_REMOVE(&vnet_modlink_head, vml, vml_mod_le); + vi_free(vml, M_VIMAGE); +} + +struct vimage *vnet2vimage(vnet) + struct vnet *vnet; +{ + struct vimage *vip; + + LIST_FOREACH(vip, &vimage_head, vi_le) + if (vip->v_net == vnet) + return(vip); + + panic("vnet2vimage"); /* must never happen */ +} + +char *vnet_name(vnet) + struct vnet *vnet; +{ + return(vnet2vimage(vnet)->vi_name); +} + + +int +vi_child_of(parent, child) + struct vimage *parent, *child; +{ + if (child == parent) + return (0); + for (; child; child = child->vi_parent) + if (child == parent) + return (1); + return (0); +} + +/* + * if_reassign_common() should be called by all device specific + * ifnet reassignment routines after the interface is detached from + * current vnet and before the interface gets attached to the target + * vnet. This routine attempts to shrink if_index in current vnet, + * find an unused if_index in target vnet and calls if_grow() if + * necessary, and finally finds an unused if_xname for the target + * vnet. + * + * XXX this routine should hold a lock over if_index and return with + * such a lock held, and the caller should release that lock + * after ifattach completes! 
+ */ +void +if_reassign_common(struct ifnet *ifp, struct vnet *new_vnet, const char *dname) +{ + /* do/while construct needed to confine scope of INIT_VNET_NET() */ + do { + INIT_VNET_NET(curvnet); + + ifnet_byindex(ifp->if_index) = NULL; + /* XXX: should be locked with if_findindex() */ + while (V_if_index > 0 && ifnet_byindex(V_if_index) == NULL) + V_if_index--; + } while (0); + + CURVNET_SET_QUIET(new_vnet); + INIT_VNET_NET(new_vnet); + /* + * Try to find an empty slot below if_index. If we fail, take + * the next slot. + * + * XXX: should be locked! + */ + for (ifp->if_index = 1; ifp->if_index <= V_if_index; ifp->if_index++) { + if (ifnet_byindex(ifp->if_index) == NULL) + break; + } + /* Catch if_index overflow. */ + if (ifp->if_index < 1) + panic("vi_if_move: if_index overflow"); + + if (ifp->if_index > V_if_index) + V_if_index = ifp->if_index; + if (V_if_index >= V_if_indexlim) + if_grow(); + ifnet_byindex(ifp->if_index) = ifp; + + /* Rename the ifnet */ + if (new_vnet == ifp->if_home_vnet) { + /* always restore the original name on return to home vnet */ + snprintf(ifp->if_xname, IFNAMSIZ, "%s%d", ifp->if_dname, + ifp->if_dunit); + } else { + int unit = 0; + struct ifnet *iter; + + do { + snprintf(ifp->if_xname, IFNAMSIZ, "%s%d", dname, unit); + TAILQ_FOREACH(iter, &V_ifnet, if_link) + if (strcmp(ifp->if_xname, iter->if_xname) == 0) + break; + unit++; + } while (iter); + } + CURVNET_RESTORE(); +} + +/* + * Move the interface to another vnet. The interface can be specified either + * by ifp argument, or by name contained in vi_req->vi_chroot if NULL is + * passed as ifp. The interface will be renamed to vi_req->vi_if_xname + * if vi_req->vi_if_xname is not an empty string (uff ugly ugly)... + * Similarly, the target vnet can be specified either by vnet argument or + * by name. If vnet name equals ".." or vi_req is set to NULL the + * interface is moved to the parent vnet.
+ */ +int +vi_if_move(vi_req, ifp, vip) + struct vi_req *vi_req; + struct ifnet *ifp; + struct vimage *vip; +{ + struct vimage *new_vip; + struct vnet *new_vnet = NULL; + + if (vi_req == NULL || strcmp(vi_req->vi_name, "..") == 0) { + if (IS_DEFAULT_VIMAGE(vip)) + return (ENXIO); + new_vnet = vip->vi_parent->v_net; + } else { + new_vip = vimage_by_name(vip, vi_req->vi_name); + if (new_vip == NULL) + return (ENXIO); + new_vnet = new_vip->v_net; + } + + if (ifp == NULL) + ifp = ifunit(vi_req->vi_chroot); + if (ifp == NULL) + return (ENXIO); + + /* Abort if driver did not provide a if_reassign() method */ + if (ifp->if_reassign == NULL) + return (ENODEV); + + if (vi_req != NULL) { + struct ifnet *t_ifp; + + CURVNET_SET_QUIET(new_vnet); + t_ifp = ifunit(vi_req->vi_if_xname); + CURVNET_RESTORE(); + if (t_ifp != NULL) + return (EEXIST); + } + + if (vi_req && strlen(vi_req->vi_if_xname) > 0) + ifp->if_reassign(ifp, new_vnet, vi_req->vi_if_xname); + else + ifp->if_reassign(ifp, new_vnet, NULL); + getmicrotime(&ifp->if_lastchange); + + /* Report the new if_xname back to the userland */ + if (vi_req != NULL) + sprintf(vi_req->vi_chroot, "%s", ifp->if_xname); + + return (0); +} + + +struct vimage * +vimage_by_name(struct vimage *top, char *name) +{ + struct vimage *vip; + char *next_name; + int namelen; + + next_name = strchr(name, '.'); + if (next_name != NULL) { + namelen = next_name - name; + next_name++; + if (namelen == 0) { + if (strlen(next_name) == 0) + return(top); /* '.' 
== this vimage */ + else + return(NULL); + } + } else + namelen = strlen(name); + if (namelen == 0) + return(NULL); + LIST_FOREACH(vip, &top->vi_child_head, vi_sibling) + if (strlen(vip->vi_name) == namelen && + strncmp(name, vip->vi_name, namelen) == 0) { + if (next_name != NULL) + return(vimage_by_name(vip, next_name)); + else + return(vip); + } + return(NULL); +} + + +static void +vimage_relative_name(struct vimage *top, struct vimage *where, + char *buffer, int bufflen) +{ + int used = 1; + + if (where == top) { + sprintf(buffer, "."); + return; + } else + *buffer = 0; + + do { + int namelen = strlen(where->vi_name); + + if (namelen + used + 1 >= bufflen) + panic("buffer overflow"); + + if (used > 1) { + bcopy(buffer, &buffer[namelen + 1], used); + buffer[namelen] = '.'; + used++; + } else + bcopy(buffer, &buffer[namelen], used); + bcopy(where->vi_name, buffer, namelen); + used += namelen; + where = where->vi_parent; + } while (where != top); +} + + +static struct vimage * +vimage_get_next(struct vimage *top, struct vimage *where, int recurse) +{ + struct vimage *next; + + if (recurse) { + /* Try to go deeper in the hierarchy */ + next = LIST_FIRST(&where->vi_child_head); + if (next != NULL) + return(next); + } + + do { + /* Try to find next sibling */ + next = LIST_NEXT(where, vi_sibling); + if (!recurse || next != NULL) + return(next); + + /* Nothing left on this level, go one level up */ + where = where->vi_parent; + } while (where != top->vi_parent); + + /* Nothing left to be visited, we are done */ + return(NULL); +} + + +int +vi_td_ioctl(cmd, vi_req, td) + u_long cmd; + struct vi_req *vi_req; + struct thread *td; +{ + int error; + struct vimage *vip = TD_TO_VIMAGE(td); + struct vimage *vip_r = NULL; + + error = suser(td); /* XXX replace with priv(9) */ + if (error) + return (error); + + vip_r = vimage_by_name(vip, vi_req->vi_name); + if (vip_r == NULL && !(vi_req->req_action & VI_CREATE)) + return (ESRCH); + if (vip_r != NULL && vi_req->req_action & 
VI_CREATE) + return (EADDRINUSE); + if (vi_req->req_action == VI_GETNEXT) { + vip_r = vimage_get_next(vip, vip_r, 0); + if (vip_r == NULL) + return (ESRCH); + } + if (vi_req->req_action == VI_GETNEXT_RECURSE) { + vip_r = vimage_get_next(vip, vip_r, 1); + if (vip_r == NULL) + return (ESRCH); + } + + if (vip_r && !vi_child_of(vip, vip_r) && /* XXX delete the rest? */ + vi_req->req_action != VI_GET && vi_req->req_action != VI_GETNEXT) + return (EPERM); + + switch (cmd) { + + case SIOCGPVIMAGE: + vimage_relative_name(vip, vip_r, vi_req->vi_name, + sizeof (vi_req->vi_name)); + bcopy(&vip_r->v_procg->_averunnable, &vi_req->averunnable, + sizeof (vi_req->averunnable)); + vi_req->vi_proc_count = vip_r->v_procg->nprocs; + vi_req->vi_if_count = vip_r->v_net->ifccnt; + vi_req->vi_sock_count = vip_r->v_net->sockcnt; + vi_req->cp_time_avg = vip_r->v_cpu->_avg2_fixp; + break; + + case SIOCSPVIMAGE: + if (vi_req->req_action == VI_DESTROY) { + error = vi_destroy(vip_r); + break; + } + + if (vi_req->req_action == VI_SWITCHTO) { + struct proc *p = td->td_proc; + struct ucred *oldcred, *newcred; + + /* + * XXX priv_check()? + * XXX allow only a single td per proc here? 
+ */ + newcred = crget(); + PROC_LOCK(p); + oldcred = p->p_ucred; + setsugid(p); + crcopy(newcred, oldcred); + refcount_release(&newcred->cr_vimage->vi_ucredrefc); + newcred->cr_vimage = vip_r; + refcount_acquire(&newcred->cr_vimage->vi_ucredrefc); + p->p_ucred = newcred; + PROC_UNLOCK(p); + sx_xlock(&allproc_lock); + oldcred->cr_vimage->v_procg->nprocs--; + refcount_release(&oldcred->cr_vimage->vi_ucredrefc); + P_TO_VPROCG(p)->nprocs++; + sched_load_reassign(oldcred->cr_vimage->v_procg, + newcred->cr_vimage->v_procg); + sx_xunlock(&allproc_lock); + crfree(oldcred); + break; + } + + if (vi_req->req_action & VI_CREATE) { + char *dotpos; + + dotpos = strrchr(vi_req->vi_name, '.'); + if (dotpos != NULL) { + *dotpos = 0; + vip = vimage_by_name(vip, vi_req->vi_name); + if (vip == NULL) + return (ESRCH); + dotpos++; + vip_r = vi_alloc(vip, dotpos); + } else + vip_r = vi_alloc(vip, vi_req->vi_name); + if (vip_r == NULL) + return (ENOMEM); + } + + /* XXX What the hell is this doing here? */ + if (vip == vip_r && !IS_DEFAULT_VIMAGE(vip)) + return (EPERM); + } + + return (error); +} + + +int +vi_symlookup(lookup, symstr) + struct kld_sym_lookup *lookup; + char *symstr; +{ + struct vnet_modlink *vml; + + TAILQ_FOREACH(vml, &vnet_modlink_head, vml_mod_le) { + struct vnet_symmap *mapentry; + + if (vml->vml_modinfo->vmi_symmap == NULL) + continue; + + for (mapentry = vml->vml_modinfo->vmi_symmap; + mapentry->name != NULL; mapentry++) { + if (strcmp(symstr, mapentry->name) == 0) { + lookup->symvalue = + (u_long) curvnet->mod_data[vml->vml_modinfo->vmi_id]; + lookup->symvalue += mapentry->offset; + lookup->symsize = mapentry->size; + return 0; + } + } + } + + return ENOENT; +} + + +struct vimage * +vi_alloc(struct vimage *parent, char *name) +{ + struct vimage *vip; + struct vnet *vnet; + struct vprocg *vprocg; + struct vcpu *vcpu; + struct vnet_modlink *vml; + + /* + * XXX don't forget the locking + */ + + /* A brute force check whether there's enough mem for a new vimage */ + 
vip = malloc(512*1024, M_VIMAGE, M_NOWAIT); /* XXX aaaargh... */ + if (vip == NULL) + goto vi_alloc_done; + free(vip, M_VIMAGE); + + vip = vi_malloc(sizeof(struct vimage), M_VIMAGE, M_NOWAIT | M_ZERO); + if (vip == NULL) + panic("vi_alloc: malloc failed for vimage \"%s\"\n", name); + vip->vi_id = last_vi_id++; + LIST_INIT(&vip->vi_child_head); + sprintf(vip->vi_name, "%s", name); + vip->vi_parent = parent; + /* XXX locking */ + if (parent != NULL) + LIST_INSERT_HEAD(&parent->vi_child_head, vip, vi_sibling); + else if (!LIST_EMPTY(&vimage_head)) + panic("there can be only one default vimage!"); + LIST_INSERT_HEAD(&vimage_head, vip, vi_le); + + vnet = vi_malloc(sizeof(struct vnet), M_VNET, M_NOWAIT | M_ZERO); + if (vnet == NULL) + panic("vi_alloc: malloc failed for vnet \"%s\"\n", name); + vip->v_net = vnet; + vnet->vnet_id = last_vnet_id++; + vnet->vnet_magic_n = VNET_MAGIC_N; + + vprocg = vi_malloc(sizeof(struct vprocg), M_VPROCG, M_NOWAIT | M_ZERO); + if (vprocg == NULL) + panic("vi_alloc: malloc failed for vprocg \"%s\"\n", name); + vip->v_procg = vprocg; + vprocg->vprocg_id = last_vprocg_id++; + + vcpu = vi_malloc(sizeof(struct vcpu), M_VCPU, M_NOWAIT | M_ZERO); + if (vcpu == NULL) + panic ("vi_alloc: malloc failed for vcpu \"%s\"\n", name); + vip->v_cpu = vcpu; + vcpu->vcpu_id = last_vcpu_id++; + + /* Struct vprocg initialization - perhaps move to another place? */ + V_averunnable.fscale = FSCALE; + + /* Initialize / attach vnet module instances.
*/ + CURVNET_SET_QUIET(vnet); + TAILQ_FOREACH(vml, &vnet_modlink_head, vml_mod_le) + vnet_mod_constructor(vml); + CURVNET_RESTORE(); + + VNET_LIST_LOCK(); + LIST_INSERT_HEAD(&vnet_head, vnet, vnet_le); + VNET_LIST_UNLOCK(); + + /* XXX locking */ + LIST_INSERT_HEAD(&vprocg_head, vprocg, vprocg_le); + + mtx_lock_spin(&vcpu_list_mtx); + LIST_INSERT_HEAD(&vcpu_head, vcpu, vcpu_le); + mtx_unlock_spin(&vcpu_list_mtx); + +vi_alloc_done: + return (vip); +} + + +/* + * Destroy a vnet - unlink all linked lists, free all the memory, stop all + * the timers... How can one ever be sure to have done *all* the necessary + * steps? + */ +static int +vi_destroy(struct vimage *vip) +{ + struct vnet *vnet = vip->v_net; + struct vprocg *vprocg = vip->v_procg; + struct vcpu *vcpu = vip->v_cpu; + struct ifnet *ifp, *nifp; + struct vnet_modlink *vml; + + /* XXX Beware of races -> more locking to be done... */ + if (!LIST_EMPTY(&vip->vi_child_head)) + return (EBUSY); + + if (vprocg->nprocs != 0) + return (EBUSY); + + if (vnet->sockcnt != 0) + return (EBUSY); + + if (vip->vi_ucredrefc != 0) + printf("vi_destroy: %s ucredrefc %d\n", + vip->vi_name, vip->vi_ucredrefc); /* logged only; teardown continues with references outstanding */ + + /* Point with no return - cleanup MUST succeed! */ + /* XXX locking */ + LIST_REMOVE(vip, vi_le); + LIST_REMOVE(vip, vi_sibling); /* NOTE(review): assumes vip was linked on a parent's child list -- confirm for parentless vimages */ + + /* XXX locking */ + LIST_REMOVE(vprocg, vprocg_le); + + mtx_lock_spin(&vcpu_list_mtx); + LIST_REMOVE(vcpu, vcpu_le); + mtx_unlock_spin(&vcpu_list_mtx); + + VNET_LIST_LOCK(); + LIST_REMOVE(vnet, vnet_le); + VNET_LIST_UNLOCK(); + + CURVNET_SET_QUIET(vnet); + INIT_VNET_NET(vnet); + + /* + * Return all inherited interfaces to their parent vnets, + * alternatively attempt to kill cloning ifnets. + */ + TAILQ_FOREACH_SAFE(ifp, &V_ifnet, if_link, nifp) { + if (ifp->if_home_vnet != ifp->if_vnet) + vi_if_move(NULL, ifp, vip); + else + if_clone_destroy(ifp->if_xname); /* return value ignored */ + } + + /* Detach / free per-module state instances. 
*/ + TAILQ_FOREACH_REVERSE(vml, &vnet_modlink_head, + vnet_modlink_head, vml_mod_le) + vnet_mod_destructor(vml); + +#if 0 + free((caddr_t)vnet->ifnet_addrs, M_IFADDR); + free((caddr_t)vnet->ifindex2ifnet, M_IFADDR); +#endif + + CURVNET_RESTORE(); + + /* hopefully, we are finally OK to free the vnet container itself! */ + vnet->vnet_magic_n = 0xdeadbeef; + vi_free(vnet, M_VNET); + vi_free(vprocg, M_VPROCG); + vi_free(vcpu, M_VCPU); + vi_free(vip, M_VIMAGE); + + return (0); +} + +static int vnet_mod_constructor(vml) +struct vnet_modlink *vml; +{ + const struct vnet_modinfo *vmi = vml->vml_modinfo; + +#ifdef DEBUG_ORDERING + printf("instatiating vnet_%s", vmi->vmi_name); + if (vml->vml_iarg) + printf("/%s", vml->vml_iname); + printf(": "); + if (vmi->vmi_struct_size) + printf("malloc(%d); ", vmi->vmi_struct_size); + if (vmi->vmi_iattach != NULL) + printf("iattach()"); + printf("\n"); +#endif + + if (vmi->vmi_struct_size) { + void *mem = vi_malloc(vmi->vmi_struct_size, M_VNET, + M_NOWAIT | M_ZERO); + if (mem == NULL) /* XXX should return error, not panic */ + panic("vi_alloc: malloc for %s\n", vmi->vmi_name); + curvnet->mod_data[vmi->vmi_id] = mem; + } + + if (vmi->vmi_iattach != NULL) + vmi->vmi_iattach(vml->vml_iarg); + + return 0; +} + +static int vnet_mod_destructor(vml) +struct vnet_modlink *vml; +{ + const struct vnet_modinfo *vmi = vml->vml_modinfo; + +#ifdef DEBUG_ORDERING + printf("destroying vnet_%s", vmi->vmi_name); + if (vml->vml_iarg) + printf("/%s", vml->vml_iname); + printf(": "); + if (vmi->vmi_idetach != NULL) + printf("idetach(); "); + if (vmi->vmi_struct_size) + printf("free()"); + printf("\n"); +#endif + + if (vmi->vmi_idetach) + vmi->vmi_idetach(vml->vml_iarg); + + if (vmi->vmi_struct_size) { + if (curvnet->mod_data[vmi->vmi_id] == NULL) + panic("vi_destroy: %s\n", vmi->vmi_name); + vi_free(curvnet->mod_data[vmi->vmi_id], M_VNET); + curvnet->mod_data[vmi->vmi_id] = NULL; + } + + return 0; +} + +static void +vi_init(void *unused) +{ +#ifdef 
VI_PREALLOC_SIZE + struct vi_mtrack *vmt; + + /* Initialize our private memory allocator */ + LIST_INIT(&vi_mem_free_head); + LIST_INIT(&vi_mem_alloc_head); + vi_mtrack_zone = uma_zcreate("vi_mtrack", sizeof(struct vi_mtrack), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + vmt = uma_zalloc(vi_mtrack_zone, M_NOWAIT); /* XXX result is dereferenced below without a NULL check */ + vmt->vmt_addr = vi_mpool; + vmt->vmt_size = VI_PREALLOC_SIZE; + LIST_INSERT_HEAD(&vi_mem_free_head, vmt, vmt_le); /* the whole pool starts out as one free chunk */ +#endif /* VI_PREALLOC_SIZE */ + + /* vnet module list is both forward and reverse traversable */ + TAILQ_INIT(&vnet_modlink_head); + TAILQ_INIT(&vnet_modpending_head); + + LIST_INIT(&vimage_head); + LIST_INIT(&vnet_head); + LIST_INIT(&vprocg_head); + LIST_INIT(&vcpu_head); + + mtx_init(&vnet_list_refc_mtx, "vnet_list_refc_mtx", NULL, MTX_DEF); + cv_init(&vnet_list_condvar, "vnet_list_condvar"); + + mtx_init(&vcpu_list_mtx, "vcpu_list_mtx", NULL, MTX_SPIN); + + vi_alloc(NULL, ""); /* Default vimage has no name; the return value is not checked */ + + /* We MUST clear curvnet in vi_init_done before going SMP. 
*/ + curvnet = LIST_FIRST(&vnet_head); +} + +static void +vi_init_done(void *unused) +{ + struct vnet_modlink *vml_iter; + + curvnet = NULL; + + if (TAILQ_EMPTY(&vnet_modpending_head)) + return; + + printf("vnet modules with unresolved dependencies:\n"); + TAILQ_FOREACH(vml_iter, &vnet_modpending_head, vml_mod_le) + printf(" %s depending on %d:\n", + vml_iter->vml_modinfo->vmi_name, + vml_iter->vml_modinfo->vmi_dependson); + panic("going nowhere without my vnet modules!"); +} + +SYSINIT(vimage, SI_SUB_VIMAGE, SI_ORDER_FIRST, vi_init, NULL) +SYSINIT(vimage_done, SI_SUB_VIMAGE_DONE, SI_ORDER_FIRST, vi_init_done, NULL) + +#ifdef VI_PREALLOC_SIZE +void * +vi_malloc(unsigned long size, struct malloc_type *type, int flags) +{ + void *addr; + struct vi_mtrack *vmt = NULL; + struct vi_mtrack *vmt_iter; + + /* Attempt to find a free chunk in our private pool */ + LIST_FOREACH(vmt_iter, &vi_mem_free_head, vmt_le) + if (vmt_iter->vmt_size >= size && + (vmt == NULL || vmt_iter->vmt_size < vmt->vmt_size)) { + vmt = vmt_iter; + /* Exact fit is an optimal choice, we are done. 
*/ + if (vmt_iter->vmt_size == size) + break; + } + + /* Not (enough) free space in our pool, resort to malloc() */ + if (vmt == NULL) { + if (vi_mpool_fail_cnt == 0) + printf("vi_mpool exhausted, " + "consider increasing VI_PREALLOC_SIZE\n"); + vi_mpool_fail_cnt++; + addr = malloc(size, type, flags); + return addr; + } + + addr = vmt->vmt_addr; + if (vmt->vmt_size == size) { + /* Move the descriptor from free to allocated list */ + LIST_REMOVE(vmt, vmt_le); + LIST_INSERT_HEAD(&vi_mem_alloc_head, vmt, vmt_le); + } else { + /* Shrink the existing free space block */ + vmt->vmt_addr += size; + vmt->vmt_size -= size; + + /* Create a new descriptor and place it on allocated list */ + vmt = uma_zalloc(vi_mtrack_zone, M_NOWAIT); /* XXX result is dereferenced below without a NULL check */ + vmt->vmt_addr = addr; + vmt->vmt_size = size; + LIST_INSERT_HEAD(&vi_mem_alloc_head, vmt, vmt_le); + } + + bzero(addr, size); /* pool chunks are zeroed whether or not M_ZERO was passed */ + return addr; +} + +void +vi_free(void *addr, struct malloc_type *type) +{ + struct vi_mtrack *vmt; + + /* Attempt to find the chunk in our allocated pool */ + LIST_FOREACH(vmt, &vi_mem_alloc_head, vmt_le) + if (vmt->vmt_addr == addr) + break; + + /* Not found in our private pool, resort to free() */ + if (vmt == NULL) { + free(addr, type); + return; + } + + /* Move the descriptor from allocated to free list */ + LIST_REMOVE(vmt, vmt_le); + LIST_INSERT_HEAD(&vi_mem_free_head, vmt, vmt_le); /* adjacent free chunks are not merged here */ +} +#endif /* VI_PREALLOC_SIZE */ + +#ifdef DDB +static void +db_vnet_ptr(void *arg) +{ + if (arg) + db_printf(" %p", arg); + else + db_printf(" 0"); +} + +DB_SHOW_COMMAND(vnets, db_show_vnets) +{ + db_printf(" vnet ifs socks"); + db_printf(" net inet inet6 ipsec netgraph\n"); + VNET_ITERLOOP_BEGIN_QUIET(); + db_printf("%p %3d %5d", + vnet_iter, vnet_iter->ifccnt, vnet_iter->sockcnt); + db_vnet_ptr(vnet_iter->mod_data[VNET_MOD_NET]); + db_vnet_ptr(vnet_iter->mod_data[VNET_MOD_INET]); + db_vnet_ptr(vnet_iter->mod_data[VNET_MOD_INET6]); + db_vnet_ptr(vnet_iter->mod_data[VNET_MOD_IPSEC]); + 
db_vnet_ptr(vnet_iter->mod_data[VNET_MOD_NETGRAPH]); + db_printf("\n"); + VNET_ITERLOOP_END(); +} +#endif --- /u/marko/p4/head/src/sys/kern/kern_xxx.c 2007-08-31 03:47:34.000000000 +0200 +++ src/sys/kern/kern_xxx.c 2007-10-22 18:06:31.000000000 +0200 @@ -33,6 +33,7 @@ __FBSDID("$FreeBSD: src/sys/kern/kern_xxx.c,v 1.49 2007/03/05 13:10:57 rwatson Exp $"); #include "opt_compat.h" +#include "opt_vimage.h" #include #include @@ -44,6 +45,7 @@ #include #include #include +#include #if defined(COMPAT_43) @@ -245,14 +247,15 @@ struct thread *td; struct getdomainname_args *uap; { + INIT_VPROCG(TD_TO_VPROCG(td)); int domainnamelen; int error; mtx_lock(&Giant); - domainnamelen = strlen(domainname) + 1; + domainnamelen = strlen(V_domainname) + 1; if ((u_int)uap->len > domainnamelen) uap->len = domainnamelen; - error = copyout(domainname, uap->domainname, uap->len); + error = copyout(V_domainname, uap->domainname, uap->len); mtx_unlock(&Giant); return (error); } @@ -269,19 +272,20 @@ struct thread *td; struct setdomainname_args *uap; { + INIT_VPROCG(TD_TO_VPROCG(td)); int error, domainnamelen; error = priv_check(td, PRIV_SETDOMAINNAME); if (error) return (error); mtx_lock(&Giant); - if ((u_int)uap->len > sizeof (domainname) - 1) { + if ((u_int)uap->len > sizeof (V_domainname) - 1) { error = EINVAL; goto done2; } domainnamelen = uap->len; - error = copyin(uap->domainname, domainname, uap->len); - domainname[domainnamelen] = 0; + error = copyin(uap->domainname, V_domainname, uap->len); + V_domainname[domainnamelen] = 0; done2: mtx_unlock(&Giant); return (error); --- /u/marko/p4/head/src/sys/kern/sched_4bsd.c 2007-12-27 19:31:57.000000000 +0100 +++ src/sys/kern/sched_4bsd.c 2008-01-14 19:23:38.000000000 +0100 @@ -36,6 +36,7 @@ __FBSDID("$FreeBSD: src/sys/kern/sched_4bsd.c,v 1.112 2007/12/15 23:13:31 jeff Exp $"); #include "opt_hwpmc_hooks.h" +#include "opt_vimage.h" #include #include @@ -52,6 +53,7 @@ #include #include #include +#include 
#include #include @@ -102,9 +104,11 @@ ((ts)->ts_runq != 0 && (ts)->ts_runq != &runq) static struct td_sched td_sched0; -struct mtx sched_lock; +static struct mtx sched_lock; +#ifndef VIMAGE static int sched_tdcnt; /* Total runnable threads in the system. */ +#endif static int sched_quantum; /* Roundrobin scheduling quantum in ticks. */ #define SCHED_QUANTUM (hz / 10) /* Default sched quantum */ @@ -227,18 +231,34 @@ #endif static __inline void -sched_load_add(void) +sched_load_add(struct thread *td) { - sched_tdcnt++; - CTR1(KTR_SCHED, "global load: %d", sched_tdcnt); + INIT_VPROCG(TD_TO_VPROCG(td)); + + V_sched_tdcnt++; + CTR1(KTR_SCHED, "global load: %d", V_sched_tdcnt); } static __inline void -sched_load_rem(void) +sched_load_rem(struct thread *td) { - sched_tdcnt--; - CTR1(KTR_SCHED, "global load: %d", sched_tdcnt); + INIT_VPROCG(TD_TO_VPROCG(td)); + + V_sched_tdcnt--; + CTR1(KTR_SCHED, "global load: %d", V_sched_tdcnt); } + +#ifdef VIMAGE +void +sched_load_reassign(struct vprocg *old, struct vprocg *new) +{ + mtx_lock_spin(&sched_lock); + old->_sched_tdcnt--; + new->_sched_tdcnt++; + mtx_unlock_spin(&sched_lock); +} +#endif + /* * Arrange to reschedule if necessary, taking the priorities and * schedulers into account. @@ -346,16 +366,26 @@ static void schedcpu(void) { - register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]); + register fixpt_t loadfac; struct thread *td; struct proc *p; struct td_sched *ts; int awake, realstathz; +#ifndef VIMAGE + loadfac = loadfactor(averunnable.ldavg[0]); +#endif realstathz = stathz ? 
stathz : hz; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_SLOCK(p); +#ifdef VIMAGE + if (p->p_ucred != NULL) { + INIT_VPROCG(P_TO_VPROCG(p)); + loadfac = loadfactor(V_averunnable.ldavg[0]); + } else + loadfac = 0; +#endif FOREACH_THREAD_IN_PROC(p, td) { awake = 0; thread_lock(td); @@ -462,12 +492,13 @@ static void updatepri(struct thread *td) { + INIT_VPROCG(TD_TO_VPROCG(td)); struct td_sched *ts; fixpt_t loadfac; unsigned int newcpu; ts = td->td_sched; - loadfac = loadfactor(averunnable.ldavg[0]); + loadfac = loadfactor(V_averunnable.ldavg[0]); if (ts->ts_slptime > 5 * loadfac) td->td_estcpu = 0; else { @@ -528,7 +559,7 @@ hogticks = 2 * sched_quantum; /* Account for thread0. */ - sched_load_add(); + sched_load_add(&thread0); } /* External interfaces start here */ @@ -631,7 +662,7 @@ thread_unlock(td); mtx_lock_spin(&sched_lock); if ((child->td_proc->p_flag & P_NOLOAD) == 0) - sched_load_rem(); + sched_load_rem(td); mtx_unlock_spin(&sched_lock); } @@ -825,7 +856,7 @@ } if ((p->p_flag & P_NOLOAD) == 0) - sched_load_rem(); + sched_load_rem(td); if (newtd) newtd->td_flags |= (td->td_flags & TDF_NEEDRESCHED); @@ -867,7 +898,7 @@ newtd->td_sched->ts_flags |= TSF_DIDRUN; TD_SET_RUNNING(newtd); if ((newtd->td_proc->p_flag & P_NOLOAD) == 0) - sched_load_add(); + sched_load_add(newtd); } else { newtd = choosethread(); } @@ -1124,7 +1155,7 @@ } if ((td->td_proc->p_flag & P_NOLOAD) == 0) - sched_load_add(); + sched_load_add(td); runq_add(ts->ts_runq, ts, flags); } #else /* SMP */ @@ -1169,7 +1200,7 @@ return; } if ((td->td_proc->p_flag & P_NOLOAD) == 0) - sched_load_add(); + sched_load_add(td); runq_add(ts->ts_runq, ts, flags); maybe_resched(td); } @@ -1191,7 +1222,7 @@ curthread->td_name); if ((td->td_proc->p_flag & P_NOLOAD) == 0) - sched_load_rem(); + sched_load_rem(td); runq_remove(ts->ts_runq, ts); TD_SET_CAN_RUN(td); } @@ -1308,9 +1339,13 @@ } int +#ifdef VIMAGE +sched_load(struct vprocg *vprocg) +#else sched_load(void) +#endif { - return (sched_tdcnt); + 
return (V_sched_tdcnt); } int --- /u/marko/p4/head/src/sys/kern/sched_ule.c 2008-01-28 23:53:49.000000000 +0100 +++ src/sys/kern/sched_ule.c 2008-02-27 11:47:28.000000000 +0100 @@ -40,6 +40,7 @@ #include "opt_hwpmc_hooks.h" #include "opt_sched.h" +#include "opt_vimage.h" #include #include @@ -59,6 +60,7 @@ #include #include #include +#include #ifdef KTRACE #include #include @@ -286,8 +288,13 @@ /* Operations on per processor queues */ static struct td_sched * tdq_choose(struct tdq *); static void tdq_setup(struct tdq *); +#ifndef VIMAGE static void tdq_load_add(struct tdq *, struct td_sched *); static void tdq_load_rem(struct tdq *, struct td_sched *); +#else +static void tdq_load_add(struct tdq *, struct td_sched *, struct vprocg *); +static void tdq_load_rem(struct tdq *, struct td_sched *, struct vprocg *); +#endif static __inline void tdq_runq_add(struct tdq *, struct td_sched *, int); static __inline void tdq_runq_rem(struct tdq *, struct td_sched *); void tdq_print(int cpu); @@ -459,7 +466,11 @@ * for this thread to the referenced thread queue. */ static void +#ifndef VIMAGE tdq_load_add(struct tdq *tdq, struct td_sched *ts) +#else +tdq_load_add(struct tdq *tdq, struct td_sched *ts, struct vprocg *vprocg) +#endif { int class; @@ -469,12 +480,19 @@ tdq->tdq_load++; CTR2(KTR_SCHED, "cpu %d load: %d", TDQ_ID(tdq), tdq->tdq_load); if (class != PRI_ITHD && - (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0) + (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0) { #ifdef SMP tdq->tdq_group->tdg_load++; -#else +#ifdef VIMAGE + V_tdq_sysload[TDG_ID(tdq->tdq_group)]++; +#endif +#else /* !SMP */ tdq->tdq_sysload++; +#ifdef VIMAGE + V_tdq_sysload[0]++; #endif +#endif /* SMP */ + } } /* @@ -482,7 +500,11 @@ * exiting. 
*/ static void +#ifndef VIMAGE tdq_load_rem(struct tdq *tdq, struct td_sched *ts) +#else +tdq_load_rem(struct tdq *tdq, struct td_sched *ts, struct vprocg *vprocg) +#endif { int class; @@ -490,12 +512,19 @@ TDQ_LOCK_ASSERT(tdq, MA_OWNED); class = PRI_BASE(ts->ts_thread->td_pri_class); if (class != PRI_ITHD && - (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0) + (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0) { #ifdef SMP tdq->tdq_group->tdg_load--; -#else +#ifdef VIMAGE + V_tdq_sysload[TDG_ID(tdq->tdq_group)]--; +#endif +#else /* !SMP */ tdq->tdq_sysload--; +#ifdef VIMAGE + V_tdq_sysload[0]--; #endif +#endif /* SMP */ + } KASSERT(tdq->tdq_load != 0, ("tdq_load_rem: Removing with 0 load on queue %d", TDQ_ID(tdq))); tdq->tdq_load--; @@ -1330,7 +1359,11 @@ /* Add thread0's load since it's running. */ TDQ_LOCK(tdq); thread0.td_lock = TDQ_LOCKPTR(TDQ_SELF()); +#ifndef VIMAGE tdq_load_add(tdq, &td_sched0); +#else + tdq_load_add(tdq, &td_sched0, TD_TO_VPROCG(&thread0)); +#endif TDQ_UNLOCK(tdq); } @@ -1859,7 +1892,11 @@ TD_SET_CAN_RUN(td); } else if (TD_IS_RUNNING(td)) { MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); +#ifndef VIMAGE tdq_load_rem(tdq, ts); +#else + tdq_load_rem(tdq, ts, TD_TO_VPROCG(td)); +#endif srqflag = (flags & SW_PREEMPT) ? SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED : SRQ_OURSELF|SRQ_YIELDING; @@ -1871,7 +1908,11 @@ /* This thread must be going to sleep. 
*/ TDQ_LOCK(tdq); mtx = thread_block_switch(td); +#ifndef VIMAGE tdq_load_rem(tdq, ts); +#else + tdq_load_rem(tdq, ts, TD_TO_VPROCG(td)); +#endif } /* * We enter here with the thread blocked and assigned to the @@ -2379,7 +2420,11 @@ tdq->tdq_lowpri = td->td_priority; #endif tdq_runq_add(tdq, ts, flags); +#ifndef VIMAGE tdq_load_add(tdq, ts); +#else + tdq_load_add(tdq, ts, TD_TO_VPROCG(td)); +#endif } /* @@ -2460,7 +2505,11 @@ KASSERT(TD_ON_RUNQ(td), ("sched_rem: thread not on run queue")); tdq_runq_rem(tdq, ts); +#ifndef VIMAGE tdq_load_rem(tdq, ts); +#else + tdq_load_rem(tdq, ts, TD_TO_VPROCG(td)); +#endif TD_SET_CAN_RUN(td); } @@ -2556,7 +2605,11 @@ * Return the total system load. */ int +#ifdef VIMAGE +sched_load(struct vprocg *vprocg) +#else sched_load(void) +#endif { #ifdef SMP int total; @@ -2564,13 +2617,40 @@ total = 0; for (i = 0; i <= tdg_maxid; i++) +#ifndef VIMAGE total += TDQ_GROUP(i)->tdg_load; - return (total); #else + total += V_tdq_sysload[i]; +#endif + return (total); +#else /* !SMP */ +#ifndef VIMAGE return (TDQ_SELF()->tdq_sysload); +#else + return (V_tdq_sysload[0]); #endif +#endif /* SMP */ } +#ifdef VIMAGE +void +sched_load_reassign(struct vprocg *old, struct vprocg *new) +{ +#ifdef SMP + int tdg_id; + + critical_enter(); + tdg_id = TDG_ID(tdq_cpu[curcpu].tdq_group); + old->_tdq_sysload[tdg_id]--; + new->_tdq_sysload[tdg_id]++; + critical_exit(); +#else + old->_tdq_sysload[0]--; + new->_tdq_sysload[0]++; +#endif +} +#endif + int sched_sizeof_proc(void) { @@ -2622,7 +2702,11 @@ spinlock_exit(); } else { MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); +#ifndef VIMAGE tdq_load_rem(tdq, td->td_sched); +#else + tdq_load_rem(tdq, td->td_sched, TD_TO_VPROCG(td)); +#endif lock_profile_release_lock(&TDQ_LOCKPTR(tdq)->lock_object); } KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count")); --- /u/marko/p4/head/src/sys/kern/tty.c 2008-01-15 18:00:10.000000000 +0100 +++ src/sys/kern/tty.c 2008-02-27 11:47:51.000000000 +0100 @@ -75,6 +75,7 @@ 
#include "opt_compat.h" #include "opt_tty.h" +#include "opt_vimage.h" #include #include @@ -104,6 +105,7 @@ #include #include #include +#include #include @@ -1141,6 +1143,7 @@ if (t == tp->t_line) return (0); s = spltty(); + CURVNET_SET(TD_TO_VNET(curthread)); ttyld_close(tp, flag); tp->t_line = t; /* XXX: we should use the correct cdev here */ @@ -1156,6 +1159,7 @@ tp->t_line = TTYDISC; (void)ttyld_open(tp, tp->t_dev); } + CURVNET_RESTORE(); splx(s); return (error); break; @@ -2530,6 +2534,7 @@ void ttyinfo(struct tty *tp) { + INIT_VPROCG(TD_TO_VPROCG(curthread)); struct timeval utime, stime; struct proc *p, *pick; struct thread *td, *picktd; @@ -2544,7 +2549,7 @@ return; /* Print load average. */ - load = (averunnable.ldavg[0] * 100 + FSCALE / 2) >> FSHIFT; + load = (V_averunnable.ldavg[0] * 100 + FSCALE / 2) >> FSHIFT; ttyprintf(tp, "load: %d.%02d ", load / 100, load % 100); /* @@ -3215,7 +3220,9 @@ goto out; goto open_top; } + CURVNET_SET(TD_TO_VNET(curthread)); error = ttyld_open(tp, dev); + CURVNET_RESTORE(); ttyldoptim(tp); if (tp->t_state & TS_ISOPEN && ISCALLOUT(dev)) tp->t_actout = TRUE; @@ -3232,7 +3239,9 @@ struct tty *tp; tp = dev->si_tty; + CURVNET_SET(TD_TO_VNET(curthread)); ttyld_close(tp, flag); + CURVNET_RESTORE(); ttyldoptim(tp); tt_close(tp); tp->t_do_timestamp = 0; --- /u/marko/p4/head/src/sys/kern/subr_pcpu.c 2007-11-14 19:35:22.000000000 +0100 +++ src/sys/kern/subr_pcpu.c 2007-12-10 11:26:05.000000000 +0100 @@ -46,6 +46,7 @@ __FBSDID("$FreeBSD: src/sys/kern/subr_pcpu.c,v 1.10 2007/11/14 06:21:23 julian Exp $"); #include "opt_ddb.h" +#include "opt_vimage.h" #include #include @@ -132,6 +133,10 @@ db_printf("none\n"); db_show_mdpcpu(pc); +#ifdef VIMAGE + db_printf("curvnet = %p\n", pc->pc_curthread->td_vnet); +#endif + #ifdef WITNESS db_printf("spin locks held:\n"); witness_list_locks(&pc->pc_spinlocks); --- /u/marko/p4/head/src/sys/kern/sys_socket.c 2008-01-15 18:00:10.000000000 +0100 +++ src/sys/kern/sys_socket.c 2008-02-27 11:47:39.000000000 
+0100 @@ -33,6 +33,7 @@ __FBSDID("$FreeBSD: src/sys/kern/sys_socket.c,v 1.75 2008/01/07 20:05:18 jhb Exp $"); #include "opt_mac.h" +#include "opt_vimage.h" #include #include @@ -50,7 +51,9 @@ #include #include #include +#include +#include #include #include @@ -74,16 +77,19 @@ int flags, struct thread *td) { struct socket *so = fp->f_data; -#ifdef MAC int error; +#ifdef MAC SOCK_LOCK(so); error = mac_socket_check_receive(active_cred, so); SOCK_UNLOCK(so); if (error) return (error); #endif - return (soreceive(so, 0, uio, 0, 0, 0)); + CURVNET_SET(so->so_vnet); + error = soreceive(so, 0, uio, 0, 0, 0); + CURVNET_RESTORE(); + return (error); } /* ARGSUSED */ @@ -125,6 +131,7 @@ struct socket *so = fp->f_data; int error = 0; + CURVNET_SET(so->so_vnet); switch (cmd) { case FIONBIO: SOCK_LOCK(so); @@ -205,6 +212,7 @@ (so, cmd, data, 0, td)); break; } + CURVNET_RESTORE(); return (error); } @@ -279,7 +287,8 @@ fp->f_ops = &badfileops; fp->f_data = NULL; - if (so) + if (so) { error = soclose(so); + } return (error); } --- /u/marko/p4/head/src/sys/kern/uipc_domain.c 2007-08-31 03:47:38.000000000 +0200 +++ src/sys/kern/uipc_domain.c 2007-10-22 18:06:33.000000000 +0200 @@ -29,6 +29,8 @@ * @(#)uipc_domain.c 8.2 (Berkeley) 10/18/93 */ +#include "opt_vimage.h" + #include __FBSDID("$FreeBSD: src/sys/kern/uipc_domain.c,v 1.51 2007/08/06 14:26:00 rwatson Exp $"); @@ -43,6 +45,7 @@ #include #include #include +#include #include /* @@ -64,6 +67,11 @@ SYSINIT(domainfin, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST, domainfinalize, NULL) +static vnet_attach_fn net_init_domain; +#ifdef VIMAGE +static vnet_detach_fn net_detach_domain; +#endif + static struct callout pffast_callout; static struct callout pfslow_callout; @@ -100,6 +108,9 @@ .pru_sopoll = pru_sopoll_notsupp, }; +VNET_MOD_DECLARE_STATELESS(DOMAIN, domain, net_init_domain, net_detach_domain, + NET) + static void protosw_init(struct protosw *pr) { @@ -128,13 +139,12 @@ } /* - * Add a new protocol domain to the list of supported 
domains - * Note: you cant unload it again because a socket may be using it. - * XXX can't fail at this time. + * Initialize a domain instance. */ -static void -net_init_domain(struct domain *dp) +static int +net_init_domain(const void *arg) { + const struct domain *dp = arg; struct protosw *pr; if (dp->dom_init) @@ -148,8 +158,29 @@ max_datalen = MHLEN - max_hdr; if (max_datalen < 1) panic("%s: max_datalen < 1", __func__); + return 0; } +#ifdef VIMAGE +/* + * Detach / free a domain instance. + */ +static int +net_detach_domain(const void *arg) +{ + const struct domain *dp = arg; + struct protosw *pr; + + if (dp->dom_destroy) + (*dp->dom_destroy)(); + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) + if (pr->pr_destroy) + (*pr->pr_destroy)(); + + return 0; +} +#endif + /* * Add a new protocol domain to the list of supported domains * Note: you cant unload it again because a socket may be using it. @@ -183,7 +214,11 @@ "domainfinalize()\n", dp->dom_name); #endif mtx_unlock(&dom_mtx); +#ifdef VIMAGE + vnet_mod_register_multi(&vnet_domain_modinfo, dp, dp->dom_name); +#else net_init_domain(dp); +#endif } static void --- /u/marko/p4/head/src/sys/kern/uipc_socket.c 2008-02-27 18:28:53.000000000 +0100 +++ src/sys/kern/uipc_socket.c 2008-02-27 17:58:38.000000000 +0100 @@ -101,6 +101,7 @@ #include "opt_mac.h" #include "opt_zero.h" #include "opt_compat.h" +#include "opt_vimage.h" #include #include @@ -128,6 +129,9 @@ #include #include #include +#include + +#include #include @@ -259,7 +263,7 @@ * soalloc() returns a socket with a ref count of 0. */ static struct socket * -soalloc(void) +soalloc(struct vnet *vnet) { struct socket *so; @@ -280,6 +284,10 @@ mtx_lock(&so_global_mtx); so->so_gencnt = ++so_gencnt; ++numopensockets; +#ifdef VIMAGE + so->so_vnet = vnet; + vnet->sockcnt++; +#endif mtx_unlock(&so_global_mtx); return (so); } @@ -299,6 +307,9 @@ mtx_lock(&so_global_mtx); so->so_gencnt = ++so_gencnt; --numopensockets; /* Could be below, but faster here. 
*/ +#ifdef VIMAGE + so->so_vnet->sockcnt--; +#endif mtx_unlock(&so_global_mtx); if (so->so_rcv.sb_hiwat) (void)chgsbsize(so->so_cred->cr_uidinfo, @@ -352,7 +363,11 @@ if (prp->pr_type != type) return (EPROTOTYPE); - so = soalloc(); +#ifdef VIMAGE + so = soalloc(TD_TO_VNET(td)); +#else + so = soalloc(NULL); +#endif if (so == NULL) return (ENOBUFS); @@ -373,7 +388,9 @@ * Auto-sizing of socket buffers is managed by the protocols and * the appropriate flags must be set in the pru_attach function. */ + CURVNET_SET(so->so_vnet); error = (*prp->pr_usrreqs->pru_attach)(so, proto, td); + CURVNET_RESTORE(); if (error) { KASSERT(so->so_count == 1, ("socreate: so_count %d", so->so_count)); @@ -415,7 +432,12 @@ if (over) #endif return (NULL); - so = soalloc(); +#ifdef VIMAGE + VNET_ASSERT(head->so_vnet); + so = soalloc(head->so_vnet); +#else + so = soalloc(NULL); +#endif if (so == NULL) return (NULL); if ((head->so_options & SO_ACCEPTFILTER) != 0) @@ -487,8 +509,12 @@ int sobind(struct socket *so, struct sockaddr *nam, struct thread *td) { + int error; - return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td)); + CURVNET_SET(so->so_vnet); + error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td); + CURVNET_RESTORE(); + return error; } /* @@ -636,6 +662,7 @@ KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter")); + CURVNET_SET(so->so_vnet); funsetown(&so->so_sigio); if (so->so_state & SS_ISCONNECTED) { if ((so->so_state & SS_ISDISCONNECTING) == 0) { @@ -687,6 +714,7 @@ KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF")); so->so_state |= SS_NOFDREF; sorele(so); + CURVNET_RESTORE(); return (error); } @@ -762,7 +790,9 @@ * biting us. 
*/ so->so_error = 0; + CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td); + CURVNET_RESTORE(); } return (error); @@ -1278,13 +1308,17 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { + int error; /* XXXRW: Temporary debugging. */ KASSERT(so->so_proto->pr_usrreqs->pru_sosend != sosend, ("sosend: protocol calls sosend")); - return (so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top, - control, flags, td)); + CURVNET_SET(so->so_vnet); + error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top, + control, flags, td); + CURVNET_RESTORE(); + return (error); } /* @@ -1864,8 +1898,13 @@ if (how != SHUT_WR) sorflush(so); - if (how != SHUT_RD) - return ((*pr->pr_usrreqs->pru_shutdown)(so)); + if (how != SHUT_RD) { + int error; + CURVNET_SET(so->so_vnet); + error = (*pr->pr_usrreqs->pru_shutdown)(so); + CURVNET_RESTORE(); + return (error); + } return (0); } @@ -1889,6 +1928,7 @@ * socket buffer. Don't let our acquire be interrupted by a signal * despite any existing socket disposition on interruptable waiting. 
*/ + CURVNET_SET(so->so_vnet); socantrcvmore(so); (void) sblock(sb, SBL_WAIT | SBL_NOINTR); @@ -1912,6 +1952,7 @@ if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) (*pr->pr_domain->dom_dispose)(asb.sb_mb); sbrelease_internal(&asb, so); + CURVNET_RESTORE(); } /* @@ -1978,8 +2019,7 @@ error = 0; if (sopt->sopt_level != SOL_SOCKET) { if (so->so_proto && so->so_proto->pr_ctloutput) - return ((*so->so_proto->pr_ctloutput) - (so, sopt)); + return ((*so->so_proto->pr_ctloutput) (so, sopt)); error = ENOPROTOOPT; } else { switch (sopt->sopt_name) { --- /u/marko/p4/head/src/sys/kern/uipc_syscalls.c 2008-02-27 18:28:54.000000000 +0100 +++ src/sys/kern/uipc_syscalls.c 2008-02-27 11:48:02.000000000 +0100 @@ -39,6 +39,7 @@ #include "opt_compat.h" #include "opt_ktrace.h" #include "opt_mac.h" +#include "opt_vimage.h" #include #include @@ -64,6 +65,7 @@ #include #include #include +#include #ifdef KTRACE #include #endif @@ -264,7 +266,9 @@ if (error) goto done; #endif + CURVNET_SET(so->so_vnet); error = solisten(so, uap->backlog, td); + CURVNET_RESTORE(); #ifdef MAC done: #endif @@ -429,7 +433,9 @@ tmp = fflag & FASYNC; (void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td); sa = 0; + CURVNET_SET(so->so_vnet); error = soaccept(so, &sa); + CURVNET_RESTORE(); if (error) { /* * return a namelen of zero for older code which might @@ -977,9 +983,11 @@ ktruio = cloneuio(&auio); #endif len = auio.uio_resid; + CURVNET_SET(so->so_vnet); error = soreceive(so, &fromsa, &auio, (struct mbuf **)0, (mp->msg_control || controlp) ? 
&control : (struct mbuf **)0, &mp->msg_flags); + CURVNET_RESTORE(); if (error) { if (auio.uio_resid != (int)len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) @@ -1323,7 +1331,9 @@ error = getsock(td->td_proc->p_fd, s, &fp, NULL); if (error == 0) { so = fp->f_data; + CURVNET_SET(so->so_vnet); error = sosetopt(so, &sopt); + CURVNET_RESTORE(); fdrop(fp, td); } return(error); @@ -1401,7 +1411,9 @@ error = getsock(td->td_proc->p_fd, s, &fp, NULL); if (error == 0) { so = fp->f_data; + CURVNET_SET(so->so_vnet); error = sogetopt(so, &sopt); + CURVNET_RESTORE(); *valsize = sopt.sopt_valsize; fdrop(fp, td); } @@ -1464,7 +1476,9 @@ return (error); so = fp->f_data; *sa = NULL; + CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa); + CURVNET_RESTORE(); if (error) goto bad; if (*sa == NULL) @@ -1564,8 +1578,11 @@ error = ENOTCONN; goto done; } + *sa = NULL; + CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa); + CURVNET_RESTORE(); if (error) goto bad; if (*sa == NULL) @@ -2184,9 +2201,11 @@ goto done; } SOCKBUF_UNLOCK(&so->so_snd); + CURVNET_SET(so->so_vnet); /* Avoid error aliasing. 
*/ err = (*so->so_proto->pr_usrreqs->pru_send) (so, 0, m, NULL, NULL, td); + CURVNET_RESTORE(); if (err == 0) { /* * We need two counters to get the --- /u/marko/p4/head/src/sys/kern/uipc_usrreq.c 2008-01-28 23:53:50.000000000 +0100 +++ src/sys/kern/uipc_usrreq.c 2008-02-27 11:48:03.000000000 +0100 @@ -60,6 +60,7 @@ #include "opt_ddb.h" #include "opt_mac.h" +#include "opt_vimage.h" #include #include @@ -90,6 +91,7 @@ #include #include #include +#include #ifdef DDB #include @@ -1648,6 +1650,10 @@ unp_init(void) { +#ifdef VIMAGE + if (!IS_DEFAULT_VNET(curvnet)) + return; +#endif unp_zone = uma_zcreate("unpcb", sizeof(struct unpcb), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); if (unp_zone == NULL) --- /u/marko/p4/head/src/sys/kern/vfs_export.c 2007-08-31 03:47:39.000000000 +0200 +++ src/sys/kern/vfs_export.c 2007-10-22 18:06:34.000000000 +0200 @@ -37,6 +37,8 @@ #include __FBSDID("$FreeBSD: src/sys/kern/vfs_export.c,v 1.341 2007/02/15 22:08:35 pjd Exp $"); +#include "opt_vimage.h" + #include #include #include @@ -50,6 +52,7 @@ #include #include #include +#include #include @@ -135,6 +138,7 @@ } #endif + CURVNET_SET(TD_TO_VNET(curthread)); /* XXX MARKO */ i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen; np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK | M_ZERO); saddr = (struct sockaddr *) (np + 1); @@ -191,8 +195,10 @@ bcopy(argp->ex_anon.cr_groups, np->netc_anon.cr_groups, sizeof(np->netc_anon.cr_groups)); refcount_init(&np->netc_anon.cr_ref, 1); + CURVNET_RESTORE(); return (0); out: + CURVNET_RESTORE(); free(np, M_NETADDR); return (error); } --- /u/marko/p4/head/src/sys/kern/vfs_lookup.c 2008-02-27 18:28:54.000000000 +0100 +++ src/sys/kern/vfs_lookup.c 2008-02-27 11:48:13.000000000 +0100 @@ -40,6 +40,7 @@ #include "opt_ktrace.h" #include "opt_mac.h" #include "opt_vfs.h" +#include "opt_vimage.h" #include #include @@ -53,6 +54,7 @@ #include #include #include +#include #ifdef KTRACE #include #endif @@ -65,6 +67,15 @@ #define NAMEI_DIAGNOSTIC 1 #undef 
NAMEI_DIAGNOSTIC +#ifdef VIMAGE +#define IMUNES_SYMLINK_HACK +#endif + +#ifdef IMUNES_SYMLINK_HACK +SYSCTL_V_INT(V_PROCG, vprocg, _vfs, OID_AUTO, morphing_symlinks, CTLFLAG_RW, + morphing_symlinks, 0, "Resolve @ to vimage name in symlinks"); +#endif + /* * Allocation zone for namei */ @@ -129,6 +140,9 @@ struct thread *td = cnp->cn_thread; struct proc *p = td->td_proc; int vfslocked; +#ifdef IMUNES_SYMLINK_HACK + INIT_VPROCG(TD_TO_VPROCG(td)); +#endif KASSERT((cnp->cn_flags & MPSAFE) != 0 || mtx_owned(&Giant) != 0, ("NOT MPSAFE and Giant not held")); @@ -284,6 +298,25 @@ error = ENOENT; break; } +#ifdef IMUNES_SYMLINK_HACK + if (V_morphing_symlinks) { + char *sp = strchr(cp, '@'); + int vnamelen = strlen(TD_TO_VIMAGE(td)->vi_name); + + if (sp) { + if (vnamelen >= auio.uio_resid) { + if (ndp->ni_pathlen > 1) + uma_zfree(namei_zone, cp); + error = ENAMETOOLONG; + break; + } + bcopy(sp + 1, sp + vnamelen, + linklen - (sp - cp)); + bcopy(TD_TO_VIMAGE(td)->vi_name, sp, vnamelen); + linklen += (vnamelen - 1); + } + } +#endif if (linklen + ndp->ni_pathlen >= MAXPATHLEN) { if (ndp->ni_pathlen > 1) uma_zfree(namei_zone, cp); --- /u/marko/p4/head/src/sys/modules/Makefile 2008-02-27 18:28:58.000000000 +0100 +++ src/sys/modules/Makefile 2008-02-27 11:48:22.000000000 +0100 @@ -428,9 +428,6 @@ _tmpfs= tmpfs _wi= wi _xe= xe -.if ${MK_ZFS} != "no" || defined(ALL_MODULES) -_zfs= zfs -.endif .if ${MACHINE} == "i386" _aac= aac _acpi= acpi --- /u/marko/p4/head/src/sys/modules/netgraph/Makefile 2007-08-31 03:47:44.000000000 +0200 +++ src/sys/modules/netgraph/Makefile 2007-10-22 18:06:35.000000000 +0200 @@ -34,6 +34,7 @@ netflow \ netgraph \ one2many \ + pipe \ ppp \ pppoe \ pptpgre \ @@ -51,7 +52,8 @@ tty \ UI \ vjc \ - vlan + vlan \ + ${_wormhole} .if ${MACHINE_ARCH} == "i386" _sync_ar= sync_ar @@ -66,4 +68,9 @@ _mppc= mppc .endif +VIMAGE!= grep VIMAGE ${KERNBUILDDIR}/opt_vimage.h | cut -d" " -f3 || true +.if ${VIMAGE} == 1 +_wormhole= wormhole +.endif + .include --- /dev/null 
2008-02-27 21:11:00.000000000 +0100 +++ src/sys/modules/netgraph/pipe/Makefile 2007-10-05 12:26:44.000000000 +0200 @@ -0,0 +1,6 @@ +# $FreeBSD: $ + +KMOD= ng_pipe +SRCS= ng_pipe.c + +.include --- /dev/null 2008-02-27 21:11:00.000000000 +0100 +++ src/sys/modules/netgraph/wormhole/Makefile 2007-10-22 18:06:35.000000000 +0200 @@ -0,0 +1,6 @@ +# $FreeBSD: $ + +KMOD= ng_wormhole +SRCS= ng_wormhole.c opt_vimage.h + +.include --- /u/marko/p4/head/src/sys/net/bpf.c 2008-02-03 08:16:01.000000000 +0100 +++ src/sys/net/bpf.c 2008-02-27 11:48:34.000000000 +0100 @@ -40,6 +40,7 @@ #include "opt_bpf.h" #include "opt_mac.h" #include "opt_netgraph.h" +#include "opt_vimage.h" #include #include @@ -61,9 +62,11 @@ #include #include #include +#include #include +#include #include #include #ifdef BPF_JITTER @@ -444,8 +447,11 @@ BPFD_UNLOCK(d); funsetown(&d->bd_sigio); mtx_lock(&bpf_mtx); - if (d->bd_bif) + if (d->bd_bif) { + CURVNET_SET(d->bd_bif->bif_ifp->if_vnet); bpf_detachd(d); + CURVNET_RESTORE(); + } mtx_unlock(&bpf_mtx); selwakeuppri(&d->bd_sel, PRINET); #ifdef MAC @@ -666,7 +672,9 @@ BPFD_UNLOCK(d); #endif + CURVNET_SET(ifp->if_vnet); error = (*ifp->if_output)(ifp, m, &dst, NULL); + CURVNET_RESTORE(); if (mc != NULL) { if (error == 0) @@ -763,6 +771,7 @@ return (EPERM); } } + CURVNET_SET(TD_TO_VNET(td)); switch (cmd) { default: @@ -1056,6 +1065,7 @@ *(u_int *)addr = d->bd_sig; break; } + CURVNET_RESTORE(); return (error); } @@ -1150,9 +1160,33 @@ struct bpf_if *bp; struct ifnet *theywant; +#define IMUNES_BPF_HACK +#if defined(VIMAGE) && defined(IMUNES_BPF_HACK) + struct vnet *target_vnet = curvnet; + char *c; + + /* Hack to support tapping in foreign vnets */ + c = rindex(ifr->ifr_name, '@'); + if ( c != NULL ) { +printf("bpf_setif: %s\n", c); + struct vimage *target_vimage; + + *c++ = 0; + target_vimage = vimage_by_name(TD_TO_VIMAGE(curthread), c); + if (target_vimage == NULL) + return ENXIO; + target_vnet = target_vimage->v_net; + } + CURVNET_SET_QUIET(target_vnet); +#endif + 
theywant = ifunit(ifr->ifr_name); - if (theywant == NULL || theywant->if_bpf == NULL) + if (theywant == NULL || theywant->if_bpf == NULL) { +#if defined(VIMAGE) && defined(IMUNES_BPF_HACK) + CURVNET_RESTORE(); +#endif return (ENXIO); + } bp = theywant->if_bpf; /* @@ -1174,6 +1208,9 @@ BPFD_LOCK(d); reset_d(d); BPFD_UNLOCK(d); +#if defined(VIMAGE) && defined(IMUNES_BPF_HACK) + CURVNET_RESTORE(); +#endif return (0); } --- /u/marko/p4/head/src/sys/net/if.c 2007-10-29 17:17:42.000000000 +0100 +++ src/sys/net/if.c 2007-12-10 11:26:08.000000000 +0100 @@ -35,6 +35,7 @@ #include "opt_inet.h" #include "opt_mac.h" #include "opt_carp.h" +#include "opt_vimage.h" #include #include @@ -56,8 +57,11 @@ #include #include #include +#include + #include +#include #include #include #include @@ -110,7 +114,6 @@ static void if_purgemaddrs(struct ifnet *); static int ifconf(u_long, caddr_t); static void if_freemulti(struct ifmultiaddr *); -static void if_grow(void); static void if_init(void *); static void if_check(void *); static void if_qflush(struct ifaltq *); @@ -134,17 +137,24 @@ extern void nd6_setmtu(struct ifnet *); #endif -int if_index = 0; -struct ifindex_entry *ifindex_table = NULL; +static int vnet_net_iattach(const void *); +#ifdef VIMAGE +static int vnet_net_idetach(const void *); +#endif + int ifqmaxlen = IFQ_MAXLEN; -struct ifnethead ifnet; /* depend on static init XXX */ -struct ifgrouphead ifg_head; struct mtx ifnet_lock; static if_com_alloc_t *if_com_alloc[256]; static if_com_free_t *if_com_free[256]; +#ifndef VIMAGE +int if_index = 0; +struct ifindex_entry *ifindex_table = NULL; +struct ifnethead ifnet; /* depend on static init XXX */ +struct ifgrouphead ifg_head; -static int if_indexlim = 8; +static int if_indexlim; static struct knlist ifklist; +#endif /* !VIMAGE */ static void filt_netdetach(struct knote *kn); static int filt_netdev(struct knote *kn, long hint); @@ -152,6 +162,19 @@ static struct filterops netdev_filtops = { 1, NULL, filt_netdetach, filt_netdev }; 
+#ifdef VIMAGE +static struct vnet_symmap vnet_net_symmap[] = { + VNET_SYMMAP(net, ifnet), + VNET_SYMMAP(net, rt_tables), + VNET_SYMMAP(net, rtstat), + VNET_SYMMAP(net, rttrash), + VNET_SYMMAP_END +}; +#endif + +VNET_MOD_DECLARE(NET, net, vnet_net_iattach, vnet_net_idetach, + NONE, vnet_net_symmap) + /* * System initialization */ @@ -192,6 +215,7 @@ static int netioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, struct thread *td) { + INIT_VNET_NET(curvnet); struct ifnet *ifp; int error, idx; @@ -225,6 +249,7 @@ static int netkqfilter(struct cdev *dev, struct knote *kn) { + INIT_VNET_NET(curvnet); struct knlist *klist; struct ifnet *ifp; int idx; @@ -239,7 +264,7 @@ idx = minor(dev); if (idx == 0) { - klist = &ifklist; + klist = &V_ifklist; } else { ifp = ifnet_byindex(idx); if (ifp == NULL) @@ -294,43 +319,91 @@ static void if_init(void *dummy __unused) { - +#ifdef VIMAGE + vnet_mod_register(&vnet_net_modinfo); +#else + vnet_net_iattach(NULL); +#endif IFNET_LOCK_INIT(); - TAILQ_INIT(&ifnet); - TAILQ_INIT(&ifg_head); - knlist_init(&ifklist, NULL, NULL, NULL, NULL); - if_grow(); /* create initial table */ +#ifdef VIMAGE + if (IS_DEFAULT_VNET(curvnet)) { + INIT_VNET_NET(curvnet); +#endif ifdev_byindex(0) = make_dev(&net_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "network"); +#ifdef VIMAGE + } +#endif if_clone_init(); } -static void +static int +vnet_net_iattach(unused) + const void *unused; +{ + INIT_VNET_NET(curvnet); + + TAILQ_INIT(&V_ifnet); + TAILQ_INIT(&V_ifg_head); + knlist_init(&V_ifklist, NULL, NULL, NULL, NULL); + V_if_indexlim = 8; + if_grow(); /* create initial table */ + + return 0; +} + +#ifdef VIMAGE +static int +vnet_net_idetach(unused) + const void *unused; +{ + INIT_VNET_NET(curvnet); + + VNET_ASSERT(TAILQ_EMPTY(&V_ifnet)); +#ifdef NOTYET + VNET_ASSERT(TAILQ_EMPTY(&V_ifg_head)); +#endif + VNET_ASSERT(SLIST_EMPTY(&V_ifklist.kl_list)); + + free((caddr_t)V_ifindex_table, M_IFNET); + + return 0; +} +#endif + +void if_grow(void) { + 
INIT_VNET_NET(curvnet); u_int n; struct ifindex_entry *e; - if_indexlim <<= 1; - n = if_indexlim * sizeof(*e); + V_if_indexlim <<= 1; + n = V_if_indexlim * sizeof(*e); e = malloc(n, M_IFNET, M_WAITOK | M_ZERO); - if (ifindex_table != NULL) { - memcpy((caddr_t)e, (caddr_t)ifindex_table, n/2); - free((caddr_t)ifindex_table, M_IFNET); + if (V_ifindex_table != NULL) { + memcpy((caddr_t)e, (caddr_t)V_ifindex_table, n/2); + free((caddr_t)V_ifindex_table, M_IFNET); } - ifindex_table = e; + V_ifindex_table = e; } /* ARGSUSED*/ static void if_check(void *dummy __unused) { + INIT_VNET_NET(curvnet); struct ifnet *ifp; int s; +#ifdef VIMAGE + if (!IS_DEFAULT_VNET(curvnet)) + panic("if_check() called for a non-default vimage!?!"); +#endif + s = splimp(); IFNET_RLOCK(); /* could sleep on rare error; mostly okay XXX */ - TAILQ_FOREACH(ifp, &ifnet, if_link) { + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (ifp->if_snd.ifq_maxlen == 0) { if_printf(ifp, "XXX: driver didn't set ifq_maxlen\n"); ifp->if_snd.ifq_maxlen = ifqmaxlen; @@ -344,7 +417,8 @@ } IFNET_RUNLOCK(); splx(s); - if_slowtimo(0); + + timeout(if_slowtimo, (void *)0, hz / IFNET_SLOWHZ); } /* @@ -355,6 +429,7 @@ struct ifnet* if_alloc(u_char type) { + INIT_VNET_NET(curvnet); struct ifnet *ifp; ifp = malloc(sizeof(struct ifnet), M_IFNET, M_WAITOK|M_ZERO); @@ -365,7 +440,7 @@ * * XXX: should be locked! */ - for (ifp->if_index = 1; ifp->if_index <= if_index; ifp->if_index++) { + for (ifp->if_index = 1; ifp->if_index <= V_if_index; ifp->if_index++) { if (ifnet_byindex(ifp->if_index) == NULL) break; } @@ -374,9 +449,9 @@ free(ifp, M_IFNET); return (NULL); } - if (ifp->if_index > if_index) - if_index = ifp->if_index; - if (if_index >= if_indexlim) + if (ifp->if_index > V_if_index) + V_if_index = ifp->if_index; + if (V_if_index >= V_if_indexlim) if_grow(); ifnet_byindex(ifp->if_index) = ifp; @@ -415,6 +490,7 @@ void if_free_type(struct ifnet *ifp, u_char type) { + INIT_VNET_NET(curvnet); /* ifp->if_vnet can be NULL here ! 
*/ if (ifp != ifnet_byindex(ifp->if_index)) { if_printf(ifp, "%s: value was not if_alloced, skipping\n", @@ -427,8 +503,8 @@ ifnet_byindex(ifp->if_index) = NULL; /* XXX: should be locked with if_findindex() */ - while (if_index > 0 && ifnet_byindex(if_index) == NULL) - if_index--; + while (V_if_index > 0 && ifnet_byindex(V_if_index) == NULL) + V_if_index--; if (if_com_free[type] != NULL) if_com_free[type](ifp->if_l2com, type); @@ -451,6 +527,7 @@ void if_attach(struct ifnet *ifp) { + INIT_VNET_NET(curvnet); unsigned socksize, ifasize; int namelen, masklen; struct sockaddr_dl *sdl; @@ -460,6 +537,11 @@ panic ("%s: BUG: if_attach called without if_alloc'd input()\n", ifp->if_xname); +#ifdef VIMAGE + ifp->if_vnet = curvnet; + if (ifp->if_home_vnet == NULL) + ifp->if_home_vnet = curvnet; +#endif TASK_INIT(&ifp->if_starttask, 0, if_start_deferred, ifp); TASK_INIT(&ifp->if_linktask, 0, do_link_state_change, ifp); IF_AFDATA_LOCK_INIT(ifp); @@ -482,12 +564,18 @@ mac_ifnet_create(ifp); #endif +#ifdef VIMAGE + if (IS_DEFAULT_VNET(curvnet)) { +#endif ifdev_byindex(ifp->if_index) = make_dev(&net_cdevsw, unit2minor(ifp->if_index), UID_ROOT, GID_WHEEL, 0600, "%s/%s", net_cdevsw.d_name, ifp->if_xname); make_dev_alias(ifdev_byindex(ifp->if_index), "%s%d", net_cdevsw.d_name, ifp->if_index); +#ifdef VIMAGE + } +#endif mtx_init(&ifp->if_snd.ifq_mtx, ifp->if_xname, "if send queue", MTX_DEF); @@ -533,13 +621,19 @@ ifp->if_snd.altq_ifp = ifp; IFNET_WLOCK(); - TAILQ_INSERT_TAIL(&ifnet, ifp, if_link); + TAILQ_INSERT_TAIL(&V_ifnet, ifp, if_link); +#ifdef VIMAGE + curvnet->ifccnt++; +#endif IFNET_WUNLOCK(); if (domain_init_status >= 2) if_attachdomain1(ifp); EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp); +#ifdef VIMAGE + if (IS_DEFAULT_VNET(curvnet)) +#endif devctl_notify("IFNET", ifp->if_xname, "ATTACH", NULL); /* Announce the interface. 
*/ @@ -552,16 +646,17 @@ static void if_attachdomain(void *dummy) { + INIT_VNET_NET(curvnet); struct ifnet *ifp; int s; s = splnet(); - TAILQ_FOREACH(ifp, &ifnet, if_link) + TAILQ_FOREACH(ifp, &V_ifnet, if_link) if_attachdomain1(ifp); splx(s); } SYSINIT(domainifattach, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_SECOND, - if_attachdomain, NULL); + if_attachdomain, NULL); static void if_attachdomain1(struct ifnet *ifp) @@ -662,6 +757,7 @@ void if_detach(struct ifnet *ifp) { + INIT_VNET_NET(ifp->if_vnet); struct ifaddr *ifa; struct radix_node_head *rnh; int s; @@ -670,13 +766,25 @@ struct ifnet *iter; int found = 0; + /* + * Detach from any vlan, bridge or lagg ifnets linked to us. + * A small though unlikely window for a race from here to ifp + * unlinking from ifnet list is possible, hence we repeat the + * procedure once again further below. XXX. + */ + EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); + IFNET_WLOCK(); - TAILQ_FOREACH(iter, &ifnet, if_link) + TAILQ_FOREACH(iter, &V_ifnet, if_link) if (iter == ifp) { - TAILQ_REMOVE(&ifnet, ifp, if_link); + TAILQ_REMOVE(&V_ifnet, ifp, if_link); found = 1; break; } +#ifdef VIMAGE + if (found) + curvnet->ifccnt--; +#endif IFNET_WUNLOCK(); if (!found) return; @@ -720,7 +828,13 @@ * Clean up all addresses. */ ifp->if_addr = NULL; +#ifdef VIMAGE + if (IS_DEFAULT_VNET(curvnet)) { +#endif destroy_dev(ifdev_byindex(ifp->if_index)); +#ifdef VIMAGE + } +#endif ifdev_byindex(ifp->if_index) = NULL; /* We can now free link ifaddr. */ @@ -737,7 +851,7 @@ * to this interface...oh well... */ for (i = 1; i <= AF_MAX; i++) { - if ((rnh = rt_tables[i]) == NULL) + if ((rnh = V_rt_tables[i]) == NULL) continue; RADIX_NODE_HEAD_LOCK(rnh); (void) rnh->rnh_walktree(rnh, if_rtdel, ifp); @@ -747,6 +861,9 @@ /* Announce that the interface is gone. 
*/ rt_ifannouncemsg(ifp, IFAN_DEPARTURE); EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); +#ifdef VIMAGE + if (IS_DEFAULT_VNET(curvnet)) +#endif devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL); IF_AFDATA_LOCK(ifp); @@ -765,6 +882,9 @@ knlist_destroy(&ifp->if_klist); mtx_destroy(&ifp->if_snd.ifq_mtx); IF_AFDATA_DESTROY(ifp); +#ifdef VIMAGE + ifp->if_vnet = NULL; +#endif splx(s); } @@ -774,6 +894,7 @@ int if_addgroup(struct ifnet *ifp, const char *groupname) { + INIT_VNET_NET(ifp->if_vnet); struct ifg_list *ifgl; struct ifg_group *ifg = NULL; struct ifg_member *ifgm; @@ -802,7 +923,7 @@ return (ENOMEM); } - TAILQ_FOREACH(ifg, &ifg_head, ifg_next) + TAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) if (!strcmp(ifg->ifg_group, groupname)) break; @@ -818,7 +939,7 @@ ifg->ifg_refcnt = 0; TAILQ_INIT(&ifg->ifg_members); EVENTHANDLER_INVOKE(group_attach_event, ifg); - TAILQ_INSERT_TAIL(&ifg_head, ifg, ifg_next); + TAILQ_INSERT_TAIL(&V_ifg_head, ifg, ifg_next); } ifg->ifg_refcnt++; @@ -843,6 +964,7 @@ int if_delgroup(struct ifnet *ifp, const char *groupname) { + INIT_VNET_NET(ifp->if_vnet); struct ifg_list *ifgl; struct ifg_member *ifgm; @@ -869,7 +991,7 @@ } if (--ifgl->ifgl_group->ifg_refcnt == 0) { - TAILQ_REMOVE(&ifg_head, ifgl->ifgl_group, ifg_next); + TAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_next); EVENTHANDLER_INVOKE(group_detach_event, ifgl->ifgl_group); free(ifgl->ifgl_group, M_TEMP); } @@ -932,6 +1054,7 @@ static int if_getgroupmembers(struct ifgroupreq *data) { + INIT_VNET_NET(curvnet); struct ifgroupreq *ifgr = data; struct ifg_group *ifg; struct ifg_member *ifgm; @@ -939,7 +1062,7 @@ int len, error; IFNET_RLOCK(); - TAILQ_FOREACH(ifg, &ifg_head, ifg_next) + TAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) if (!strcmp(ifg->ifg_group, ifgr->ifgr_name)) break; if (ifg == NULL) { @@ -1041,11 +1164,12 @@ struct ifaddr * ifa_ifwithaddr(struct sockaddr *addr) { + INIT_VNET_NET(curvnet); struct ifnet *ifp; struct ifaddr *ifa; IFNET_RLOCK(); - TAILQ_FOREACH(ifp, &ifnet, 
if_link) + TAILQ_FOREACH(ifp, &V_ifnet, if_link) TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; @@ -1071,11 +1195,12 @@ struct ifaddr * ifa_ifwithbroadaddr(struct sockaddr *addr) { + INIT_VNET_NET(curvnet); struct ifnet *ifp; struct ifaddr *ifa; IFNET_RLOCK(); - TAILQ_FOREACH(ifp, &ifnet, if_link) + TAILQ_FOREACH(ifp, &V_ifnet, if_link) TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; @@ -1098,11 +1223,12 @@ struct ifaddr * ifa_ifwithdstaddr(struct sockaddr *addr) { + INIT_VNET_NET(curvnet); struct ifnet *ifp; struct ifaddr *ifa; IFNET_RLOCK(); - TAILQ_FOREACH(ifp, &ifnet, if_link) { + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((ifp->if_flags & IFF_POINTOPOINT) == 0) continue; TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { @@ -1126,6 +1252,7 @@ struct ifaddr * ifa_ifwithnet(struct sockaddr *addr) { + INIT_VNET_NET(curvnet); struct ifnet *ifp; struct ifaddr *ifa; struct ifaddr *ifa_maybe = (struct ifaddr *) 0; @@ -1138,7 +1265,7 @@ */ if (af == AF_LINK) { struct sockaddr_dl *sdl = (struct sockaddr_dl *)addr; - if (sdl->sdl_index && sdl->sdl_index <= if_index) + if (sdl->sdl_index && sdl->sdl_index <= V_if_index) return (ifaddr_byindex(sdl->sdl_index)); } @@ -1147,7 +1274,7 @@ * addresses in this address family. */ IFNET_RLOCK(); - TAILQ_FOREACH(ifp, &ifnet, if_link) { + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { char *cp, *cp2, *cp3; @@ -1369,6 +1496,7 @@ struct ifnet *ifp = (struct ifnet *)arg; int link_state = ifp->if_link_state; int link; + CURVNET_SET(ifp->if_vnet); /* Notify that the link state has changed. */ rt_ifmsg(ifp); @@ -1398,6 +1526,9 @@ (*lagg_linkstate_p)(ifp, link_state); } +#ifdef VIMAGE + if (IS_DEFAULT_VNET(curvnet)) +#endif devctl_notify("IFNET", ifp->if_xname, (link_state == LINK_STATE_UP) ? 
"LINK_UP" : "LINK_DOWN", NULL); if (pending > 1) @@ -1405,6 +1536,7 @@ if (log_link_state_change) log(LOG_NOTICE, "%s: link state changed to %s\n", ifp->if_xname, (link_state == LINK_STATE_UP) ? "UP" : "DOWN" ); + CURVNET_RESTORE(); } /* @@ -1471,12 +1603,15 @@ int s = splimp(); IFNET_RLOCK(); - TAILQ_FOREACH(ifp, &ifnet, if_link) { + VNET_ITERLOOP_BEGIN(); + INIT_VNET_NET(curvnet); + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (ifp->if_timer == 0 || --ifp->if_timer) continue; if (ifp->if_watchdog) (*ifp->if_watchdog)(ifp); } + VNET_ITERLOOP_END(); IFNET_RUNLOCK(); splx(s); timeout(if_slowtimo, (void *)0, hz / IFNET_SLOWHZ); @@ -1489,10 +1624,11 @@ struct ifnet * ifunit(const char *name) { + INIT_VNET_NET(curvnet); struct ifnet *ifp; IFNET_RLOCK(); - TAILQ_FOREACH(ifp, &ifnet, if_link) { + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0) break; } @@ -1854,6 +1990,24 @@ ifr = (struct ifreq *)data; switch (cmd) { +#ifdef VIMAGE + case SIOCSIFVIMAGE: + error = suser(td); + if (error == 0) + error = vi_if_move((struct vi_req *) data, NULL, + TD_TO_VIMAGE(td)); + return (error); + + /* + * XXX Should be implemented as separate system calls. This is + * just a temporary hack! 
+ */ + case SIOCSPVIMAGE: + case SIOCGPVIMAGE: + error = vi_td_ioctl(cmd, (struct vi_req *) data, td); + return (error); +#endif + case SIOCIFCREATE: case SIOCIFCREATE2: error = priv_check(td, PRIV_NET_IFCREATE); @@ -2061,6 +2215,7 @@ static int ifconf(u_long cmd, caddr_t data) { + INIT_VNET_NET(curvnet); struct ifconf *ifc = (struct ifconf *)data; #ifdef __amd64__ struct ifconf32 *ifc32 = (struct ifconf32 *)data; @@ -2096,7 +2251,7 @@ valid_len = 0; IFNET_RLOCK(); /* could sleep XXX */ - TAILQ_FOREACH(ifp, &ifnet, if_link) { + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { int addrs; /* @@ -2420,9 +2575,10 @@ int lastref; #ifdef INVARIANTS struct ifnet *oifp; + INIT_VNET_NET(ifp->if_vnet); IFNET_RLOCK(); - TAILQ_FOREACH(oifp, &ifnet, if_link) + TAILQ_FOREACH(oifp, &V_ifnet, if_link) if (ifp == oifp) break; if (ifp != oifp) @@ -2744,7 +2900,6 @@ if_register_com_alloc(u_char type, if_com_alloc_t *a, if_com_free_t *f) { - KASSERT(if_com_alloc[type] == NULL, ("if_register_com_alloc: %d already registered", type)); KASSERT(if_com_free[type] == NULL, @@ -2757,7 +2912,6 @@ void if_deregister_com_alloc(u_char type) { - KASSERT(if_com_alloc[type] != NULL, ("if_deregister_com_alloc: %d not registered", type)); KASSERT(if_com_free[type] != NULL, --- /u/marko/p4/head/src/sys/net/if_clone.c 2007-08-31 03:47:47.000000000 +0200 +++ src/sys/net/if_clone.c 2007-10-22 18:06:36.000000000 +0200 @@ -30,6 +30,8 @@ * $FreeBSD: src/sys/net/if_clone.c,v 1.11 2006/07/09 06:04:00 sam Exp $ */ +#include "opt_vimage.h" + #include #include #include @@ -39,7 +41,9 @@ #include #include #include +#include +#include #include #include #if 0 @@ -204,15 +208,14 @@ { int err; - if (ifc->ifc_destroy == NULL) { - err = EOPNOTSUPP; - goto done; - } + if (ifc->ifc_destroy == NULL) + return(EOPNOTSUPP); IF_CLONE_LOCK(ifc); IFC_IFLIST_REMOVE(ifc, ifp); IF_CLONE_UNLOCK(ifc); + CURVNET_SET_QUIET(ifp->if_vnet); if_delgroup(ifp, ifc->ifc_name); err = (*ifc->ifc_destroy)(ifc, ifp); @@ -224,8 +227,7 @@ 
IFC_IFLIST_INSERT(ifc, ifp); IF_CLONE_UNLOCK(ifc); } - -done: + CURVNET_RESTORE(); return (err); } @@ -402,6 +404,24 @@ * Find a free unit if none was given. */ if (wildcard) { +#ifdef VIMAGE + INIT_VNET_NET(curvnet); + char name[IFNAMSIZ]; + struct ifnet *ifp; + int i = 0; + + IFNET_RLOCK(); +again: + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + sprintf(name, "%s%d", ifc->ifc_name, i); + if (strcmp(name, ifp->if_xname) == 0) { + i++; + goto again; + } + } + IFNET_RUNLOCK(); + *unit = i; +#else while ((bytoff < ifc->ifc_bmlen) && (ifc->ifc_units[bytoff] == 0xff)) bytoff++; @@ -412,6 +432,7 @@ while ((ifc->ifc_units[bytoff] & (1 << bitoff)) != 0) bitoff++; *unit = (bytoff << 3) + bitoff; +#endif } if (*unit > ifc->ifc_maxunit) { @@ -419,6 +440,7 @@ goto done; } +#ifndef VIMAGE if (!wildcard) { bytoff = *unit >> 3; bitoff = *unit - (bytoff << 3); @@ -434,6 +456,7 @@ KASSERT((ifc->ifc_units[bytoff] & (1 << bitoff)) == 0, ("%s: bit is already set", __func__)); ifc->ifc_units[bytoff] |= (1 << bitoff); +#endif IF_CLONE_ADDREF_LOCKED(ifc); done: @@ -444,9 +467,9 @@ void ifc_free_unit(struct if_clone *ifc, int unit) { +#ifndef VIMAGE int bytoff, bitoff; - /* * Compute offset in the bitmap and deallocate the unit. */ @@ -458,6 +481,7 @@ ("%s: bit is already cleared", __func__)); ifc->ifc_units[bytoff] &= ~(1 << bitoff); IF_CLONE_REMREF_LOCKED(ifc); /* releases lock */ +#endif } void --- /u/marko/p4/head/src/sys/net/if_ethersubr.c 2007-11-13 02:49:08.000000000 +0100 +++ src/sys/net/if_ethersubr.c 2007-12-10 11:26:09.000000000 +0100 @@ -37,6 +37,7 @@ #include "opt_mac.h" #include "opt_netgraph.h" #include "opt_carp.h" +#include "opt_vimage.h" #include #include @@ -48,7 +49,9 @@ #include #include #include +#include +#include #include #include #include @@ -135,8 +138,10 @@ int ether_ipfw_chk(struct mbuf **m0, struct ifnet *dst, struct ip_fw **rule, int shared); +#ifndef VIMAGE static int ether_ipfw; #endif +#endif /* * Ethernet output routine. 
@@ -385,9 +390,10 @@ { int error; #if defined(INET) || defined(INET6) + INIT_VNET_NET(ifp->if_vnet); struct ip_fw *rule = ip_dn_claim_rule(m); - if (IPFW_LOADED && ether_ipfw != 0) { + if (IPFW_LOADED && V_ether_ipfw != 0) { if (ether_ipfw_chk(&m, ifp, &rule, 0) == 0) { if (m) { m_freem(m); @@ -416,13 +422,14 @@ ether_ipfw_chk(struct mbuf **m0, struct ifnet *dst, struct ip_fw **rule, int shared) { + INIT_VNET_IPFW(dst->if_vnet); struct ether_header *eh; struct ether_header save_eh; struct mbuf *m; int i; struct ip_fw_args args; - if (*rule != NULL && fw_one_pass) + if (*rule != NULL && V_fw_one_pass) return 1; /* dummynet packet, already partially processed */ /* @@ -557,6 +564,8 @@ } #endif + CURVNET_SET_QUIET(ifp->if_vnet); + if (ETHER_IS_MULTICAST(eh->ether_dhost)) { if (ETHER_IS_BROADCAST(eh->ether_dhost)) m->m_flags |= M_BCAST; @@ -593,6 +602,7 @@ /* Allow monitor mode to claim this frame, after stats are updated. */ if (ifp->if_flags & IFF_MONITOR) { m_freem(m); + CURVNET_RESTORE(); return; } @@ -641,8 +651,10 @@ ("%s: ng_ether_input_p is NULL", __func__)); m->m_flags &= ~M_PROMISC; (*ng_ether_input_p)(ifp, &m); - if (m == NULL) + if (m == NULL) { + CURVNET_RESTORE(); return; + } } /* @@ -653,8 +665,10 @@ if (ifp->if_bridge != NULL) { m->m_flags &= ~M_PROMISC; BRIDGE_INPUT(ifp, m); - if (m == NULL) + if (m == NULL) { + CURVNET_RESTORE(); return; + } } #ifdef DEV_CARP @@ -690,6 +704,7 @@ random_harvest(m, 16, 3, 0, RANDOM_NET); ether_demux(ifp, m); + CURVNET_RESTORE(); } /* @@ -708,11 +723,12 @@ KASSERT(ifp != NULL, ("%s: NULL interface pointer", __func__)); #if defined(INET) || defined(INET6) + INIT_VNET_NET(ifp->if_vnet); /* * Allow dummynet and/or ipfw to claim the frame. * Do not do this for PROMISC frames in case we are re-entered. 
*/ - if (IPFW_LOADED && ether_ipfw != 0 && !(m->m_flags & M_PROMISC)) { + if (IPFW_LOADED && V_ether_ipfw != 0 && !(m->m_flags & M_PROMISC)) { struct ip_fw *rule = ip_dn_claim_rule(m); if (ether_ipfw_chk(&m, NULL, &rule, 0) == 0) { @@ -870,6 +886,25 @@ return (etherbuf); } +#ifdef VIMAGE +static void +ether_reassign(struct ifnet *ifp, struct vnet *vnet, char *dname) +{ + u_char eaddr[6]; + + bcopy(IF_LLADDR(ifp), eaddr, 6); + ether_ifdetach(ifp); + ifp->if_bpf = NULL; + if_reassign_common(ifp, vnet, "eth"); + if (dname) + snprintf(ifp->if_xname, IFNAMSIZ, "%s", dname); + + CURVNET_SET_QUIET(vnet); + ether_ifattach(ifp, eaddr); + CURVNET_RESTORE(); +} +#endif + /* * Perform common duties while attaching to interface list */ @@ -879,6 +914,9 @@ int i; struct ifaddr *ifa; struct sockaddr_dl *sdl; +#ifdef VIMAGE + struct vnet *home_vnet_0 = ifp->if_home_vnet; +#endif ifp->if_addrlen = ETHER_ADDR_LEN; ifp->if_hdrlen = ETHER_HDR_LEN; @@ -887,6 +925,9 @@ ifp->if_output = ether_output; ifp->if_input = ether_input; ifp->if_resolvemulti = ether_resolvemulti; +#ifdef VIMAGE + ifp->if_reassign = ether_reassign; +#endif if (ifp->if_baudrate == 0) ifp->if_baudrate = IF_Mbps(10); /* just a default */ ifp->if_broadcastaddr = etherbroadcastaddr; @@ -906,7 +947,11 @@ for (i = 0; i < ifp->if_addrlen; i++) if (lla[i] != 0) break; +#ifdef VIMAGE + if (i != ifp->if_addrlen && home_vnet_0 != ifp->if_home_vnet) +#else if (i != ifp->if_addrlen) +#endif if_printf(ifp, "Ethernet address: %6D\n", lla, ":"); if (ifp->if_flags & IFF_NEEDSGIANT) if_printf(ifp, "if_start running deferred for Giant\n"); @@ -931,8 +976,8 @@ SYSCTL_DECL(_net_link); SYSCTL_NODE(_net_link, IFT_ETHER, ether, CTLFLAG_RW, 0, "Ethernet"); #if defined(INET) || defined(INET6) -SYSCTL_INT(_net_link_ether, OID_AUTO, ipfw, CTLFLAG_RW, - &ether_ipfw,0,"Pass ether pkts through firewall"); +SYSCTL_V_INT(V_NET, vnet_net, _net_link_ether, OID_AUTO, ipfw, CTLFLAG_RW, + ether_ipfw, 0, "Pass ether pkts through firewall"); #endif #if 0 
--- /u/marko/p4/head/src/sys/net/if_faith.c 2007-08-31 03:47:47.000000000 +0200 +++ src/sys/net/if_faith.c 2007-10-22 18:06:36.000000000 +0200 @@ -41,6 +41,7 @@ */ #include "opt_inet.h" #include "opt_inet6.h" +#include "opt_vimage.h" #include #include @@ -54,6 +55,7 @@ #include #include #include +#include #include #include @@ -76,6 +78,7 @@ #include #include #include +#include #endif #define FAITHNAME "faith" @@ -323,11 +326,12 @@ faithprefix(in6) struct in6_addr *in6; { + INIT_VNET_INET6(curvnet); struct rtentry *rt; struct sockaddr_in6 sin6; int ret; - if (ip6_keepfaith == 0) + if (V_ip6_keepfaith == 0) return 0; bzero(&sin6, sizeof(sin6)); --- /u/marko/p4/head/src/sys/net/if_gif.c 2007-10-29 17:17:42.000000000 +0100 +++ src/sys/net/if_gif.c 2007-12-10 11:26:09.000000000 +0100 @@ -33,6 +33,7 @@ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_mac.h" +#include "opt_vimage.h" #include #include @@ -48,6 +49,8 @@ #include #include #include +#include + #include #include @@ -92,7 +95,9 @@ */ static struct mtx gif_mtx; static MALLOC_DEFINE(M_GIF, "gif", "Generic Tunnel Interface"); +#ifndef VIMAGE static LIST_HEAD(, gif_softc) gif_softc_list; +#endif void (*ng_gif_input_p)(struct ifnet *ifp, struct mbuf **mp, int af); void (*ng_gif_input_orphan_p)(struct ifnet *ifp, struct mbuf *m, int af); @@ -102,6 +107,7 @@ static void gif_start(struct ifnet *); static int gif_clone_create(struct if_clone *, int, caddr_t); static void gif_clone_destroy(struct ifnet *); +static int vnet_gif_iattach(const void *); IFC_SIMPLE_DECLARE(gif, 0); @@ -121,22 +127,30 @@ */ #define MAX_GIF_NEST 1 #endif -static int max_gif_nesting = MAX_GIF_NEST; -SYSCTL_INT(_net_link_gif, OID_AUTO, max_nesting, CTLFLAG_RW, - &max_gif_nesting, 0, "Max nested tunnels"); +#ifndef VIMAGE +static int max_gif_nesting; +#endif +SYSCTL_V_INT(V_NET, vnet_gif, _net_link_gif, OID_AUTO, max_nesting, + CTLFLAG_RW, max_gif_nesting, 0, "Max nested tunnels"); + +#ifdef INET6 +SYSCTL_DECL(_net_inet6_ip6); 
+SYSCTL_V_INT(V_NET, vnet_gif, _net_inet6_ip6, IPV6CTL_GIF_HLIM, + gifhlim, CTLFLAG_RW, ip6_gif_hlim, 0, ""); +#endif /* * By default, we disallow creation of multiple tunnels between the same * pair of addresses. Some applications require this functionality so * we allow control over this check here. */ -#ifdef XBONEHACK -static int parallel_tunnels = 1; -#else -static int parallel_tunnels = 0; +#ifndef VIMAGE +static int parallel_tunnels; #endif -SYSCTL_INT(_net_link_gif, OID_AUTO, parallel_tunnels, CTLFLAG_RW, - &parallel_tunnels, 0, "Allow parallel tunnels?"); +SYSCTL_V_INT(V_NET, vnet_gif, _net_link_gif, OID_AUTO, parallel_tunnels, + CTLFLAG_RW, parallel_tunnels, 0, "Allow parallel tunnels?"); + +VNET_MOD_DECLARE(GIF, gif, NULL, vnet_gif_iattach, NET, NULL) static int gif_clone_create(ifc, unit, params) @@ -144,6 +158,7 @@ int unit; caddr_t params; { + INIT_VNET_GIF(curvnet); struct gif_softc *sc; sc = malloc(sizeof(struct gif_softc), M_GIF, M_WAITOK | M_ZERO); @@ -177,7 +192,7 @@ (*ng_gif_attach_p)(GIF2IFP(sc)); mtx_lock(&gif_mtx); - LIST_INSERT_HEAD(&gif_softc_list, sc, gif_list); + LIST_INSERT_HEAD(&V_gif_softc_list, sc, gif_list); mtx_unlock(&gif_mtx); return (0); @@ -220,29 +235,47 @@ } static int +vnet_gif_iattach(unused) + const void *unused; +{ + INIT_VNET_GIF(curvnet); + + LIST_INIT(&V_gif_softc_list); + V_max_gif_nesting = MAX_GIF_NEST; +#ifdef XBONEHACK + V_parallel_tunnels = 1; +#endif + V_ip_gif_ttl = GIF_TTL; +#ifdef INET6 + V_ip6_gif_hlim = GIF_HLIM; +#endif + + return 0; +} + +static int gifmodevent(mod, type, data) module_t mod; int type; void *data; { - switch (type) { case MOD_LOAD: mtx_init(&gif_mtx, "gif_mtx", NULL, MTX_DEF); - LIST_INIT(&gif_softc_list); - if_clone_attach(&gif_cloner); - -#ifdef INET6 - ip6_gif_hlim = GIF_HLIM; +#ifdef VIMAGE + vnet_mod_register(&vnet_gif_modinfo); +#else + vnet_gif_iattach(NULL); #endif - + if_clone_attach(&gif_cloner); break; case MOD_UNLOAD: if_clone_detach(&gif_cloner); - mtx_destroy(&gif_mtx); -#ifdef 
INET6 - ip6_gif_hlim = 0; +#ifdef VIMAGE + vnet_mod_deregister(&vnet_gif_modinfo); #endif + mtx_destroy(&gif_mtx); + break; default: return EOPNOTSUPP; @@ -353,6 +386,7 @@ struct sockaddr *dst; struct rtentry *rt; /* added in net2 */ { + INIT_VNET_GIF(ifp->if_vnet); struct gif_softc *sc = ifp->if_softc; struct m_tag *mtag; int error = 0; @@ -388,7 +422,7 @@ mtag = m_tag_locate(m, MTAG_GIF, MTAG_GIF_CALLED, mtag); gif_called++; } - if (gif_called > max_gif_nesting) { + if (gif_called > V_max_gif_nesting) { log(LOG_NOTICE, "gif_output: recursively called too many times(%d)\n", gif_called); @@ -822,13 +856,14 @@ struct sockaddr *src; struct sockaddr *dst; { + INIT_VNET_GIF(ifp->if_vnet); struct gif_softc *sc = ifp->if_softc; struct gif_softc *sc2; struct sockaddr *osrc, *odst, *sa; int error = 0; mtx_lock(&gif_mtx); - LIST_FOREACH(sc2, &gif_softc_list, gif_list) { + LIST_FOREACH(sc2, &V_gif_softc_list, gif_list) { if (sc2 == sc) continue; if (!sc2->gif_pdst || !sc2->gif_psrc) @@ -843,7 +878,7 @@ * Disallow parallel tunnels unless instructed * otherwise. 
*/ - if (!parallel_tunnels && + if (!V_parallel_tunnels && bcmp(sc2->gif_pdst, dst, dst->sa_len) == 0 && bcmp(sc2->gif_psrc, src, src->sa_len) == 0) { error = EADDRNOTAVAIL; --- /u/marko/p4/head/src/sys/net/if_gif.h 2007-08-31 03:47:47.000000000 +0200 +++ src/sys/net/if_gif.h 2007-10-05 12:26:48.000000000 +0200 @@ -109,6 +109,29 @@ void gif_delete_tunnel(struct ifnet *); int gif_encapcheck(const struct mbuf *, int, int, void *); +/* + * Virtualization support + */ + +#define INIT_VNET_GIF(vnet) \ + INIT_FROM_VNET(vnet, VNET_MOD_GIF, struct vnet_gif, vnet_gif) + +#define VNET_GIF(sym) VSYM(vnet_gif, sym) + +struct vnet_gif { + LIST_HEAD(, gif_softc) _gif_softc_list; + int _max_gif_nesting; + int _parallel_tunnels; + int _ip_gif_ttl; + int _ip6_gif_hlim; +}; + +#define V_gif_softc_list VNET_GIF(gif_softc_list) +#define V_max_gif_nesting VNET_GIF(max_gif_nesting) +#define V_parallel_tunnels VNET_GIF(parallel_tunnels) +#define V_ip_gif_ttl VNET_GIF(ip_gif_ttl) +#define V_ip6_gif_hlim VNET_GIF(ip6_gif_hlim) + #endif /* _KERNEL */ #endif /* _NET_IF_GIF_H_ */ --- /u/marko/p4/head/src/sys/net/if_gre.c 2007-08-31 03:47:47.000000000 +0200 +++ src/sys/net/if_gre.c 2007-10-22 18:06:37.000000000 +0200 @@ -51,6 +51,7 @@ #include "opt_atalk.h" #include "opt_inet.h" #include "opt_inet6.h" +#include "opt_vimage.h" #include #include @@ -63,6 +64,7 @@ #include #include #include +#include #include #include @@ -71,6 +73,7 @@ #include #ifdef INET +#include #include #include #include @@ -238,12 +241,15 @@ gre_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, struct rtentry *rt) { +#ifdef INET6 + INIT_VNET_INET(ifp->if_vnet); +#endif int error = 0; struct gre_softc *sc = ifp->if_softc; struct greip *gh; struct ip *ip; - u_short ip_id = 0; - uint8_t ip_tos = 0; + u_short gre_ip_id = 0; + uint8_t gre_ip_tos = 0; u_int16_t etype = 0; struct mobile_h mob_h; u_int32_t af; @@ -360,13 +366,13 @@ switch (dst->sa_family) { case AF_INET: ip = mtod(m, struct ip *); - ip_tos = 
ip->ip_tos; - ip_id = ip->ip_id; + gre_ip_tos = ip->ip_tos; + gre_ip_id = ip->ip_id; etype = ETHERTYPE_IP; break; #ifdef INET6 case AF_INET6: - ip_id = ip_newid(); + gre_ip_id = ip_newid(); etype = ETHERTYPE_IPV6; break; #endif @@ -409,8 +415,8 @@ ((struct ip*)gh)->ip_v = IPPROTO_IPV4; ((struct ip*)gh)->ip_hl = (sizeof(struct ip)) >> 2; ((struct ip*)gh)->ip_ttl = GRE_TTL; - ((struct ip*)gh)->ip_tos = ip_tos; - ((struct ip*)gh)->ip_id = ip_id; + ((struct ip*)gh)->ip_tos = gre_ip_tos; + ((struct ip*)gh)->ip_id = gre_ip_id; gh->gi_len = m->m_pkthdr.len; } --- /u/marko/p4/head/src/sys/net/if_loop.c 2007-10-29 17:17:42.000000000 +0100 +++ src/sys/net/if_loop.c 2007-12-10 11:26:09.000000000 +0100 @@ -27,7 +27,7 @@ * SUCH DAMAGE. * * @(#)if_loop.c 8.2 (Berkeley) 1/9/95 - * $FreeBSD: src/sys/net/if_loop.c,v 1.113 2007/10/27 18:25:53 yar Exp $ + * $FreeBSD: src/sys/net/if_loop.c,v 1.112 2007/02/09 00:09:35 cognet Exp $ */ /* @@ -38,6 +38,7 @@ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipx.h" +#include "opt_vimage.h" #include #include @@ -50,7 +51,9 @@ #include #include #include +#include +#include #include #include #include @@ -94,6 +97,7 @@ struct lo_softc { struct ifnet *sc_ifp; + LIST_ENTRY(lo_softc) sc_next; }; int loioctl(struct ifnet *, u_long, caddr_t); @@ -102,11 +106,20 @@ struct sockaddr *dst, struct rtentry *rt); static int lo_clone_create(struct if_clone *, int, caddr_t); static void lo_clone_destroy(struct ifnet *); +static int vnet_loif_iattach(const void *); +#ifdef VIMAGE +static int vnet_loif_idetach(const void *); +#endif +#ifndef VIMAGE struct ifnet *loif = NULL; /* Used externally */ +static LIST_HEAD(lo_list, lo_softc) lo_list; +#endif /* !VIMAGE */ static MALLOC_DEFINE(M_LO, LONAME, "Loopback Interface"); +static struct mtx lo_mtx; + IFC_SIMPLE_DECLARE(lo, 1); static void @@ -114,12 +127,18 @@ struct ifnet *ifp; { struct lo_softc *sc; +#ifdef INVARIANTS + INIT_VNET_NET(ifp->if_vnet); +#endif sc = ifp->if_softc; /* XXX: destroying lo0 
will lead to panics. */ - KASSERT(loif != ifp, ("%s: destroying lo0", __func__)); + KASSERT(V_loif != ifp, ("%s: destroying lo0", __func__)); + mtx_lock(&lo_mtx); + LIST_REMOVE(sc, sc_next); + mtx_unlock(&lo_mtx); bpfdetach(ifp); if_detach(ifp); if_free(ifp); @@ -132,6 +151,7 @@ int unit; caddr_t params; { + INIT_VNET_NET(curvnet); struct ifnet *ifp; struct lo_softc *sc; @@ -141,6 +161,8 @@ free(sc, M_LO); return (ENOSPC); } + if (V_loif == NULL) + V_loif = ifp; if_initname(ifp, ifc->ifc_name, unit); ifp->if_mtu = LOMTU; @@ -151,18 +173,72 @@ ifp->if_softc = sc; if_attach(ifp); bpfattach(ifp, DLT_NULL, sizeof(u_int32_t)); - if (loif == NULL) - loif = ifp; + mtx_lock(&lo_mtx); + LIST_INSERT_HEAD(&V_lo_list, sc, sc_next); + mtx_unlock(&lo_mtx); return (0); } +VNET_MOD_DECLARE_STATELESS(LOIF, loif, vnet_loif_iattach, vnet_loif_idetach, + NET) + +static int vnet_loif_iattach(unused) + const void *unused; +{ + INIT_VNET_NET(curvnet); + + LIST_INIT(&V_lo_list); +#ifdef VIMAGE + if (IS_DEFAULT_VNET(curvnet)) + if_clone_attach(&lo_cloner); + else + lo_cloner.ifc_attach(&lo_cloner); +#else + if_clone_attach(&lo_cloner); +#endif + return 0; +} + +#ifdef VIMAGE +static int vnet_loif_idetach(unused) + const void *unused; +{ + INIT_VNET_NET(curvnet); + struct lo_softc *sc, *nsc; + + LIST_FOREACH_SAFE(sc, &V_lo_list, sc_next, nsc) { + struct ifnet *ifp = sc->sc_ifp; + + if (ifp == V_loif) { + /* + * A hack to allow lo0 to be detached: + * bump if_unit number from 0 to 1. By + * setting V_loif to NULL we prevent queuing + * of routing messages that would have + * m_pkthdr.rcvif pointing to a nonexisting + * ifnet, i.e. the lo0 we just destroyed. 
+ */ + ifp->if_dunit = 1; + V_loif = NULL; + } + if_clone_destroy(ifp->if_xname); + } + return 0; +} +#endif + static int loop_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: - if_clone_attach(&lo_cloner); + mtx_init(&lo_mtx, "lo_mtx", NULL, MTX_DEF); +#ifdef VIMAGE + vnet_mod_register(&vnet_loif_modinfo); +#else + vnet_loif_iattach(NULL); +#endif break; case MOD_UNLOAD: printf("loop module unload - not possible for this module type\n"); @@ -195,7 +271,7 @@ if (rt && rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { m_freem(m); return (rt->rt_flags & RTF_BLACKHOLE ? 0 : - rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH); + rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH); } ifp->if_opackets++; @@ -241,6 +317,7 @@ int af; int hlen; { + INIT_VNET_NET(ifp->if_vnet); int isr; M_ASSERTPKTHDR(m); @@ -262,15 +339,15 @@ bpf_mtap(ifp->if_bpf, m); } } else { - if (bpf_peers_present(loif->if_bpf)) { - if ((m->m_flags & M_MCAST) == 0 || loif == ifp) { + if (bpf_peers_present(V_loif->if_bpf)) { + if ((m->m_flags & M_MCAST) == 0 || V_loif == ifp) { /* XXX beware sizeof(af) != 4 */ u_int32_t af1 = af; /* * We need to prepend the address family. 
*/ - bpf_mtap2(loif->if_bpf, &af1, sizeof(af1), m); + bpf_mtap2(V_loif->if_bpf, &af1, sizeof(af1), m); } } } --- /u/marko/p4/head/src/sys/net/if_mib.c 2007-08-31 03:47:48.000000000 +0200 +++ src/sys/net/if_mib.c 2007-10-22 18:06:37.000000000 +0200 @@ -29,12 +29,16 @@ * $FreeBSD: src/sys/net/if_mib.c,v 1.18 2006/01/04 12:57:09 harti Exp $ */ +#include "opt_vimage.h" + #include #include #include #include #include +#include +#include #include #include @@ -64,12 +68,15 @@ SYSCTL_DECL(_net_link_generic); SYSCTL_NODE(_net_link_generic, IFMIB_SYSTEM, system, CTLFLAG_RW, 0, "Variables global to all interfaces"); -SYSCTL_INT(_net_link_generic_system, IFMIB_IFCOUNT, ifcount, CTLFLAG_RD, - &if_index, 0, "Number of configured interfaces"); + +SYSCTL_V_INT(V_NET, vnet_net, _net_link_generic_system, IFMIB_IFCOUNT, + ifcount, CTLFLAG_RD, if_index, 0, + "Number of configured interfaces"); static int sysctl_ifdata(SYSCTL_HANDLER_ARGS) /* XXX bad syntax! */ { + INIT_VNET_NET(curvnet); int *name = (int *)arg1; int error; u_int namelen = arg2; @@ -81,7 +88,7 @@ if (namelen != 2) return EINVAL; - if (name[0] <= 0 || name[0] > if_index || + if (name[0] <= 0 || name[0] > V_if_index || ifnet_byindex(name[0]) == NULL) return ENOENT; --- /u/marko/p4/head/src/sys/net/if_ppp.c 2007-10-29 17:17:42.000000000 +0100 +++ src/sys/net/if_ppp.c 2007-12-10 11:26:09.000000000 +0100 @@ -80,6 +80,7 @@ #include "opt_ipx.h" #include "opt_mac.h" #include "opt_ppp.h" +#include "opt_vimage.h" #ifdef INET #define VJC @@ -98,6 +99,7 @@ #include #include #include +#include #include #include @@ -1396,6 +1398,7 @@ struct mbuf *mp, *dmp = NULL; u_char *iphdr; u_int hlen; + CURVNET_SET(ifp->if_vnet); sc->sc_stats.ppp_ipackets++; @@ -1430,7 +1433,7 @@ m_freem(m); if (dmp == NULL) { /* no error, but no decompressed packet produced */ - return; + goto done; } m = dmp; cp = mtod(m, u_char *); @@ -1587,7 +1590,7 @@ ilen, 0) == 0) { /* drop this packet */ m_freem(m); - return; + goto done; } if 
(sc->sc_active_filt.bf_insns == 0 || bpf_filter(sc->sc_active_filt.bf_insns, (u_char *) m, ilen, 0)) @@ -1616,13 +1619,13 @@ || sc->sc_npmode[NP_IP] != NPMODE_PASS) { /* interface is down - drop the packet. */ m_freem(m); - return; + goto done; } m->m_pkthdr.len -= PPP_HDRLEN; m->m_data += PPP_HDRLEN; m->m_len -= PPP_HDRLEN; if ((m = ip_fastforward(m)) == NULL) - return; + goto done; isr = NETISR_IP; break; #endif @@ -1635,7 +1638,7 @@ || sc->sc_npmode[NP_IPV6] != NPMODE_PASS) { /* interface is down - drop the packet. */ m_freem(m); - return; + goto done; } m->m_pkthdr.len -= PPP_HDRLEN; m->m_data += PPP_HDRLEN; @@ -1652,7 +1655,7 @@ /* XXX: || sc->sc_npmode[NP_IPX] != NPMODE_PASS*/) { /* interface is down - drop the packet. */ m_freem(m); - return; + goto done; } m->m_pkthdr.len -= PPP_HDRLEN; m->m_data += PPP_HDRLEN; @@ -1687,6 +1690,8 @@ if (isr == -1) (*sc->sc_ctlp)(sc); + done: + CURVNET_RESTORE(); return; bad: @@ -1694,6 +1699,7 @@ m_freem(m); PPP2IFP(sc)->if_ierrors++; sc->sc_stats.ppp_ierrors++; + CURVNET_RESTORE(); } #define MAX_DUMP_BYTES 128 --- /u/marko/p4/head/src/sys/net/if_spppsubr.c 2007-08-31 03:47:48.000000000 +0200 +++ src/sys/net/if_spppsubr.c 2007-10-22 18:06:37.000000000 +0200 @@ -27,6 +27,7 @@ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipx.h" +#include "opt_vimage.h" #endif #ifdef NetBSD1_3 @@ -48,6 +49,7 @@ #endif #include #include +#include #if defined (__OpenBSD__) #include @@ -55,10 +57,13 @@ #include #endif +#include #include #include #include #include + +#include #include #include #include @@ -4938,6 +4943,7 @@ static void sppp_set_ip_addr(struct sppp *sp, u_long src) { + INIT_VNET_INET(curvnet); STDDCL; struct ifaddr *ifa; struct sockaddr_in *si; --- /u/marko/p4/head/src/sys/net/if_stf.c 2007-10-29 17:17:42.000000000 +0100 +++ src/sys/net/if_stf.c 2007-12-10 11:26:09.000000000 +0100 @@ -77,6 +77,7 @@ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_mac.h" +#include "opt_vimage.h" #include #include @@ -91,7 
+92,9 @@ #include #include +#include +#include #include #include #include @@ -99,6 +102,7 @@ #include #include +#include #include #include #include @@ -361,6 +365,7 @@ stf_getsrcifa6(ifp) struct ifnet *ifp; { + INIT_VNET_INET(ifp->if_vnet); struct ifaddr *ia; struct in_ifaddr *ia4; struct sockaddr_in6 *sin6; @@ -555,6 +560,7 @@ struct in_addr *in; struct ifnet *inifp; /* incoming interface */ { + INIT_VNET_INET(curvnet); struct in_ifaddr *ia4; /* @@ -578,7 +584,7 @@ /* * reject packets with broadcast */ - for (ia4 = TAILQ_FIRST(&in_ifaddrhead); + for (ia4 = TAILQ_FIRST(&V_in_ifaddrhead); ia4; ia4 = TAILQ_NEXT(ia4, ia_link)) { --- /u/marko/p4/head/src/sys/net/if_tap.c 2007-08-31 03:47:48.000000000 +0200 +++ src/sys/net/if_tap.c 2007-10-22 18:06:38.000000000 +0200 @@ -37,6 +37,7 @@ #include "opt_compat.h" #include "opt_inet.h" +#include "opt_vimage.h" #include #include @@ -58,7 +59,9 @@ #include #include #include +#include +#include #include #include #include --- /u/marko/p4/head/src/sys/net/if_tun.c 2007-10-29 17:17:42.000000000 +0100 +++ src/sys/net/if_tun.c 2007-12-10 11:26:09.000000000 +0100 @@ -21,6 +21,7 @@ #include "opt_inet6.h" #include "opt_ipx.h" #include "opt_mac.h" +#include "opt_vimage.h" #include #include @@ -43,6 +44,7 @@ #include #include #include +#include #include #include @@ -224,6 +226,7 @@ else append_unit = 0; + CURVNET_SET(TD_TO_VNET(curthread)); /* find any existing device, or allocate new unit number */ i = clone_create(&tunclones, &tun_cdevsw, &u, dev, 0); if (i) { @@ -242,6 +245,7 @@ } if_clone_create(name, namelen, NULL); + CURVNET_RESTORE(); } static void @@ -253,6 +257,7 @@ KASSERT((tp->tun_flags & TUN_OPEN) == 0, ("tununits is out of sync - unit %d", TUN2IFP(tp)->if_dunit)); + CURVNET_SET(TUN2IFP(tp)->if_vnet); dev = tp->tun_dev; bpfdetach(TUN2IFP(tp)); if_detach(TUN2IFP(tp)); @@ -261,6 +266,7 @@ knlist_destroy(&tp->tun_rsel.si_note); mtx_destroy(&tp->tun_mtx); free(tp, M_TUN); + CURVNET_RESTORE(); } static void @@ -447,6 +453,7 @@ /* * 
junk all pending output */ + CURVNET_SET(ifp->if_vnet); s = splimp(); IFQ_PURGE(&ifp->if_snd); splx(s); @@ -476,6 +483,7 @@ ifp->if_drv_flags &= ~IFF_DRV_RUNNING; splx(s); } + CURVNET_RESTORE(); funsetown(&tp->tun_sigio); selwakeuppri(&tp->tun_rsel, PZERO + 1); @@ -924,7 +932,9 @@ random_harvest(m, 16, 3, 0, RANDOM_NET); ifp->if_ibytes += m->m_pkthdr.len; ifp->if_ipackets++; + CURVNET_SET(ifp->if_vnet); netisr_dispatch(isr, m); + CURVNET_RESTORE(); return (0); } --- /u/marko/p4/head/src/sys/net/if_var.h 2007-12-27 19:32:15.000000000 +0100 +++ src/sys/net/if_var.h 2008-01-14 19:23:47.000000000 +0100 @@ -70,6 +70,7 @@ struct ether_header; struct carp_if; struct ifvlantrunk; +struct vnet; #endif #include /* get TAILQ macros */ @@ -160,6 +161,10 @@ (void *); int (*if_resolvemulti) /* validate/resolve multicast */ (struct ifnet *, struct sockaddr **, struct sockaddr *); + void (*if_reassign) /* reassign to vnet routine */ + (struct ifnet *, struct vnet *, char *); + struct vnet *if_vnet; /* network stack instance */ + struct vnet *if_home_vnet; /* where this ifnet originates from */ struct ifaddr *if_addr; /* pointer to link-level address */ void *if_llsoftc; /* link layer softc */ int if_drv_flags; /* driver-managed status flags */ @@ -644,20 +649,22 @@ struct cdev *ife_dev; }; -#define ifnet_byindex(idx) ifindex_table[(idx)].ife_ifnet +#define ifnet_byindex(idx) V_ifindex_table[(idx)].ife_ifnet /* * Given the index, ifaddr_byindex() returns the one and only * link-level ifaddr for the interface. You are not supposed to use * it to traverse the list of addresses associated to the interface. 
*/ #define ifaddr_byindex(idx) ifnet_byindex(idx)->if_addr -#define ifdev_byindex(idx) ifindex_table[(idx)].ife_dev +#define ifdev_byindex(idx) V_ifindex_table[(idx)].ife_dev +extern int ifqmaxlen; +#ifndef VIMAGE extern struct ifnethead ifnet; extern struct ifindex_entry *ifindex_table; -extern int ifqmaxlen; -extern struct ifnet *loif; /* first loopback interface */ extern int if_index; +extern struct ifnet *loif; /* first loopback interface */ +#endif /* !VIMAGE */ int if_addgroup(struct ifnet *, const char *); int if_delgroup(struct ifnet *, const char *); @@ -665,6 +672,7 @@ int if_allmulti(struct ifnet *, int); struct ifnet* if_alloc(u_char); void if_attach(struct ifnet *); +void if_grow(void); int if_delmulti(struct ifnet *, struct sockaddr *); void if_delmulti_ifma(struct ifmultiaddr *); void if_detach(struct ifnet *); --- /u/marko/p4/head/src/sys/net/if_vlan.c 2007-10-20 18:52:07.000000000 +0200 +++ src/sys/net/if_vlan.c 2007-10-22 18:06:38.000000000 +0200 @@ -42,6 +42,7 @@ */ #include "opt_vlan.h" +#include "opt_vimage.h" #include #include @@ -55,7 +56,9 @@ #include #include #include +#include +#include #include #include #include @@ -421,6 +424,8 @@ sc = ifp->if_softc; ifp_p = PARENT(sc); + CURVNET_SET_QUIET(ifp_p->if_vnet); + bzero((char *)&sdl, sizeof(sdl)); sdl.sdl_len = sizeof(sdl); sdl.sdl_family = AF_LINK; @@ -455,6 +460,7 @@ return (error); } + CURVNET_RESTORE(); return (0); } @@ -572,13 +578,14 @@ static struct ifnet * vlan_clone_match_ethertag(struct if_clone *ifc, const char *name, int *tag) { + INIT_VNET_NET(curvnet); const char *cp; struct ifnet *ifp; int t = 0; /* Check for . style interface names. 
*/ IFNET_RLOCK(); - TAILQ_FOREACH(ifp, &ifnet, if_link) { + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (ifp->if_type != IFT_ETHER) continue; if (strncmp(ifp->if_xname, name, strlen(ifp->if_xname)) != 0) @@ -1345,6 +1352,12 @@ error = copyin(ifr->ifr_data, &vlr, sizeof(vlr)); if (error) break; +#ifdef VIMAGE + if (ifp->if_home_vnet != ifp->if_vnet) { + error = EPERM; + break; + } +#endif if (vlr.vlr_parent[0] == '\0') { vlan_unconfig(ifp); break; @@ -1372,6 +1385,12 @@ case SIOCGETVLAN: bzero(&vlr, sizeof(vlr)); +#ifdef VIMAGE + if (ifp->if_home_vnet != ifp->if_vnet) { + error = EPERM; + break; + } +#endif VLAN_LOCK(); if (TRUNK(ifv) != NULL) { strlcpy(vlr.vlr_parent, PARENT(ifv)->if_xname, --- /u/marko/p4/head/src/sys/net/netisr.c 2007-08-31 03:47:49.000000000 +0200 +++ src/sys/net/netisr.c 2007-10-22 18:06:38.000000000 +0200 @@ -28,6 +28,7 @@ */ #include "opt_device_polling.h" +#include "opt_vimage.h" #include #include @@ -49,7 +50,9 @@ #include #include +#include +#include #include #include #include @@ -140,7 +143,10 @@ IF_DEQUEUE(ni->ni_queue, m); if (m == NULL) break; + VNET_ASSERT(m->m_pkthdr.rcvif != NULL); + CURVNET_SET(m->m_pkthdr.rcvif->if_vnet); ni->ni_handler(m); + CURVNET_RESTORE(); } } @@ -161,6 +167,7 @@ m_freem(m); return; } + VNET_ASSERT(m->m_pkthdr.rcvif != NULL) /* * Do direct dispatch only for MPSAFE netisrs (and * only when enabled). Note that when a netisr is @@ -172,8 +179,19 @@ * from an interface but does not guarantee ordering * between multiple places in the system (e.g. IP * dispatched from interfaces vs. IP queued from IPSec). + * + * If the kernel was compiled with options VIMAGE, also defer + * dispatch of netisr handlers for mbufs that have crossed a + * boundary between two vnets. Direct dispatching in such + * cases could lead to various LORs, or in most extreme + * circumstances cause the kernel stack to overflow. 
*/ +#ifndef VIMAGE if (netisr_direct && (ni->ni_flags & NETISR_MPSAFE)) { +#else + if (netisr_direct && (ni->ni_flags & NETISR_MPSAFE) && + !(m->m_flags & M_REMOTE_VNET)) { +#endif isrstat.isrs_directed++; /* * NB: We used to drain the queue before handling @@ -184,6 +202,15 @@ */ ni->ni_handler(m); } else { +#ifdef VIMAGE + /* + * Once direct netisr dispatching is avoided using the + * M_REMOTE_VNET flag, it should not be observed any + * more, so clear it here in order to avoid further + * deferring of direct netisr dispatching. + */ + m->m_flags &= ~M_REMOTE_VNET; +#endif isrstat.isrs_deferred++; if (IF_HANDOFF(ni->ni_queue, m, NULL)) schednetisr(num); @@ -210,6 +237,10 @@ m_freem(m); return (ENXIO); } + VNET_ASSERT(m->m_pkthdr.rcvif != NULL) +#ifdef VIMAGE + m->m_flags &= ~M_REMOTE_VNET; +#endif isrstat.isrs_queued++; if (!IF_HANDOFF(ni->ni_queue, m, NULL)) return (ENOBUFS); /* IF_HANDOFF has free'd the mbuf */ --- /u/marko/p4/head/src/sys/net/raw_cb.c 2007-08-31 03:47:49.000000000 +0200 +++ src/sys/net/raw_cb.c 2007-10-22 18:06:38.000000000 +0200 @@ -30,6 +30,8 @@ * $FreeBSD: src/sys/net/raw_cb.c,v 1.34 2006/06/02 08:27:15 rwatson Exp $ */ +#include "opt_vimage.h" + #include #include #include @@ -39,7 +41,9 @@ #include #include #include +#include +#include #include /* @@ -52,7 +56,9 @@ */ struct mtx rawcb_mtx; +#ifndef VIMAGE struct rawcb_list_head rawcb_list; +#endif const static u_long raw_sendspace = RAWSNDQ; const static u_long raw_recvspace = RAWRCVQ; @@ -66,6 +72,7 @@ register struct socket *so; int proto; { + INIT_VNET_NET(so->so_vnet); register struct rawcb *rp = sotorawcb(so); int error; @@ -83,7 +90,7 @@ rp->rcb_proto.sp_family = so->so_proto->pr_domain->dom_family; rp->rcb_proto.sp_protocol = proto; mtx_lock(&rawcb_mtx); - LIST_INSERT_HEAD(&rawcb_list, rp, list); + LIST_INSERT_HEAD(&V_rawcb_list, rp, list); mtx_unlock(&rawcb_mtx); return (0); } --- /u/marko/p4/head/src/sys/net/raw_cb.h 2007-08-31 03:47:49.000000000 +0200 +++ src/sys/net/raw_cb.h
2007-10-05 12:26:49.000000000 +0200 @@ -56,7 +56,11 @@ #define RAWRCVQ 8192 #ifdef _KERNEL + +#ifndef VIMAGE extern LIST_HEAD(rawcb_list_head, rawcb) rawcb_list; +#endif + extern struct mtx rawcb_mtx; /* protosw entries */ --- /u/marko/p4/head/src/sys/net/raw_usrreq.c 2007-08-31 03:47:49.000000000 +0200 +++ src/sys/net/raw_usrreq.c 2007-10-22 18:06:38.000000000 +0200 @@ -30,6 +30,8 @@ * $FreeBSD: src/sys/net/raw_usrreq.c,v 1.44 2006/11/06 13:42:02 rwatson Exp $ */ +#include "opt_vimage.h" + #include #include #include @@ -43,7 +45,9 @@ #include #include #include +#include +#include #include MTX_SYSINIT(rawcb_mtx, &rawcb_mtx, "rawcb", MTX_DEF); @@ -54,8 +58,11 @@ void raw_init() { +#ifndef VIMAGE + INIT_VNET_NET(curvnet); - LIST_INIT(&rawcb_list); + LIST_INIT(&V_rawcb_list); +#endif } @@ -73,13 +80,14 @@ register struct sockproto *proto; struct sockaddr *src, *dst; { + INIT_VNET_NET(curvnet); register struct rawcb *rp; register struct mbuf *m = m0; struct socket *last; last = 0; mtx_lock(&rawcb_mtx); - LIST_FOREACH(rp, &rawcb_list, list) { + LIST_FOREACH(rp, &V_rawcb_list, list) { if (rp->rcb_proto.sp_family != proto->sp_family) continue; if (rp->rcb_proto.sp_protocol && --- /u/marko/p4/head/src/sys/net/route.c 2008-02-27 18:29:00.000000000 +0100 +++ src/sys/net/route.c 2008-02-27 11:48:38.000000000 +0100 @@ -32,6 +32,7 @@ #include "opt_inet.h" #include "opt_mrouting.h" +#include "opt_vimage.h" #include #include @@ -40,7 +41,9 @@ #include #include #include +#include +#include #include #include @@ -49,14 +52,18 @@ #include +#ifndef VIMAGE static struct rtstat rtstat; struct radix_node_head *rt_tables[AF_MAX+1]; - static int rttrash; /* routes not in table but not freed */ +#endif /* !VIMAGE */ static void rt_maskedcopy(struct sockaddr *, struct sockaddr *, struct sockaddr *); -static void rtable_init(void **); +static int rtable_init(const void *); +#ifdef VIMAGE +static int rtable_idetach(const void *); +#endif /* compare two sockaddr structures */ #define 
sa_equal(a1, a2) (bcmp((a1), (a2), (a1)->sa_len) == 0) @@ -73,15 +80,38 @@ */ #define RNTORT(p) ((struct rtentry *)(p)) -static void -rtable_init(void **table) +VNET_MOD_DECLARE_STATELESS(RTABLE, rtable, rtable_init, rtable_idetach, + NET) + +static int +rtable_init(unused) + const void *unused; { + INIT_VNET_NET(curvnet); + struct domain *dom; for (dom = domains; dom; dom = dom->dom_next) if (dom->dom_rtattach) - dom->dom_rtattach(&table[dom->dom_family], + dom->dom_rtattach((void *)&V_rt_tables[dom->dom_family], + dom->dom_rtoffset); + return 0; +} + +#ifdef VIMAGE +static int +rtable_idetach(unused) + const void *unused; +{ + INIT_VNET_NET(curvnet); + + struct domain *dom; + for (dom = domains; dom; dom = dom->dom_next) + if (dom->dom_rtdetach) + dom->dom_rtdetach((void *)&V_rt_tables[dom->dom_family], dom->dom_rtoffset); + return 0; } +#endif static uma_zone_t rtzone; /* Routing table UMA zone. */ @@ -91,7 +121,11 @@ rtzone = uma_zcreate("rtentry", sizeof(struct rtentry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); rn_init(); /* initialize all zeroes, all ones, mask table */ - rtable_init((void **)rt_tables); +#ifdef VIMAGE + vnet_mod_register(&vnet_rtable_modinfo); +#else + rtable_init(NULL); +#endif } /* @@ -128,7 +162,8 @@ struct rtentry * rtalloc1(struct sockaddr *dst, int report, u_long ignflags) { - struct radix_node_head *rnh = rt_tables[dst->sa_family]; + INIT_VNET_NET(curvnet); + struct radix_node_head *rnh = V_rt_tables[dst->sa_family]; struct rtentry *rt; struct radix_node *rn; struct rtentry *newrt; @@ -141,7 +176,7 @@ * Look up the address in the table for that Address Family */ if (rnh == NULL) { - rtstat.rts_unreach++; + V_rtstat.rts_unreach++; goto miss2; } RADIX_NODE_HEAD_LOCK(rnh); @@ -203,7 +238,7 @@ * Which basically means * "caint get there frm here" */ - rtstat.rts_unreach++; + V_rtstat.rts_unreach++; miss: RADIX_NODE_HEAD_UNLOCK(rnh); miss2: if (report) { @@ -229,10 +264,11 @@ void rtfree(struct rtentry *rt) { + INIT_VNET_NET(curvnet); 
struct radix_node_head *rnh; KASSERT(rt != NULL,("%s: NULL rt", __func__)); - rnh = rt_tables[rt_key(rt)->sa_family]; + rnh = V_rt_tables[rt_key(rt)->sa_family]; KASSERT(rnh != NULL,("%s: NULL rnh", __func__)); RT_LOCK_ASSERT(rt); @@ -271,7 +307,7 @@ * the rtentry must have been removed from the routing table * so it is represented in rttrash.. remove that now. */ - rttrash--; + V_rttrash--; #ifdef DIAGNOSTIC if (rt->rt_refcnt < 0) { printf("rtfree: %p not freed (neg refs)\n", rt); @@ -318,6 +354,7 @@ int flags, struct sockaddr *src) { + INIT_VNET_NET(curvnet); struct rtentry *rt, *rt0 = NULL; int error = 0; short *stat = NULL; @@ -381,7 +418,7 @@ if (rt0) RTFREE_LOCKED(rt0); - stat = &rtstat.rts_dynamic; + stat = &V_rtstat.rts_dynamic; } else { struct rtentry *gwrt; @@ -391,7 +428,7 @@ */ rt->rt_flags |= RTF_MODIFIED; flags |= RTF_MODIFIED; - stat = &rtstat.rts_newgateway; + stat = &V_rtstat.rts_newgateway; /* * add the key and gateway (in one malloc'd chunk). */ @@ -407,7 +444,7 @@ RTFREE_LOCKED(rt); out: if (error) - rtstat.rts_badredirect++; + V_rtstat.rts_badredirect++; else if (stat != NULL) (*stat)++; bzero((caddr_t)&info, sizeof(info)); @@ -591,6 +628,7 @@ int rtexpunge(struct rtentry *rt) { + INIT_VNET_NET(curvnet); struct radix_node *rn; struct radix_node_head *rnh; struct ifaddr *ifa; @@ -608,7 +646,7 @@ /* * Find the correct routing tree to use for this Address Family */ - rnh = rt_tables[rt_key(rt)->sa_family]; + rnh = V_rt_tables[rt_key(rt)->sa_family]; if (rnh == NULL) return (EAFNOSUPPORT); @@ -666,7 +704,7 @@ * one more rtentry floating around that is not * linked to the routing table. 
*/ - rttrash++; + V_rttrash++; bad: RADIX_NODE_HEAD_UNLOCK(rnh); return (error); @@ -675,6 +713,7 @@ int rtrequest1(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt) { + INIT_VNET_NET(curvnet); int error = 0; register struct rtentry *rt; register struct radix_node *rn; @@ -686,7 +725,7 @@ /* * Find the correct routing tree to use for this Address Family */ - rnh = rt_tables[dst->sa_family]; + rnh = V_rt_tables[dst->sa_family]; if (rnh == NULL) return (EAFNOSUPPORT); RADIX_NODE_HEAD_LOCK(rnh); @@ -745,7 +784,7 @@ * linked to the routing table. rttrash will be decremented * when RTFREE(rt) is eventually called. */ - rttrash++; + V_rttrash++; /* * If the caller wants it, then it can have it, @@ -1021,8 +1060,9 @@ int rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate) { + INIT_VNET_NET(curvnet); /* XXX dst may be overwritten, can we move this to below */ - struct radix_node_head *rnh = rt_tables[dst->sa_family]; + struct radix_node_head *rnh = V_rt_tables[dst->sa_family]; int dlen = SA_SIZE(dst), glen = SA_SIZE(gate); again: @@ -1161,6 +1201,7 @@ int rtinit(struct ifaddr *ifa, int cmd, int flags) { + INIT_VNET_NET(curvnet); struct sockaddr *dst; struct sockaddr *netmask; struct mbuf *m = NULL; @@ -1205,7 +1246,7 @@ * Look up an rtentry that is in the routing tree and * contains the correct info. 
*/ - if ((rnh = rt_tables[dst->sa_family]) == NULL) + if ((rnh = V_rt_tables[dst->sa_family]) == NULL) goto bad; RADIX_NODE_HEAD_LOCK(rnh); error = ((rn = rnh->rnh_lookup(dst, netmask, rnh)) == NULL || --- /u/marko/p4/head/src/sys/net/rtsock.c 2007-09-08 22:08:44.000000000 +0200 +++ src/sys/net/rtsock.c 2007-10-22 18:06:38.000000000 +0200 @@ -29,7 +29,10 @@ * @(#)rtsock.c 8.7 (Berkeley) 10/12/95 * $FreeBSD: src/sys/net/rtsock.c,v 1.143 2007/09/08 19:28:45 cognet Exp $ */ + #include "opt_sctp.h" +#include "opt_vimage.h" + #include #include #include @@ -44,7 +47,9 @@ #include #include #include +#include +#include #include #include #include @@ -312,6 +317,7 @@ route_output(struct mbuf *m, struct socket *so) { #define sa_equal(a1, a2) (bcmp((a1), (a2), (a1)->sa_len) == 0) + INIT_VNET_NET(so->so_vnet); struct rt_msghdr *rtm = NULL; struct rtentry *rt = NULL; struct radix_node_head *rnh; @@ -410,7 +416,7 @@ case RTM_GET: case RTM_CHANGE: case RTM_LOCK: - rnh = rt_tables[info.rti_info[RTAX_DST]->sa_family]; + rnh = V_rt_tables[info.rti_info[RTAX_DST]->sa_family]; if (rnh == NULL) senderr(EAFNOSUPPORT); RADIX_NODE_HEAD_LOCK(rnh); @@ -1052,6 +1058,7 @@ static void rt_dispatch(struct mbuf *m, const struct sockaddr *sa) { + INIT_VNET_NET(curvnet); struct m_tag *tag; /* @@ -1069,6 +1076,14 @@ *(unsigned short *)(tag + 1) = sa->sa_family; m_tag_prepend(m, tag); } +#ifdef VIMAGE + if (V_loif) + m->m_pkthdr.rcvif = V_loif; + else { + m_freem(m); + return; + } +#endif netisr_queue(NETISR_ROUTE, m); /* mbuf is free'd on failure. 
*/ } @@ -1115,6 +1130,7 @@ static int sysctl_iflist(int af, struct walkarg *w) { + INIT_VNET_NET(curvnet); struct ifnet *ifp; struct ifaddr *ifa; struct rt_addrinfo info; @@ -1122,7 +1138,7 @@ bzero((caddr_t)&info, sizeof(info)); IFNET_RLOCK(); - TAILQ_FOREACH(ifp, &ifnet, if_link) { + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (w->w_arg && w->w_arg != ifp->if_index) continue; ifa = ifp->if_addr; @@ -1175,6 +1191,7 @@ int sysctl_ifmalist(int af, struct walkarg *w) { + INIT_VNET_NET(curvnet); struct ifnet *ifp; struct ifmultiaddr *ifma; struct rt_addrinfo info; @@ -1183,7 +1200,7 @@ bzero((caddr_t)&info, sizeof(info)); IFNET_RLOCK(); - TAILQ_FOREACH(ifp, &ifnet, if_link) { + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (w->w_arg && w->w_arg != ifp->if_index) continue; ifa = ifp->if_addr; @@ -1224,6 +1241,7 @@ static int sysctl_rtsock(SYSCTL_HANDLER_ARGS) { + INIT_VNET_NET(curvnet); int *name = (int *)arg1; u_int namelen = arg2; struct radix_node_head *rnh; @@ -1258,7 +1276,7 @@ } else /* dump only one table */ i = lim = af; for (error = 0; error == 0 && i <= lim; i++) - if ((rnh = rt_tables[i]) != NULL) { + if ((rnh = V_rt_tables[i]) != NULL) { RADIX_NODE_HEAD_LOCK(rnh); error = rnh->rnh_walktree(rnh, sysctl_dumpentry, &w); --- /dev/null 2008-02-27 21:11:00.000000000 +0100 +++ src/sys/net/vnet.h 2007-10-05 12:26:49.000000000 +0200 @@ -0,0 +1,92 @@ +/*- + * Copyright (c) 2006 University of Zagreb + * Copyright (c) 2006 FreeBSD Foundation + * + * This software was developed by the University of Zagreb and the + * FreeBSD Foundation under sponsorship by the Stichting NLnet and the + * FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * XXX RCS tag goes here + */ + +#ifndef _NET_VNET_H_ +#define _NET_VNET_H_ + + +#ifdef VIMAGE +#include +#include +#include + +#include +#include +#include +#include + +struct vnet_net { + int _if_index; + struct ifindex_entry *_ifindex_table; + struct ifnethead _ifnet; + struct ifgrouphead _ifg_head; + + int _if_indexlim; + struct knlist _ifklist; + + struct rtstat _rtstat; + struct radix_node_head *_rt_tables[AF_MAX+1]; + int _rttrash; + + struct ifnet *_loif; + LIST_HEAD(, lo_softc) _lo_list; + + LIST_HEAD(, rawcb) _rawcb_list; + + int _ether_ipfw; +}; + +#endif + +/* + * Symbol translation macros + */ +#define INIT_VNET_NET(vnet) \ + INIT_FROM_VNET(vnet, VNET_MOD_NET, struct vnet_net, vnet_net) + +#define VNET_NET(sym) VSYM(vnet_net, sym) + +#define V_if_index VNET_NET(if_index) +#define V_ifindex_table VNET_NET(ifindex_table) +#define V_ifnet VNET_NET(ifnet) +#define V_ifg_head VNET_NET(ifg_head) +#define V_if_indexlim VNET_NET(if_indexlim) +#define V_ifklist VNET_NET(ifklist) 
+#define V_rtstat VNET_NET(rtstat) +#define V_rt_tables VNET_NET(rt_tables) +#define V_rttrash VNET_NET(rttrash) +#define V_loif VNET_NET(loif) +#define V_lo_list VNET_NET(lo_list) +#define V_rawcb_list VNET_NET(rawcb_list) +#define V_ether_ipfw VNET_NET(ether_ipfw) + +#endif /* !_NET_VNET_H_ */ --- /u/marko/p4/head/src/sys/net80211/ieee80211.c 2007-12-27 19:32:16.000000000 +0100 +++ src/sys/net80211/ieee80211.c 2008-01-14 19:23:50.000000000 +0100 @@ -31,14 +31,19 @@ * IEEE 802.11 generic handler */ +#include "opt_vimage.h" + #include #include #include #include +#include #include +#include #include +#include #include #include @@ -77,6 +82,7 @@ static int media_status(enum ieee80211_opmode , const struct ieee80211_channel *); +static struct ieee80211com * ieee80211_find_instance(struct ifnet *ifp); /* list of all instances */ SLIST_HEAD(ieee80211_list, ieee80211com); @@ -213,6 +219,9 @@ ether_ifattach(ifp, ic->ic_myaddr); ifp->if_output = ieee80211_output; +#ifdef VIMAGE + ifp->if_reassign = NULL; /* Override ether_reassign() */ +#endif bpfattach2(ifp, DLT_IEEE802_11, sizeof(struct ieee80211_frame_addr4), &ic->ic_rawbpf); @@ -303,6 +312,30 @@ ether_ifdetach(ifp); } +#ifdef VIMAGE +void +ieee80211_reassign(struct ieee80211com *ic, struct vnet *vnet, char *dname) +{ + u_char eaddr[6]; + struct ifnet *ifp = ic->ic_ifp; + + bcopy(IF_LLADDR(ifp), eaddr, 6); + bpfdetach(ifp); + ether_ifdetach(ifp); + ifp->if_bpf = NULL; + ic->ic_rawbpf = NULL; + if_reassign_common(ifp, vnet, ifp->if_dname); + if (dname) + snprintf(ifp->if_xname, IFNAMSIZ, "%s", dname); + + CURVNET_SET_QUIET(vnet); + ether_ifattach(ifp, eaddr); + bpfattach2(ifp, DLT_IEEE802_11, + sizeof(struct ieee80211_frame_addr4), &ic->ic_rawbpf); + CURVNET_RESTORE(); +} +#endif + static __inline int mapgsm(u_int freq, u_int flags) { --- /u/marko/p4/head/src/sys/net80211/ieee80211_freebsd.c 2007-11-13 02:49:09.000000000 +0100 +++ src/sys/net80211/ieee80211_freebsd.c 2007-12-10 11:26:09.000000000 +0100 @@ -26,6 +26,8 @@ 
#include __FBSDID("$FreeBSD: src/sys/net80211/ieee80211_freebsd.c,v 1.17 2007/11/02 05:22:24 sam Exp $"); +#include "opt_vimage.h" + /* * IEEE 802.11 support (FreeBSD-specific code) */ @@ -37,8 +39,8 @@ #include #include #include - #include +#include #include #include @@ -296,6 +298,7 @@ struct ifnet *ifp = ic->ic_ifp; struct ieee80211_join_event iev; + CURVNET_SET(ifp->if_vnet); memset(&iev, 0, sizeof(iev)); if (ni == ic->ic_bss) { IEEE80211_ADDR_COPY(iev.iev_addr, ni->ni_bssid); @@ -309,6 +312,7 @@ RTM_IEEE80211_JOIN : RTM_IEEE80211_REJOIN, &iev, sizeof(iev)); } + CURVNET_RESTORE(); } void @@ -317,6 +321,7 @@ struct ifnet *ifp = ic->ic_ifp; struct ieee80211_leave_event iev; + CURVNET_SET_QUIET(ifp->if_vnet); if (ni == ic->ic_bss) { rt_ieee80211msg(ifp, RTM_IEEE80211_DISASSOC, NULL, 0); if_link_state_change(ifp, LINK_STATE_DOWN); @@ -326,6 +331,7 @@ IEEE80211_ADDR_COPY(iev.iev_addr, ni->ni_macaddr); rt_ieee80211msg(ifp, RTM_IEEE80211_LEAVE, &iev, sizeof(iev)); } + CURVNET_RESTORE(); } void @@ -336,7 +342,9 @@ IEEE80211_DPRINTF(ic, IEEE80211_MSG_SCAN, "%s\n", "notify scan done"); /* dispatch wireless event indicating scan completed */ + CURVNET_SET(ifp->if_vnet); rt_ieee80211msg(ifp, RTM_IEEE80211_SCAN, NULL, 0); + CURVNET_RESTORE(); } void @@ -364,7 +372,9 @@ iev.iev_keyix = k->wk_keyix; iev.iev_keyrsc = k->wk_keyrsc; iev.iev_rsc = rsc; + CURVNET_SET(ifp->if_vnet); rt_ieee80211msg(ifp, RTM_IEEE80211_REPLAY, &iev, sizeof(iev)); + CURVNET_RESTORE(); } } @@ -386,7 +396,9 @@ IEEE80211_ADDR_COPY(iev.iev_src, wh->i_addr2); iev.iev_cipher = IEEE80211_CIPHER_TKIP; iev.iev_keyix = keyix; + CURVNET_SET(ifp->if_vnet); rt_ieee80211msg(ifp, RTM_IEEE80211_MICHAEL, &iev, sizeof(iev)); + CURVNET_RESTORE(); } } --- /u/marko/p4/head/src/sys/net80211/ieee80211_ioctl.c 2007-11-13 02:49:09.000000000 +0100 +++ src/sys/net80211/ieee80211_ioctl.c 2007-12-10 11:26:09.000000000 +0100 @@ -35,6 +35,7 @@ #include "opt_inet.h" #include "opt_ipx.h" +#include "opt_vimage.h" #include #include @@ 
-43,6 +44,7 @@ #include #include #include +#include #include #include --- /u/marko/p4/head/src/sys/net80211/ieee80211_var.h 2007-11-27 15:48:32.000000000 +0100 +++ src/sys/net80211/ieee80211_var.h 2007-12-10 11:26:10.000000000 +0100 @@ -401,6 +401,8 @@ void ieee80211_ifattach(struct ieee80211com *); void ieee80211_ifdetach(struct ieee80211com *); +void ieee80211_reassign(struct ieee80211com *, struct vnet *, char *); + const struct ieee80211_rateset *ieee80211_get_suprates(struct ieee80211com *ic, const struct ieee80211_channel *); void ieee80211_announce(struct ieee80211com *); --- /u/marko/p4/head/src/sys/netgraph/netgraph.h 2008-01-31 10:37:19.000000000 +0100 +++ src/sys/netgraph/netgraph.h 2008-02-27 11:48:48.000000000 +0100 @@ -351,6 +351,7 @@ LIST_ENTRY(ng_node) nd_idnodes; /* ID hash collision list */ TAILQ_ENTRY(ng_node) nd_work; /* nodes with work to do */ struct ng_queue nd_input_queue; /* input queue for locking */ + struct vnet *nd_vnet; /* network stack instance */ #ifdef NETGRAPH_DEBUG /*----------------------------------------------*/ #define ND_MAGIC 0x59264837 int nd_magic; @@ -1123,6 +1124,7 @@ struct ng_type *ng_findtype(const char *type); int ng_make_node_common(struct ng_type *typep, node_p *nodep); int ng_name_node(node_p node, const char *name); +node_p ng_name2noderef(node_p node, const char *name); int ng_newtype(struct ng_type *tp); ng_ID_t ng_node2ID(node_p node); item_p ng_package_data(struct mbuf *m, int flags); --- /u/marko/p4/head/src/sys/netgraph/ng_base.c 2008-02-27 18:29:04.000000000 +0100 +++ src/sys/netgraph/ng_base.c 2008-02-27 17:58:56.000000000 +0100 @@ -46,6 +46,8 @@ * This file implements the base netgraph code. 
*/ +#include "opt_vimage.h" + #include #include #include @@ -61,9 +63,11 @@ #include #include #include +#include #include +#include #include #include #include @@ -71,7 +75,9 @@ MODULE_VERSION(netgraph, NG_ABI_VERSION); /* List of all active nodes */ +#ifndef VIMAGE static LIST_HEAD(, ng_node) ng_nodelist; +#endif static struct mtx ng_nodelist_mtx; /* Mutex to protect topology events. */ @@ -88,8 +94,8 @@ static void ng_dumpitems(void); static void ng_dumpnodes(void); static void ng_dumphooks(void); - #endif /* NETGRAPH_DEBUG */ + /* * DEAD versions of the structures. * In order to avoid races, it is sometimes neccesary to point @@ -169,15 +175,16 @@ /* Hash related definitions */ /* XXX Don't need to initialise them because it's a LIST */ -#define NG_ID_HASH_SIZE 32 /* most systems wont need even this many */ +#ifndef VIMAGE static LIST_HEAD(, ng_node) ng_ID_hash[NG_ID_HASH_SIZE]; +#endif static struct mtx ng_idhash_mtx; /* Method to find a node.. used twice so do it here */ #define NG_IDHASH_FN(ID) ((ID) % (NG_ID_HASH_SIZE)) #define NG_IDHASH_FIND(ID, node) \ do { \ mtx_assert(&ng_idhash_mtx, MA_OWNED); \ - LIST_FOREACH(node, &ng_ID_hash[NG_IDHASH_FN(ID)], \ + LIST_FOREACH(node, &V_ng_ID_hash[NG_IDHASH_FN(ID)], \ nd_idnodes) { \ if (NG_NODE_IS_VALID(node) \ && (NG_NODE_ID(node) == ID)) { \ @@ -207,7 +214,6 @@ /* Imported, these used to be externally visible, some may go back. 
*/ void ng_destroy_hook(hook_p hook); -node_p ng_name2noderef(node_p node, const char *name); int ng_path2noderef(node_p here, const char *path, node_p *dest, hook_p *lasthook); int ng_make_node(const char *type, node_p *nodepp); @@ -243,6 +249,14 @@ #define NG_WORKLIST_UNLOCK() \ mtx_unlock(&ng_worklist_mtx) +static vnet_attach_fn vnet_netgraph_iattach; +#ifdef VIMAGE +static vnet_detach_fn vnet_netgraph_idetach; +#endif /* VIMAGE */ + +VNET_MOD_DECLARE(NETGRAPH, netgraph, vnet_netgraph_iattach, + vnet_netgraph_idetach, LOIF, NULL) + #ifdef NETGRAPH_DEBUG /*----------------------------------------------*/ /* * In debug mode: @@ -341,7 +355,9 @@ #define TRAP_ERROR() #endif -static ng_ID_t nextID = 1; +#ifndef VIMAGE +static ng_ID_t nextID; +#endif #ifdef INVARIANTS #define CHECK_DATA_MBUF(m) do { \ @@ -565,7 +581,8 @@ return (EINVAL); } - /* Locate the node type. If we fail we return. Do not try to load + /* + * Locate the node type. If we fail we return. Do not try to load * module. */ if ((type = ng_findtype(typename)) == NULL) @@ -603,6 +620,7 @@ int ng_make_node_common(struct ng_type *type, node_p *nodepp) { + INIT_VNET_NETGRAPH(curvnet); node_p node; /* Require the node type to have been already installed */ @@ -618,6 +636,9 @@ return (ENOMEM); } node->nd_type = type; +#ifdef VIMAGE + node->nd_vnet = curvnet; +#endif NG_NODE_REF(node); /* note reference */ type->refs++; @@ -632,7 +653,7 @@ /* Link us into the node linked list */ mtx_lock(&ng_nodelist_mtx); - LIST_INSERT_HEAD(&ng_nodelist, node, nd_nodes); + LIST_INSERT_HEAD(&V_ng_nodelist, node, nd_nodes); mtx_unlock(&ng_nodelist_mtx); @@ -640,7 +661,7 @@ mtx_lock(&ng_idhash_mtx); for (;;) { /* wrap protection, even if silly */ node_p node2 = NULL; - node->nd_ID = nextID++; /* 137/second for 1 year before wrap */ + node->nd_ID = V_nextID++; /* 137/sec for 1 year before wrap */ /* Is there a problem with the new number? */ NG_IDHASH_FIND(node->nd_ID, node2); /* already taken? 
*/ @@ -648,7 +669,7 @@ break; } } - LIST_INSERT_HEAD(&ng_ID_hash[NG_IDHASH_FN(node->nd_ID)], + LIST_INSERT_HEAD(&V_ng_ID_hash[NG_IDHASH_FN(node->nd_ID)], node, nd_idnodes); mtx_unlock(&ng_idhash_mtx); @@ -789,6 +810,7 @@ static node_p ng_ID2noderef(ng_ID_t ID) { + INIT_VNET_NETGRAPH(curvnet); node_p node; mtx_lock(&ng_idhash_mtx); NG_IDHASH_FIND(ID, node); @@ -857,6 +879,7 @@ node_p ng_name2noderef(node_p here, const char *name) { + INIT_VNET_NETGRAPH(curvnet); node_p node; ng_ID_t temp; @@ -873,7 +896,7 @@ /* Find node by name */ mtx_lock(&ng_nodelist_mtx); - LIST_FOREACH(node, &ng_nodelist, nd_nodes) { + LIST_FOREACH(node, &V_ng_nodelist, nd_nodes) { if (NG_NODE_IS_VALID(node) && NG_NODE_HAS_NAME(node) && (strcmp(NG_NODE_NAME(node), name) == 0)) { @@ -2548,6 +2571,7 @@ static int ng_generic_msg(node_p here, item_p item, hook_p lasthook) { + INIT_VNET_NETGRAPH(curvnet); int error = 0; struct ng_mesg *msg; struct ng_mesg *resp = NULL; @@ -2706,7 +2730,7 @@ mtx_lock(&ng_nodelist_mtx); /* Count number of nodes */ - LIST_FOREACH(node, &ng_nodelist, nd_nodes) { + LIST_FOREACH(node, &V_ng_nodelist, nd_nodes) { if (NG_NODE_IS_VALID(node) && (unnamed || NG_NODE_HAS_NAME(node))) { num++; @@ -2726,7 +2750,7 @@ /* Cycle through the linked list of nodes */ nl->numnames = 0; mtx_lock(&ng_nodelist_mtx); - LIST_FOREACH(node, &ng_nodelist, nd_nodes) { + LIST_FOREACH(node, &V_ng_nodelist, nd_nodes) { struct nodeinfo *const np = &nl->nodeinfo[nl->numnames]; if (NG_NODE_NOT_VALID(node)) @@ -3153,6 +3177,11 @@ uma_zone_set_max(ng_qzone, maxalloc); netisr_register(NETISR_NETGRAPH, (netisr_t *)ngintr, NULL, NETISR_MPSAFE); +#ifdef VIMAGE + vnet_mod_register(&vnet_netgraph_modinfo); +#else + vnet_netgraph_iattach(NULL); +#endif /* !VIMAGE */ break; case MOD_UNLOAD: /* You can't unload it because an interface may be using it. 
*/ @@ -3165,6 +3194,42 @@ return (error); } +static int vnet_netgraph_iattach(const void *unused) +{ + INIT_VNET_NETGRAPH(curvnet); + + LIST_INIT(&V_ng_nodelist); + V_nextID = 1; + + return 0; +} + +#ifdef VIMAGE +static int vnet_netgraph_idetach(const void *unused) +{ + INIT_VNET_NETGRAPH(curvnet); + node_p node, last_killed = NULL; + + while ((node = LIST_FIRST(&V_ng_nodelist)) != NULL) { + if (node == last_killed) { + /* This should never happen */ + node->nd_flags |= NGF_REALLY_DIE; + printf("netgraph node %s needs NGF_REALLY_DIE\n", + node->nd_name); + ng_rmnode(node, NULL, NULL, 0); + /* This must never happen */ + if (node == LIST_FIRST(&V_ng_nodelist)) + panic("netgraph node %s won't die", + node->nd_name); + } + ng_rmnode(node, NULL, NULL, 0); + last_killed = node; + } + + return 0; +} +#endif /* VIMAGE */ + static moduledata_t netgraph_mod = { "netgraph", ngb_mod_event, @@ -3326,6 +3391,7 @@ NG_WORKLIST_UNLOCK(); break; } + CURVNET_SET(node->nd_vnet); node->nd_flags &= ~NGF_WORKQ; TAILQ_REMOVE(&ng_worklist, node, nd_work); NG_WORKLIST_UNLOCK(); @@ -3360,6 +3426,7 @@ } } NG_NODE_UNREF(node); + CURVNET_RESTORE(); } } @@ -3711,7 +3778,9 @@ { item_p item = arg; + CURVNET_SET(NGI_NODE(item)->nd_vnet); ng_snd_item(item, 0); + CURVNET_RESTORE(); } --- /u/marko/p4/head/src/sys/netgraph/ng_bridge.c 2007-08-31 03:47:58.000000000 +0200 +++ src/sys/netgraph/ng_bridge.c 2007-10-05 12:26:59.000000000 +0200 @@ -95,13 +95,14 @@ /* Per-node private data */ struct ng_bridge_private { struct ng_bridge_bucket *tab; /* hash table bucket array */ - struct ng_bridge_link *links[NG_BRIDGE_MAX_LINKS]; + struct ng_bridge_link *links[NG_BRIDGE_MAX_LINKS + 1]; struct ng_bridge_config conf; /* node configuration */ node_p node; /* netgraph node */ u_int numHosts; /* num entries in table */ u_int numBuckets; /* num buckets in table */ u_int hashMask; /* numBuckets - 1 */ int numLinks; /* num connected links */ + int persistent; /* can exist w/o any hooks */ struct callout timer; /* 
one second periodic timer */ }; typedef struct ng_bridge_private *priv_p; @@ -342,13 +343,13 @@ ng_bridge_newhook(node_p node, hook_p hook, const char *name) { const priv_p priv = NG_NODE_PRIVATE(node); + int linkNum = -1; /* Check for a link hook */ if (strncmp(name, NG_BRIDGE_HOOK_LINK_PREFIX, strlen(NG_BRIDGE_HOOK_LINK_PREFIX)) == 0) { const char *cp; char *eptr; - u_long linkNum; cp = name + strlen(NG_BRIDGE_HOOK_LINK_PREFIX); if (!isdigit(*cp) || (cp[0] == '0' && cp[1] != '\0')) @@ -356,6 +357,14 @@ linkNum = strtoul(cp, &eptr, 10); if (*eptr != '\0' || linkNum >= NG_BRIDGE_MAX_LINKS) return (EINVAL); + } else if (strcmp(name, "anchor") == 0) { + linkNum = NG_BRIDGE_MAX_LINKS; + if (priv->persistent) + return (EISCONN); + priv->persistent = 1; + } + + if (linkNum >= 0 ) { if (priv->links[linkNum] != NULL) return (EISCONN); MALLOC(priv->links[linkNum], struct ng_bridge_link *, @@ -366,7 +375,7 @@ NG_HOOK_SET_PRIVATE(hook, (void *)linkNum); priv->numLinks++; return (0); - } + } /* Unknown hook name */ return (EINVAL); @@ -782,7 +791,7 @@ /* Get link number */ linkNum = (intptr_t)NG_HOOK_PRIVATE(hook); - KASSERT(linkNum >= 0 && linkNum < NG_BRIDGE_MAX_LINKS, + KASSERT(linkNum >= 0 && linkNum <= NG_BRIDGE_MAX_LINKS, ("%s: linkNum=%u", __func__, linkNum)); /* Remove all hosts associated with this link */ @@ -796,7 +805,8 @@ /* If no more hooks, go away */ if ((NG_NODE_NUMHOOKS(NG_HOOK_NODE(hook)) == 0) - && (NG_NODE_IS_VALID(NG_HOOK_NODE(hook)))) { + && (NG_NODE_IS_VALID(NG_HOOK_NODE(hook))) + && !priv->persistent) { ng_rmnode_self(NG_HOOK_NODE(hook)); } return (0); --- /u/marko/p4/head/src/sys/netgraph/ng_eiface.c 2007-08-31 03:47:58.000000000 +0200 +++ src/sys/netgraph/ng_eiface.c 2007-12-01 01:36:47.000000000 +0100 @@ -28,6 +28,8 @@ * $FreeBSD: src/sys/netgraph/ng_eiface.c,v 1.39 2007/07/26 10:54:33 glebius Exp $ */ +#include "opt_vimage.h" + #include #include #include @@ -38,11 +40,14 @@ #include #include #include +#include #include #include #include +#include 
+#include #include #include #include @@ -111,7 +116,15 @@ }; NETGRAPH_INIT(eiface, &typestruct); +static vnet_attach_fn ng_eiface_iattach; +static vnet_detach_fn ng_eiface_idetach; + +#ifndef VIMAGE static struct unrhdr *ng_eiface_unit; +#endif + +VNET_MOD_DECLARE_STATELESS(NG_EIFACE, ng_eiface, ng_eiface_iattach, + ng_eiface_idetach, NETGRAPH) /************************************************************************ INTERFACE STUFF @@ -244,6 +257,14 @@ * Send packet; if hook is not connected, mbuf will get * freed. */ +#ifdef VIMAGE + if (ifp->if_vnet != node->nd_vnet) { + m->m_flags |= M_REMOTE_VNET; + CURVNET_SET_QUIET(node->nd_vnet); + NG_SEND_DATA_ONLY(error, priv->ether, m); + CURVNET_RESTORE(); + } else +#endif NG_SEND_DATA_ONLY(error, priv->ether, m); /* Update stats */ @@ -332,6 +353,7 @@ static int ng_eiface_constructor(node_p node) { + INIT_VNET_NETGRAPH(curvnet); struct ifnet *ifp; priv_p priv; u_char eaddr[6] = {0,0,0,0,0,0}; @@ -351,7 +373,7 @@ ifp->if_softc = priv; /* Get an interface unit number */ - priv->unit = alloc_unr(ng_eiface_unit); + priv->unit = alloc_unr(V_ng_eiface_unit); /* Link together node and private info */ NG_NODE_SET_PRIVATE(node, priv); @@ -367,12 +389,10 @@ ifp->if_snd.ifq_maxlen = IFQ_MAXLEN; ifp->if_flags = (IFF_SIMPLEX | IFF_BROADCAST | IFF_MULTICAST); -#if 0 - /* Give this node name */ - bzero(ifname, sizeof(ifname)); - sprintf(ifname, "if%s", ifp->if_xname); - (void)ng_name_node(node, ifname); -#endif + /* Give this node the same name as the interface (if possible) */ + if (ng_name_node(node, ifp->if_xname) != 0) + log(LOG_WARNING, "%s: can't acquire netgraph name\n", + ifp->if_xname); /* Attach the interface */ ether_ifattach(ifp, eaddr); @@ -445,8 +465,6 @@ caddr_t ptr; int buflen; -#define SA_SIZE(s) ((s)->sa_lensa_len) - /* Determine size of response and allocate it */ buflen = 0; TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) @@ -532,6 +550,12 @@ /* Update interface stats */ ifp->if_ipackets++; +#ifdef VIMAGE + /* 
Mark up the mbuf if crossing vnet boundary */ + if (ifp->if_vnet != NG_HOOK_NODE(hook)->nd_vnet) + m->m_flags |= M_REMOTE_VNET; +#endif + (*ifp->if_input)(ifp, m); /* Done */ @@ -544,12 +568,15 @@ static int ng_eiface_rmnode(node_p node) { + INIT_VNET_NETGRAPH(curvnet); const priv_p priv = NG_NODE_PRIVATE(node); struct ifnet *const ifp = priv->ifp; + CURVNET_SET_QUIET(ifp->if_vnet); ether_ifdetach(ifp); if_free(ifp); - free_unr(ng_eiface_unit, priv->unit); + CURVNET_RESTORE(); + free_unr(V_ng_eiface_unit, priv->unit); FREE(priv, M_NETGRAPH); NG_NODE_SET_PRIVATE(node, NULL); NG_NODE_UNREF(node); @@ -578,10 +605,18 @@ switch (event) { case MOD_LOAD: - ng_eiface_unit = new_unrhdr(0, 0xffff, NULL); +#ifdef VIMAGE + vnet_mod_register(&vnet_ng_eiface_modinfo); +#else + ng_eiface_iattach(NULL); +#endif break; case MOD_UNLOAD: - delete_unrhdr(ng_eiface_unit); +#ifdef VIMAGE + vnet_mod_deregister(&vnet_ng_eiface_modinfo); +#else + ng_eiface_idetach(NULL); +#endif break; default: error = EOPNOTSUPP; @@ -589,3 +624,32 @@ } return (error); } + +static int ng_eiface_iattach(const void *unused) +{ + INIT_VNET_NETGRAPH(curvnet); + + V_ng_eiface_unit = new_unrhdr(0, 0xffff, NULL); + + return 0; +} + +static int ng_eiface_idetach(const void *unused) +{ + INIT_VNET_NETGRAPH(curvnet); +#ifdef VIMAGE + node_p node; + + do { + LIST_FOREACH(node, &V_ng_nodelist, nd_nodes) + if (node->nd_type == &typestruct) + break; + if (node != NULL) + ng_rmnode_self(node); + } while (node != NULL); +#endif + + delete_unrhdr(V_ng_eiface_unit); + + return 0; +} --- /u/marko/p4/head/src/sys/netgraph/ng_ether.c 2007-08-31 03:47:58.000000000 +0200 +++ src/sys/netgraph/ng_ether.c 2007-10-22 18:06:39.000000000 +0200 @@ -46,6 +46,8 @@ * ng_ether(4) netgraph node type */ +#include "opt_vimage.h" + #include #include #include @@ -54,7 +56,9 @@ #include #include #include +#include +#include #include #include #include @@ -70,6 +74,12 @@ #define IFP2NG(ifp) (IFP2AC((ifp))->ac_netgraph) +static vnet_attach_fn 
ng_ether_iattach; +static vnet_detach_fn ng_ether_idetach; + +VNET_MOD_DECLARE_STATELESS(NG_ETHER, ng_ether, ng_ether_iattach, + ng_ether_idetach, NETGRAPH) + /* Per-node private data */ struct private { struct ifnet *ifp; /* associated interface */ @@ -282,6 +292,17 @@ priv_p priv; node_p node; + /* + * Do not create / attach an ether node to this ifnet if + * a netgraph node with the same name already exists. + * This should prevent ether nodes to be attached to + * eiface nodes in the same vnet, which is pointless. + */ + if ((node = ng_name2noderef(NULL, ifp->if_xname)) != NULL) { + NG_NODE_UNREF(node); + return; + } + /* Create node */ KASSERT(!IFP2NG(ifp), ("%s: node already exists?", __func__)); if (ng_make_node_common(&ng_ether_typestruct, &node) != 0) { @@ -730,53 +751,25 @@ static int ng_ether_mod_event(module_t mod, int event, void *data) { - struct ifnet *ifp; int error = 0; int s; s = splnet(); switch (event) { case MOD_LOAD: - - /* Register function hooks */ - if (ng_ether_attach_p != NULL) { - error = EEXIST; - break; - } - ng_ether_attach_p = ng_ether_attach; - ng_ether_detach_p = ng_ether_detach; - ng_ether_output_p = ng_ether_output; - ng_ether_input_p = ng_ether_input; - ng_ether_input_orphan_p = ng_ether_input_orphan; - ng_ether_link_state_p = ng_ether_link_state; - - /* Create nodes for any already-existing Ethernet interfaces */ - IFNET_RLOCK(); - TAILQ_FOREACH(ifp, &ifnet, if_link) { - if (ifp->if_type == IFT_ETHER - || ifp->if_type == IFT_L2VLAN) - ng_ether_attach(ifp); - } - IFNET_RUNLOCK(); +#ifdef VIMAGE + vnet_mod_register(&vnet_ng_ether_modinfo); +#else + error = ng_ether_iattach(NULL); +#endif break; case MOD_UNLOAD: - - /* - * Note that the base code won't try to unload us until - * all nodes have been removed, and that can't happen - * until all Ethernet interfaces are removed. In any - * case, we know there are no nodes left if the action - * is MOD_UNLOAD, so there's no need to detach any nodes. 
- */ - - /* Unregister function hooks */ - ng_ether_attach_p = NULL; - ng_ether_detach_p = NULL; - ng_ether_output_p = NULL; - ng_ether_input_p = NULL; - ng_ether_input_orphan_p = NULL; - ng_ether_link_state_p = NULL; +#ifdef VIMAGE + vnet_mod_deregister(&vnet_ng_ether_modinfo); +#else + ng_ether_idetach(NULL); +#endif break; default: @@ -787,3 +780,62 @@ return (error); } +static int ng_ether_iattach(const void *unused) +{ + INIT_VNET_NET(curvnet); + struct ifnet *ifp; + +#ifdef VIMAGE + if (IS_DEFAULT_VNET(curvnet)){ +#endif + /* Register function hooks */ + if (ng_ether_attach_p != NULL) + return(EEXIST); + ng_ether_attach_p = ng_ether_attach; + ng_ether_detach_p = ng_ether_detach; + ng_ether_output_p = ng_ether_output; + ng_ether_input_p = ng_ether_input; + ng_ether_input_orphan_p = ng_ether_input_orphan; + ng_ether_link_state_p = ng_ether_link_state; +#ifdef VIMAGE + } +#endif + + /* Create nodes for any already-existing Ethernet interfaces */ + IFNET_RLOCK(); + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + if (ifp->if_type == IFT_ETHER + || ifp->if_type == IFT_L2VLAN) + ng_ether_attach(ifp); + } + IFNET_RUNLOCK(); + + return 0; +} + +static int ng_ether_idetach(const void *unused) +{ + /* + * Note that the base code won't try to unload us until + * all nodes have been removed, and that can't happen + * until all Ethernet interfaces are removed. In any + * case, we know there are no nodes left if the action + * is MOD_UNLOAD, so there's no need to detach any nodes. 
+ */ + +#ifdef VIMAGE + if (!IS_DEFAULT_VNET(curvnet)) + return(0); +#endif + + /* Unregister function hooks */ + ng_ether_attach_p = NULL; + ng_ether_detach_p = NULL; + ng_ether_output_p = NULL; + ng_ether_input_p = NULL; + ng_ether_input_orphan_p = NULL; + ng_ether_link_state_p = NULL; + + return 0; +} + --- /u/marko/p4/head/src/sys/netgraph/ng_gif.c 2007-08-31 03:47:58.000000000 +0200 +++ src/sys/netgraph/ng_gif.c 2007-12-10 11:26:11.000000000 +0100 @@ -69,6 +69,8 @@ * ng_gif(4) netgraph node type */ +#include "opt_vimage.h" + #include #include #include @@ -77,7 +79,9 @@ #include #include #include +#include +#include #include #include #include @@ -560,10 +564,13 @@ /* Create nodes for any already-existing gif interfaces */ IFNET_RLOCK(); - TAILQ_FOREACH(ifp, &ifnet, if_link) { + VNET_ITERLOOP_BEGIN_QUIET(); + INIT_VNET_NET(curvnet); + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (ifp->if_type == IFT_GIF) ng_gif_attach(ifp); } + VNET_ITERLOOP_END(); IFNET_RUNLOCK(); break; --- /u/marko/p4/head/src/sys/netgraph/ng_hub.c 2007-08-31 03:47:58.000000000 +0200 +++ src/sys/netgraph/ng_hub.c 2007-10-05 12:27:00.000000000 +0200 @@ -37,6 +37,7 @@ #include static ng_constructor_t ng_hub_constructor; +static ng_newhook_t ng_hub_newhook; static ng_rcvdata_t ng_hub_rcvdata; static ng_disconnect_t ng_hub_disconnect; @@ -44,6 +45,7 @@ .version = NG_ABI_VERSION, .name = NG_HUB_NODE_TYPE, .constructor = ng_hub_constructor, + .newhook = ng_hub_newhook, .rcvdata = ng_hub_rcvdata, .disconnect = ng_hub_disconnect, }; @@ -57,6 +59,14 @@ return (0); } +static int +ng_hub_newhook(node_p node, hook_p hook, const char *name) +{ + if (strcmp(name, "anchor") == 0) + node->nd_private = (void *) 1; + return 0; +} + static int ng_hub_rcvdata(hook_p hook, item_p item) { @@ -94,7 +104,7 @@ { if (NG_NODE_NUMHOOKS(NG_HOOK_NODE(hook)) == 0 && - NG_NODE_IS_VALID(NG_HOOK_NODE(hook))) + NG_NODE_IS_VALID(NG_HOOK_NODE(hook)) && !hook->hk_node->nd_private) ng_rmnode_self(NG_HOOK_NODE(hook)); return (0); } 
--- /u/marko/p4/head/src/sys/netgraph/ng_iface.c 2008-01-31 10:37:20.000000000 +0100 +++ src/sys/netgraph/ng_iface.c 2008-02-27 11:48:53.000000000 +0100 @@ -56,6 +56,7 @@ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipx.h" +#include "opt_vimage.h" #include #include @@ -69,6 +70,7 @@ #include #include #include +#include #include #include @@ -77,6 +79,7 @@ #include +#include #include #include #include @@ -121,6 +124,10 @@ static int ng_iface_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data); static int ng_iface_output(struct ifnet *ifp, struct mbuf *m0, struct sockaddr *dst, struct rtentry *rt0); +#ifdef VIMAGE +static void ng_iface_reassign(struct ifnet *ifp, struct vnet *vnet, + char *dname); +#endif static void ng_iface_bpftap(struct ifnet *ifp, struct mbuf *m, sa_family_t family); static int ng_iface_send(struct ifnet *ifp, struct mbuf *m, @@ -207,7 +214,15 @@ }; NETGRAPH_INIT(iface, &typestruct); +static vnet_attach_fn ng_iface_iattach; +static vnet_detach_fn ng_iface_idetach; + +#ifndef VIMAGE static struct unrhdr *ng_iface_unit; +#endif + +VNET_MOD_DECLARE_STATELESS(NG_IFACE, ng_iface, ng_iface_iattach, + ng_iface_idetach, NETGRAPH) /************************************************************************ HELPER STUFF @@ -449,6 +464,14 @@ /* Send packet. If hook is not connected, mbuf will get freed. */ +#ifdef VIMAGE + if (ifp->if_vnet != priv->node->nd_vnet) { + m->m_flags |= M_REMOTE_VNET; + CURVNET_SET_QUIET(priv->node->nd_vnet); + NG_SEND_DATA_ONLY(error, *get_hook_from_iffam(priv, iffam), m); + CURVNET_RESTORE(); + } else +#endif NG_SEND_DATA_ONLY(error, *get_hook_from_iffam(priv, iffam), m); /* Update stats. 
*/ @@ -505,6 +528,7 @@ static int ng_iface_constructor(node_p node) { + INIT_VNET_NETGRAPH(curvnet); struct ifnet *ifp; priv_p priv; @@ -523,7 +547,7 @@ priv->ifp = ifp; /* Get an interface unit number */ - priv->unit = alloc_unr(ng_iface_unit); + priv->unit = alloc_unr(V_ng_iface_unit); /* Link together node and private info */ NG_NODE_SET_PRIVATE(node, priv); @@ -534,6 +558,9 @@ ifp->if_output = ng_iface_output; ifp->if_start = ng_iface_start; ifp->if_ioctl = ng_iface_ioctl; +#ifdef VIMAGE + ifp->if_reassign = ng_iface_reassign; +#endif ifp->if_watchdog = NULL; ifp->if_mtu = NG_IFACE_MTU_DEFAULT; ifp->if_flags = (IFF_SIMPLEX|IFF_POINTOPOINT|IFF_NOARP|IFF_MULTICAST); @@ -558,6 +585,24 @@ return (0); } +#ifdef VIMAGE +static void +ng_iface_reassign(struct ifnet *ifp, struct vnet *vnet, char *dname) +{ + bpfdetach(ifp); + if_detach(ifp); + ifp->if_bpf = NULL; + if_reassign_common(ifp, vnet, "ser"); + if (dname) + snprintf(ifp->if_xname, IFNAMSIZ, "%s", dname); + + CURVNET_SET_QUIET(vnet); + if_attach(ifp); + bpfattach(ifp, DLT_NULL, sizeof(u_int32_t)); + CURVNET_RESTORE(); +} +#endif + /* * Give our ok for a hook to be added */ @@ -720,6 +765,12 @@ ifp->if_ipackets++; ifp->if_ibytes += m->m_pkthdr.len; +#ifdef VIMAGE + /* Mark up the mbuf if crossing vnet boundary */ + if (ifp->if_vnet != NG_HOOK_NODE(hook)->nd_vnet) + m->m_flags |= M_REMOTE_VNET; +#endif + /* Note receiving interface */ m->m_pkthdr.rcvif = ifp; @@ -765,13 +816,16 @@ static int ng_iface_shutdown(node_p node) { + INIT_VNET_NETGRAPH(curvnet); const priv_p priv = NG_NODE_PRIVATE(node); + CURVNET_SET_QUIET(priv->ifp->if_vnet); bpfdetach(priv->ifp); if_detach(priv->ifp); if_free(priv->ifp); + CURVNET_RESTORE(); priv->ifp = NULL; - free_unr(ng_iface_unit, priv->unit); + free_unr(V_ng_iface_unit, priv->unit); FREE(priv, M_NETGRAPH_IFACE); NG_NODE_SET_PRIVATE(node, NULL); NG_NODE_UNREF(node); @@ -804,10 +858,18 @@ switch (event) { case MOD_LOAD: - ng_iface_unit = new_unrhdr(0, 0xffff, NULL); +#ifdef VIMAGE 
+ vnet_mod_register(&vnet_ng_iface_modinfo); +#else + ng_iface_iattach(NULL); +#endif break; case MOD_UNLOAD: - delete_unrhdr(ng_iface_unit); +#ifdef VIMAGE + vnet_mod_deregister(&vnet_ng_iface_modinfo); +#else + ng_iface_idetach(NULL); +#endif break; default: error = EOPNOTSUPP; @@ -815,3 +877,32 @@ } return (error); } + +static int ng_iface_iattach(const void *unused) +{ + INIT_VNET_NETGRAPH(curvnet); + + V_ng_iface_unit = new_unrhdr(0, 0xffff, NULL); + + return 0; +} + +static int ng_iface_idetach(const void *unused) +{ + INIT_VNET_NETGRAPH(curvnet); +#ifdef VIMAGE + node_p node; + + do { + LIST_FOREACH(node, &V_ng_nodelist, nd_nodes) + if (node->nd_type == &typestruct) + break; + if (node != NULL) + ng_rmnode_self(node); + } while (node != NULL); +#endif + + delete_unrhdr(V_ng_iface_unit); + + return 0; +} --- /dev/null 2008-02-27 21:11:00.000000000 +0100 +++ src/sys/netgraph/ng_pipe.c 2007-10-30 22:13:05.000000000 +0100 @@ -0,0 +1,1051 @@ +/* + * Copyright (c) 2004, 2005, 2007 University of Zagreb + * Copyright (c) 2007 FreeBSD Foundation + * + * This software was developed by the University of Zagreb and the + * FreeBSD Foundation under sponsorship by the Stichting NLnet and the + * FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * This node permits simple traffic shaping by emulating bandwidth + * and delay, as well as random packet losses. + * The node has two hooks, upper and lower. Traffic flowing from upper to + * lower hook is referenced as downstream, and vice versa. Parameters for + * both directions can be set separately, except for delay. + */ + +#include "opt_vimage.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include +#include +#include + +static MALLOC_DEFINE(M_NG_PIPE, "ng_pipe", "ng_pipe"); + +struct mtx ng_pipe_giant; + +/* Packet header struct */ +struct ngp_hdr { + TAILQ_ENTRY(ngp_hdr) ngp_link; /* next pkt in queue */ + struct timeval when; /* this packet's due time */ + struct mbuf *m; /* ptr to the packet data */ +}; +TAILQ_HEAD(p_head, ngp_hdr); + +/* FIFO queue struct */ +struct ngp_fifo { + TAILQ_ENTRY(ngp_fifo) fifo_le; /* list of active queues only */ + struct p_head packet_head; /* FIFO queue head */ + u_int32_t hash; /* flow signature */ + struct timeval vtime; /* virtual time, for WFQ */ + u_int32_t rr_deficit; /* for DRR */ + u_int32_t packets; /* # of packets in this queue */ +}; + +/* Per hook info */ +struct hookinfo { + hook_p hook; + int noqueue; /* bypass any processing */ + TAILQ_HEAD(, ngp_fifo) fifo_head; /* FIFO queues */ + TAILQ_HEAD(, ngp_hdr) qout_head; /* delay queue head */ + LIST_ENTRY(hookinfo) active_le; /* 
active hooks */ + struct timeval qin_utime; + struct ng_pipe_hookcfg cfg; + struct ng_pipe_hookrun run; + struct ng_pipe_hookstat stats; + uint64_t *ber_p; /* loss_p(BER,psize) map */ +}; + +/* Per node info */ +struct node_priv { + u_int64_t delay; + u_int32_t overhead; + u_int32_t header_offset; + struct hookinfo lower; + struct hookinfo upper; +}; +typedef struct node_priv *priv_p; + +/* Macro for calculating the virtual time for packet dequeueing in WFQ */ +#define FIFO_VTIME_SORT(plen) \ + if (hinfo->cfg.wfq && hinfo->cfg.bandwidth) { \ + ngp_f->vtime.tv_usec = now->tv_usec + ((uint64_t) (plen) \ + + priv->overhead ) * hinfo->run.fifo_queues * \ + 8000000 / hinfo->cfg.bandwidth; \ + ngp_f->vtime.tv_sec = now->tv_sec + \ + ngp_f->vtime.tv_usec / 1000000; \ + ngp_f->vtime.tv_usec = ngp_f->vtime.tv_usec % 1000000; \ + TAILQ_FOREACH(ngp_f1, &hinfo->fifo_head, fifo_le) \ + if (ngp_f1->vtime.tv_sec > ngp_f->vtime.tv_sec || \ + (ngp_f1->vtime.tv_sec == ngp_f->vtime.tv_sec && \ + ngp_f1->vtime.tv_usec > ngp_f->vtime.tv_usec)) \ + break; \ + if (ngp_f1 == NULL) \ + TAILQ_INSERT_TAIL(&hinfo->fifo_head, ngp_f, fifo_le); \ + else \ + TAILQ_INSERT_BEFORE(ngp_f1, ngp_f, fifo_le); \ + } else \ + TAILQ_INSERT_TAIL(&hinfo->fifo_head, ngp_f, fifo_le); \ + + +static void parse_cfg(struct ng_pipe_hookcfg *, struct ng_pipe_hookcfg *, + struct hookinfo *, priv_p); +static void pipe_dequeue(struct hookinfo *, struct timeval *); +static void pipe_scheduler(void *); +static void pipe_poll(void); +static int ngp_modevent(module_t, int, void *); + +/* linked list of active "pipe" hooks */ +static LIST_HEAD(, hookinfo) active_head; +static int active_gen_id = 0; + +/* timeout handle for pipe_scheduler */ +static struct callout polling_timer; + +/* zone for storing ngp_hdr-s */ +static uma_zone_t ngp_zone; + +/* Netgraph methods */ +static ng_constructor_t ngp_constructor; +static ng_rcvmsg_t ngp_rcvmsg; +static ng_shutdown_t ngp_shutdown; +static ng_newhook_t ngp_newhook; +static 
ng_rcvdata_t ngp_rcvdata; +static ng_disconnect_t ngp_disconnect; + +/* Parse type for struct ng_pipe_hookstat */ +static const struct ng_parse_struct_field + ng_pipe_hookstat_type_fields[] = NG_PIPE_HOOKSTAT_INFO; +static const struct ng_parse_type ng_pipe_hookstat_type = { + &ng_parse_struct_type, + &ng_pipe_hookstat_type_fields +}; + +/* Parse type for struct ng_pipe_stats */ +static const struct ng_parse_struct_field ng_pipe_stats_type_fields[] = + NG_PIPE_STATS_INFO(&ng_pipe_hookstat_type); +static const struct ng_parse_type ng_pipe_stats_type = { + &ng_parse_struct_type, + &ng_pipe_stats_type_fields +}; + +/* Parse type for struct ng_pipe_hookrun */ +static const struct ng_parse_struct_field + ng_pipe_hookrun_type_fields[] = NG_PIPE_HOOKRUN_INFO; +static const struct ng_parse_type ng_pipe_hookrun_type = { + &ng_parse_struct_type, + &ng_pipe_hookrun_type_fields +}; + +/* Parse type for struct ng_pipe_run */ +static const struct ng_parse_struct_field + ng_pipe_run_type_fields[] = NG_PIPE_RUN_INFO(&ng_pipe_hookrun_type); +static const struct ng_parse_type ng_pipe_run_type = { + &ng_parse_struct_type, + &ng_pipe_run_type_fields +}; + +/* Parse type for struct ng_pipe_hookcfg */ +static const struct ng_parse_struct_field + ng_pipe_hookcfg_type_fields[] = NG_PIPE_HOOKCFG_INFO; +static const struct ng_parse_type ng_pipe_hookcfg_type = { + &ng_parse_struct_type, + &ng_pipe_hookcfg_type_fields +}; + +/* Parse type for struct ng_pipe_cfg */ +static const struct ng_parse_struct_field + ng_pipe_cfg_type_fields[] = NG_PIPE_CFG_INFO(&ng_pipe_hookcfg_type); +static const struct ng_parse_type ng_pipe_cfg_type = { + &ng_parse_struct_type, + &ng_pipe_cfg_type_fields +}; + +/* List of commands and how to convert arguments to/from ASCII */ +static const struct ng_cmdlist ngp_cmds[] = { + { + .cookie = NGM_PIPE_COOKIE, + .cmd = NGM_PIPE_GET_STATS, + .name = "getstats", + .respType = &ng_pipe_stats_type + }, + { + .cookie = NGM_PIPE_COOKIE, + .cmd = NGM_PIPE_CLR_STATS, + .name = 
"clrstats" + }, + { + .cookie = NGM_PIPE_COOKIE, + .cmd = NGM_PIPE_GETCLR_STATS, + .name = "getclrstats", + .respType = &ng_pipe_stats_type + }, + { + .cookie = NGM_PIPE_COOKIE, + .cmd = NGM_PIPE_GET_RUN, + .name = "getrun", + .respType = &ng_pipe_run_type + }, + { + .cookie = NGM_PIPE_COOKIE, + .cmd = NGM_PIPE_GET_CFG, + .name = "getcfg", + .respType = &ng_pipe_cfg_type + }, + { + .cookie = NGM_PIPE_COOKIE, + .cmd = NGM_PIPE_SET_CFG, + .name = "setcfg", + .mesgType = &ng_pipe_cfg_type, + }, + { 0 } +}; + +/* Netgraph type descriptor */ +static struct ng_type ng_pipe_typestruct = { + .version = NG_ABI_VERSION, + .name = NG_PIPE_NODE_TYPE, + .mod_event = ngp_modevent, + .constructor = ngp_constructor, + .shutdown = ngp_shutdown, + .rcvmsg = ngp_rcvmsg, + .newhook = ngp_newhook, + .rcvdata = ngp_rcvdata, + .disconnect = ngp_disconnect, + .cmdlist = ngp_cmds +}; +NETGRAPH_INIT(pipe, &ng_pipe_typestruct); + +/* Node constructor */ +static int +ngp_constructor(node_p node) +{ + priv_p priv; + + MALLOC(priv, priv_p, sizeof(*priv), M_NG_PIPE, M_ZERO | M_NOWAIT); + if (priv == NULL) + return (ENOMEM); + NG_NODE_SET_PRIVATE(node, priv); + + return (0); +} + +/* Add a hook */ +static int +ngp_newhook(node_p node, hook_p hook, const char *name) +{ + const priv_p priv = NG_NODE_PRIVATE(node); + struct hookinfo *hinfo; + + if (strcmp(name, NG_PIPE_HOOK_UPPER) == 0) { + bzero(&priv->upper, sizeof(priv->upper)); + priv->upper.hook = hook; + NG_HOOK_SET_PRIVATE(hook, &priv->upper); + } else if (strcmp(name, NG_PIPE_HOOK_LOWER) == 0) { + bzero(&priv->lower, sizeof(priv->lower)); + priv->lower.hook = hook; + NG_HOOK_SET_PRIVATE(hook, &priv->lower); + } else + return (EINVAL); + + /* Load non-zero initial cfg values */ + hinfo = NG_HOOK_PRIVATE(hook); + hinfo->cfg.qin_size_limit = 50; + hinfo->cfg.fifo = 1; + hinfo->cfg.droptail = 1; + TAILQ_INIT(&hinfo->fifo_head); + TAILQ_INIT(&hinfo->qout_head); + return (0); +} + +/* Receive a control message */ +static int +ngp_rcvmsg(node_p 
node, item_p item, hook_p lasthook) +{ + const priv_p priv = NG_NODE_PRIVATE(node); + struct ng_mesg *resp = NULL; + struct ng_mesg *msg; + struct ng_pipe_stats *stats; + struct ng_pipe_run *run; + struct ng_pipe_cfg *cfg; + int error = 0; + + mtx_lock(&ng_pipe_giant); + + NGI_GET_MSG(item, msg); + switch (msg->header.typecookie) { + case NGM_PIPE_COOKIE: + switch (msg->header.cmd) { + case NGM_PIPE_GET_STATS: + case NGM_PIPE_CLR_STATS: + case NGM_PIPE_GETCLR_STATS: + if (msg->header.cmd != NGM_PIPE_CLR_STATS) { + NG_MKRESPONSE(resp, msg, + sizeof(*stats), M_NOWAIT); + if (resp == NULL) { + error = ENOMEM; + break; + } + stats = (struct ng_pipe_stats *)resp->data; + bcopy(&priv->upper.stats, &stats->downstream, + sizeof(stats->downstream)); + bcopy(&priv->lower.stats, &stats->upstream, + sizeof(stats->upstream)); + } + if (msg->header.cmd != NGM_PIPE_GET_STATS) { + bzero(&priv->upper.stats, + sizeof(priv->upper.stats)); + bzero(&priv->lower.stats, + sizeof(priv->lower.stats)); + } + break; + case NGM_PIPE_GET_RUN: + NG_MKRESPONSE(resp, msg, sizeof(*run), M_NOWAIT); + if (resp == NULL) { + error = ENOMEM; + break; + } + run = (struct ng_pipe_run *)resp->data; + bcopy(&priv->upper.run, &run->downstream, + sizeof(run->downstream)); + bcopy(&priv->lower.run, &run->upstream, + sizeof(run->upstream)); + break; + case NGM_PIPE_GET_CFG: + NG_MKRESPONSE(resp, msg, sizeof(*cfg), M_NOWAIT); + if (resp == NULL) { + error = ENOMEM; + break; + } + cfg = (struct ng_pipe_cfg *)resp->data; + bcopy(&priv->upper.cfg, &cfg->downstream, + sizeof(cfg->downstream)); + bcopy(&priv->lower.cfg, &cfg->upstream, + sizeof(cfg->upstream)); + cfg->delay = priv->delay; + cfg->overhead = priv->overhead; + cfg->header_offset = priv->header_offset; + if (cfg->upstream.bandwidth == + cfg->downstream.bandwidth) { + cfg->bandwidth = cfg->upstream.bandwidth; + cfg->upstream.bandwidth = 0; + cfg->downstream.bandwidth = 0; + } else + cfg->bandwidth = 0; + break; + case NGM_PIPE_SET_CFG: + cfg = (struct 
ng_pipe_cfg *)msg->data; + if (msg->header.arglen != sizeof(*cfg)) { + error = EINVAL; + break; + } + + if (cfg->delay == -1) + priv->delay = 0; + else if (cfg->delay > 0 && cfg->delay < 10000000) + priv->delay = cfg->delay; + + if (cfg->bandwidth == -1) { + priv->upper.cfg.bandwidth = 0; + priv->lower.cfg.bandwidth = 0; + priv->overhead = 0; + } else if (cfg->bandwidth >= 100 && + cfg->bandwidth <= 1000000000) { + priv->upper.cfg.bandwidth = cfg->bandwidth; + priv->lower.cfg.bandwidth = cfg->bandwidth; + if (cfg->bandwidth >= 10000000) + priv->overhead = 8+4+12; /* Ethernet */ + else + priv->overhead = 10; /* HDLC */ + } + + if (cfg->overhead == -1) + priv->overhead = 0; + else if (cfg->overhead > 0 && cfg->overhead < 256) + priv->overhead = cfg->overhead; + + if (cfg->header_offset == -1) + priv->header_offset = 0; + else if (cfg->header_offset > 0 && + cfg->header_offset < 64) + priv->header_offset = cfg->header_offset; + + parse_cfg(&priv->upper.cfg, &cfg->downstream, + &priv->upper, priv); + parse_cfg(&priv->lower.cfg, &cfg->upstream, + &priv->lower, priv); + break; + default: + error = EINVAL; + break; + } + break; + default: + error = EINVAL; + break; + } + NG_RESPOND_MSG(error, node, item, resp); + NG_FREE_MSG(msg); + + mtx_unlock(&ng_pipe_giant); + + return (error); +} + +static void +parse_cfg(struct ng_pipe_hookcfg *current, struct ng_pipe_hookcfg *new, + struct hookinfo *hinfo, priv_p priv) +{ + + if (new->ber == -1) { + current->ber = 0; + if (hinfo->ber_p) { + FREE(hinfo->ber_p, M_NG_PIPE); + hinfo->ber_p = NULL; + } + } else if (new->ber >= 1 && new->ber <= 1000000000000) { + static const uint64_t one = 0x1000000000000; /* = 2^48 */ + uint64_t p0, p; + uint32_t fsize, i; + + if (hinfo->ber_p == NULL) + MALLOC(hinfo->ber_p, uint64_t *, \ + (MAX_FSIZE + MAX_OHSIZE)*sizeof(uint64_t), \ + M_NG_PIPE, M_NOWAIT); + current->ber = new->ber; + + /* + * For given BER and each frame size N (in bytes) calculate + * the probability P_OK that the frame is clean: + 
* + * P_OK(BER,N) = (1 - 1/BER)^(N*8) + * + * We use a 64-bit fixed-point format with decimal point + * positioned between bits 47 and 48. + */ + p0 = one - one / new->ber; + p = one; + for (fsize = 0; fsize < MAX_FSIZE + MAX_OHSIZE; fsize++) { + hinfo->ber_p[fsize] = p; + for (i=0; i<8; i++) + p = (p*(p0&0xffff)>>48) + \ + (p*((p0>>16)&0xffff)>>32) + \ + (p*(p0>>32)>>16); + } + } + + if (new->qin_size_limit == -1) + current->qin_size_limit = 0; + else if (new->qin_size_limit >= 5) + current->qin_size_limit = new->qin_size_limit; + + if (new->qout_size_limit == -1) + current->qout_size_limit = 0; + else if (new->qout_size_limit >= 5) + current->qout_size_limit = new->qout_size_limit; + + if (new->duplicate == -1) + current->duplicate = 0; + else if (new->duplicate > 0 && new->duplicate <= 50) + current->duplicate = new->duplicate; + + if (new->fifo) { + current->fifo = 1; + current->wfq = 0; + current->drr = 0; + } + + if (new->wfq) { + current->fifo = 0; + current->wfq = 1; + current->drr = 0; + } + + if (new->drr) { + current->fifo = 0; + current->wfq = 0; + /* DRR quantum */ + if (new->drr >= 32) + current->drr = new->drr; + else + current->drr = 2048; /* default quantum */ + } + + if (new->droptail) { + current->droptail = 1; + current->drophead = 0; + } + + if (new->drophead) { + current->droptail = 0; + current->drophead = 1; + } + + if (new->bandwidth == -1) { + current->bandwidth = 0; + current->fifo = 1; + current->wfq = 0; + current->drr = 0; + } else if (new->bandwidth >= 100 && new->bandwidth <= 1000000000) + current->bandwidth = new->bandwidth; + + if (current->bandwidth | priv->delay | + current->duplicate | current->ber) + hinfo->noqueue = 0; + else + hinfo->noqueue = 1; +} + +/* + * Compute a hash signature for a packet. This function suffers from the + * NIH sindrome, so probably it would be wise to look around what other + * folks have found out to be a good and efficient IP hash function... 
+ */
+static int
+ip_hash(struct mbuf *m, int offset)
+{
+	u_int64_t i;
+	struct ip *ip = (struct ip *)(mtod(m, u_char *) + offset);
+
+	/*
+	 * Only an option-less IPv4 header that is fully contained in the
+	 * first mbuf is hashed; everything else yields hash value 0.
+	 */
+	if (m->m_len < sizeof(struct ip) + offset ||
+	    ip->ip_v != 4 || ip->ip_hl << 2 != sizeof(struct ip))
+		return (0);
+
+	/* Mix source and destination address, then xor the halves together */
+	i = ((u_int64_t) ip->ip_src.s_addr ^
+	    ((u_int64_t) ip->ip_src.s_addr << 13) ^
+	    ((u_int64_t) ip->ip_dst.s_addr << 7) ^
+	    ((u_int64_t) ip->ip_dst.s_addr << 19));
+	return (i ^ (i >> 32));
+}
+
+/*
+ * Receive data on a hook - both in upstream and downstream direction.
+ * We put the frame on the inbound queue, and try to initiate dequeuing
+ * sequence immediately. If inbound queue is full, discard one frame
+ * depending on dropping policy (from the head or from the tail of the
+ * queue).
+ *
+ * When the hook is in pass-through mode (hinfo->noqueue) the item is
+ * forwarded straight to the opposite hook and the forwarding result is
+ * returned; otherwise the function returns 0 after queueing.
+ *
+ * The ng_pipe_giant lock is taken here and released by pipe_dequeue().
+ *
+ * NOTE(review): allocation failures are only checked through KASSERT();
+ * confirm whether kernels built without assertions need a real error path.
+ */
+static int
+ngp_rcvdata(hook_p hook, item_p item)
+{
+	struct hookinfo *const hinfo = NG_HOOK_PRIVATE(hook);
+	const priv_p priv = NG_NODE_PRIVATE(NG_HOOK_NODE(hook));
+	struct timeval uuptime;
+	struct timeval *now = &uuptime;
+	struct ngp_fifo *ngp_f = NULL, *ngp_f1;
+	struct ngp_hdr *ngp_h = NULL;
+	struct mbuf *m;
+	int hash;
+	int error = 0;
+
+	/* Nothing to emulate: hand the item to the opposite hook untouched */
+	if (hinfo->noqueue) {
+		struct hookinfo *dest;
+		if (hinfo == &priv->lower)
+			dest = &priv->upper;
+		else
+			dest = &priv->lower;
+		NG_FWD_ITEM_HOOK(error, item, dest->hook);
+		return (error);
+	}
+
+	mtx_lock(&ng_pipe_giant);
+	microuptime(now);
+
+	/*
+	 * Attach us to the list of active ng_pipes if this was an empty
+	 * one before, and also update the queue service deadline time.
+	 */
+	if (hinfo->run.qin_frames == 0) {
+		struct timeval *when = &hinfo->qin_utime;
+		if (when->tv_sec < now->tv_sec || (when->tv_sec == now->tv_sec
+		    && when->tv_usec < now->tv_usec)) {
+			when->tv_sec = now->tv_sec;
+			when->tv_usec = now->tv_usec;
+		}
+		if (hinfo->run.qout_frames == 0)
+			LIST_INSERT_HEAD(&active_head, hinfo, active_le);
+	}
+
+	/* Populate the packet header */
+	ngp_h = uma_zalloc(ngp_zone, M_NOWAIT);
+	KASSERT((ngp_h != NULL), ("ngp_h zalloc failed (1)"));
+	NGI_GET_M(item, m);
+	KASSERT(m != NULL, ("NGI_GET_M failed"));
+	ngp_h->m = m;
+	NG_FREE_ITEM(item);
+
+	if (hinfo->cfg.fifo)
+		hash = 0;	/* all packets go into a single FIFO queue */
+	else
+		hash = ip_hash(m, priv->header_offset);
+
+	/* Find the appropriate FIFO queue for the packet and enqueue it */
+	TAILQ_FOREACH(ngp_f, &hinfo->fifo_head, fifo_le)
+		if (hash == ngp_f->hash)
+			break;
+	if (ngp_f == NULL) {
+		/* First packet of this flow: create its FIFO queue */
+		ngp_f = uma_zalloc(ngp_zone, M_NOWAIT);
+		KASSERT(ngp_f != NULL, ("ngp_f zalloc failed (2)"));
+		TAILQ_INIT(&ngp_f->packet_head);
+		ngp_f->hash = hash;
+		ngp_f->packets = 1;
+		ngp_f->rr_deficit = hinfo->cfg.drr;	/* DRR quantum */
+		hinfo->run.fifo_queues++;
+		TAILQ_INSERT_TAIL(&ngp_f->packet_head, ngp_h, ngp_link);
+		/* The macro uses ngp_f, ngp_f1, hinfo, priv and now */
+		FIFO_VTIME_SORT(m->m_pkthdr.len);
+	} else {
+		TAILQ_INSERT_TAIL(&ngp_f->packet_head, ngp_h, ngp_link);
+		ngp_f->packets++;
+	}
+	hinfo->run.qin_frames++;
+	hinfo->run.qin_octets += m->m_pkthdr.len;
+
+	/* Discard a frame if inbound queue limit has been reached */
+	if (hinfo->run.qin_frames > hinfo->cfg.qin_size_limit) {
+		struct mbuf *m1;
+		int longest = 0;
+
+		/* Find the longest queue */
+		TAILQ_FOREACH(ngp_f1, &hinfo->fifo_head, fifo_le)
+			if (ngp_f1->packets > longest) {
+				longest = ngp_f1->packets;
+				ngp_f = ngp_f1;
+			}
+
+		/* Drop a frame from the queue head/tail, depending on cfg */
+		if (hinfo->cfg.drophead)
+			ngp_h = TAILQ_FIRST(&ngp_f->packet_head);
+		else
+			ngp_h = TAILQ_LAST(&ngp_f->packet_head, p_head);
+		TAILQ_REMOVE(&ngp_f->packet_head, ngp_h, ngp_link);
+		m1 = ngp_h->m;
+		uma_zfree(ngp_zone, ngp_h);
+		hinfo->run.qin_octets -= m1->m_pkthdr.len;
+		hinfo->stats.in_disc_octets += m1->m_pkthdr.len;
+		m_freem(m1);
+		/* Release the FIFO queue itself once its last frame is gone */
+		if (--(ngp_f->packets) == 0) {
+			TAILQ_REMOVE(&hinfo->fifo_head, ngp_f, fifo_le);
+			uma_zfree(ngp_zone, ngp_f);
+			hinfo->run.fifo_queues--;
+		}
+		hinfo->run.qin_frames--;
+		hinfo->stats.in_disc_frames++;
+	}
+
+	/*
+	 * Try to start the dequeuing process immediately.  We must
+	 * hold the ng_pipe_giant lock here and pipe_dequeue() will
+	 * release it
+	 */
+	pipe_dequeue(hinfo, now);
+
+	return (0);
+}
+
+
+/*
+ * Dequeueing sequence - we basically do the following:
+ *  1) Try to extract the frame from the inbound (bandwidth) queue;
+ *  2) In accordance to BER specified, discard the frame randomly;
+ *  3) If the frame survives BER, prepend it with delay info and move it
+ *     to outbound (delay) queue;
+ *  4) Loop to 2) until bandwidth quota for this timeslice is reached, or
+ *     inbound queue is flushed completely;
+ *  5) Extract the first frame from the outbound queue, if its time has
+ *     come.
Queue the frame for transmission on the outbound hook;
+ *  6) Loop to 5) until outbound queue is flushed completely, or the next
+ *     frame in the queue is not scheduled to be dequeued yet;
+ *  7) Transmit all frames queued in 5)
+ *
+ * Note: the caller must hold the ng_pipe_giant lock; this function
+ * returns with the lock released.
+ */
+static void
+pipe_dequeue(struct hookinfo *hinfo, struct timeval *now)
+{
+	static uint64_t rand, oldrand;
+	const priv_p priv = NG_NODE_PRIVATE(NG_HOOK_NODE(hinfo->hook));
+	struct hookinfo *dest;
+	struct ngp_fifo *ngp_f, *ngp_f1;
+	struct ngp_hdr *ngp_h;
+	struct timeval *when;
+	struct mbuf *q_head = NULL;
+	struct mbuf *q_tail = NULL;
+	struct mbuf *m;
+	int error = 0;
+
+	/* Which one is the destination hook? */
+	if (hinfo == &priv->lower)
+		dest = &priv->upper;
+	else
+		dest = &priv->lower;
+
+	/* Bandwidth queue processing */
+	while ((ngp_f = TAILQ_FIRST(&hinfo->fifo_head))) {
+		/* Stop once the next service time lies in the future */
+		when = &hinfo->qin_utime;
+		if (when->tv_sec > now->tv_sec || (when->tv_sec == now->tv_sec
+		    && when->tv_usec > now->tv_usec))
+			break;
+
+		ngp_h = TAILQ_FIRST(&ngp_f->packet_head);
+		m = ngp_h->m;
+
+		/* Deficit Round Robin (DRR) processing */
+		if (hinfo->cfg.drr) {
+			if (ngp_f->rr_deficit >= m->m_pkthdr.len) {
+				ngp_f->rr_deficit -= m->m_pkthdr.len;
+			} else {
+				/* Not enough credit: top up, move to the tail */
+				ngp_f->rr_deficit += hinfo->cfg.drr;
+				TAILQ_REMOVE(&hinfo->fifo_head, ngp_f, fifo_le);
+				TAILQ_INSERT_TAIL(&hinfo->fifo_head,
+				    ngp_f, fifo_le);
+				continue;
+			}
+		}
+
+		/*
+		 * Either create a duplicate and pass it on, or dequeue
+		 * the original packet...
+		 */
+		if (hinfo->cfg.duplicate &&
+		    random() % 100 <= hinfo->cfg.duplicate) {
+			ngp_h = uma_zalloc(ngp_zone, M_NOWAIT);
+			KASSERT(ngp_h != NULL, ("ngp_h zalloc failed (3)"));
+			/*
+			 * From here on m must refer to the copy: the original
+			 * stays queued, so the discard paths below must never
+			 * free it.
+			 */
+			m = m_dup(m, M_NOWAIT);
+			KASSERT(m != NULL, ("m_dup failed"));
+			ngp_h->m = m;
+		} else {
+			TAILQ_REMOVE(&ngp_f->packet_head, ngp_h, ngp_link);
+			hinfo->run.qin_frames--;
+			hinfo->run.qin_octets -= m->m_pkthdr.len;
+			ngp_f->packets--;
+		}
+
+		/* Calculate the serialization delay */
+		if (hinfo->cfg.bandwidth) {
+			hinfo->qin_utime.tv_usec += ((uint64_t) m->m_pkthdr.len
+			    + priv->overhead ) *
+			    8000000 / hinfo->cfg.bandwidth;
+			hinfo->qin_utime.tv_sec +=
+			    hinfo->qin_utime.tv_usec / 1000000;
+			hinfo->qin_utime.tv_usec =
+			    hinfo->qin_utime.tv_usec % 1000000;
+		}
+		when = &ngp_h->when;
+		when->tv_sec = hinfo->qin_utime.tv_sec;
+		when->tv_usec = hinfo->qin_utime.tv_usec;
+
+		/* Sort / rearrange inbound queues */
+		if (ngp_f->packets) {
+			if (hinfo->cfg.wfq) {
+				TAILQ_REMOVE(&hinfo->fifo_head, ngp_f, fifo_le);
+				FIFO_VTIME_SORT(TAILQ_FIRST(
+				    &ngp_f->packet_head)->m->m_pkthdr.len)
+			}
+		} else {
+			TAILQ_REMOVE(&hinfo->fifo_head, ngp_f, fifo_le);
+			uma_zfree(ngp_zone, ngp_f);
+			hinfo->run.fifo_queues--;
+		}
+
+		/* Randomly discard the frame, according to BER setting */
+		if (hinfo->cfg.ber && hinfo->ber_p != NULL) {
+			uint32_t fsize = priv->overhead + m->m_pkthdr.len;
+
+			/*
+			 * The table holds MAX_FSIZE + MAX_OHSIZE entries;
+			 * longer frames use the last one.
+			 */
+			if (fsize >= MAX_FSIZE + MAX_OHSIZE)
+				fsize = MAX_FSIZE + MAX_OHSIZE - 1;
+
+			/* Update the two random words in a defined order */
+			oldrand = rand;
+			rand = random();
+			if ((oldrand ^ rand << 17) >= hinfo->ber_p[fsize]) {
+				hinfo->stats.out_disc_frames++;
+				hinfo->stats.out_disc_octets +=
+				    m->m_pkthdr.len;
+				uma_zfree(ngp_zone, ngp_h);
+				m_freem(m);
+				continue;
+			}
+		}
+
+		/* Discard frame if outbound queue size limit exceeded */
+		if (hinfo->cfg.qout_size_limit &&
+		    hinfo->run.qout_frames>=hinfo->cfg.qout_size_limit) {
+			hinfo->stats.out_disc_frames++;
+			hinfo->stats.out_disc_octets += m->m_pkthdr.len;
+			uma_zfree(ngp_zone, ngp_h);
+			m_freem(m);
+			continue;
+		}
+
+		/* Calculate the propagation delay */
+		when->tv_usec += priv->delay;
+		when->tv_sec += when->tv_usec / 1000000;
+		when->tv_usec = when->tv_usec % 1000000;
+
+		/* Put the frame into the delay queue */
+		TAILQ_INSERT_TAIL(&hinfo->qout_head, ngp_h, ngp_link);
+		hinfo->run.qout_frames++;
+		hinfo->run.qout_octets += m->m_pkthdr.len;
+	}
+
+	/* Delay queue processing */
+	while ((ngp_h = TAILQ_FIRST(&hinfo->qout_head))) {
+		m = ngp_h->m;
+
+		/* The queue is in departure order; stop at the first late one */
+		when = &ngp_h->when;
+		if (when->tv_sec > now->tv_sec ||
+		    (when->tv_sec == now->tv_sec &&
+		    when->tv_usec > now->tv_usec))
+			break;
+
+		/* Update outbound queue stats */
+		hinfo->stats.fwd_frames++;
+		hinfo->stats.fwd_octets += m->m_pkthdr.len;
+		hinfo->run.qout_frames--;
+		hinfo->run.qout_octets -= m->m_pkthdr.len;
+
+		/* Dequeue the packet from qout */
+		TAILQ_REMOVE(&hinfo->qout_head, ngp_h, ngp_link);
+		uma_zfree(ngp_zone, ngp_h);
+
+		/* Enqueue locally for sending downstream */
+		if (q_head == NULL)
+			q_head = m;
+		if (q_tail)
+			q_tail->m_nextpkt = m;
+		q_tail = m;
+		m->m_nextpkt = NULL;
+	}
+
+	/* If both queues are empty detach us from the list of active queues */
+	if (hinfo->run.qin_frames + hinfo->run.qout_frames == 0) {
+		LIST_REMOVE(hinfo, active_le);
+		active_gen_id++;
+	}
+
+	mtx_unlock(&ng_pipe_giant);
+
+	/* Send the locally chained frames only after dropping the lock */
+	while ((m = q_head) != NULL) {
+		q_head = m->m_nextpkt;
+		m->m_nextpkt = NULL;
+		NG_SEND_DATA(error, dest->hook, m, meta);
+	}
+}
+
+
+/*
+ * This routine is called on every clock tick.  We poll all nodes/hooks
+ * for queued frames by calling pipe_dequeue().
+ */
+static void
+pipe_scheduler(void *arg)
+{
+	pipe_poll();
+
+	/* Reschedule */
+	callout_reset(&polling_timer, 1, &pipe_scheduler, NULL);
+}
+
+
+/*
+ * Traverse the list of all active hooks and attempt to dequeue
+ * some packets.  Hooks with empty queues are not traversed since
+ * they are not linked into this list.
+ */
+static void
+pipe_poll(void)
+{
+	struct hookinfo *hinfo;
+	struct timeval now;
+	int old_gen_id;
+
+	mtx_lock(&ng_pipe_giant);
+	microuptime(&now);
+	/* Sample the generation counter only while holding the lock */
+	old_gen_id = active_gen_id;
+	hinfo = LIST_FIRST(&active_head);
+	while (hinfo != NULL) {
+		CURVNET_SET(NG_HOOK_NODE(hinfo->hook)->nd_vnet);
+		/* pipe_dequeue() returns with ng_pipe_giant released */
+		pipe_dequeue(hinfo, &now);
+		CURVNET_RESTORE();
+		mtx_lock(&ng_pipe_giant);
+		if (old_gen_id != active_gen_id) {
+			/*
+			 * The list was updated; restart traversing from
+			 * the first entry so that none is skipped.
+			 */
+			old_gen_id = active_gen_id;
+			hinfo = LIST_FIRST(&active_head);
+		} else
+			hinfo = LIST_NEXT(hinfo, active_le);
+	}
+	mtx_unlock(&ng_pipe_giant);
+}
+
+
+/*
+ * Shutdown processing
+ *
+ * This is tricky.  If we have both a lower and upper hook, then we
+ * probably want to extricate ourselves and leave the two peers
+ * still linked to each other.  Otherwise we should just shut down as
+ * a normal node would.
+ */
+static int
+ngp_shutdown(node_p node)
+{
+	const priv_p priv = NG_NODE_PRIVATE(node);
+
+	if (priv->lower.hook && priv->upper.hook)
+		ng_bypass(priv->lower.hook, priv->upper.hook);
+	else {
+		if (priv->upper.hook != NULL)
+			ng_rmhook_self(priv->upper.hook);
+		if (priv->lower.hook != NULL)
+			ng_rmhook_self(priv->lower.hook);
+	}
+	NG_NODE_UNREF(node);
+	FREE(priv, M_NG_PIPE);
+	return (0);
+}
+
+
+/*
+ * Hook disconnection
+ *
+ * Frees every frame still queued on the hook, unlinks the hook from the
+ * list of active hooks and releases its BER table.  Always returns 0.
+ */
+static int
+ngp_disconnect(hook_p hook)
+{
+	struct hookinfo *const hinfo = NG_HOOK_PRIVATE(hook);
+	struct ngp_fifo *ngp_f;
+	struct ngp_hdr *ngp_h;
+	int removed = 0;
+
+	mtx_lock(&ng_pipe_giant);
+
+	KASSERT(hinfo != NULL, ("%s: null info", __FUNCTION__));
+	hinfo->hook = NULL;
+
+	/* Flush all fifo queues associated with the hook */
+	while ((ngp_f = TAILQ_FIRST(&hinfo->fifo_head))) {
+		while ((ngp_h = TAILQ_FIRST(&ngp_f->packet_head))) {
+			TAILQ_REMOVE(&ngp_f->packet_head, ngp_h, ngp_link);
+			m_freem(ngp_h->m);
+			uma_zfree(ngp_zone, ngp_h);
+			removed++;
+		}
+		TAILQ_REMOVE(&hinfo->fifo_head, ngp_f, fifo_le);
+		uma_zfree(ngp_zone, ngp_f);
+	}
+
+	/* Flush the delay queue */
+	while ((ngp_h = TAILQ_FIRST(&hinfo->qout_head))) {
+		TAILQ_REMOVE(&hinfo->qout_head, ngp_h, ngp_link);
+		m_freem(ngp_h->m);
+		uma_zfree(ngp_zone, ngp_h);
+		removed++;
+	}
+
+	/*
+	 * Both queues should be empty by now, so detach us from
+	 * the list of active queues
+	 */
+	if (removed) {
+		LIST_REMOVE(hinfo, active_le);
+		active_gen_id++;
+	}
+	if (hinfo->run.qin_frames + hinfo->run.qout_frames != removed)
+		printf("Mismatch: queued=%d but removed=%d !?!",
+		    hinfo->run.qin_frames + hinfo->run.qout_frames, removed);
+
+	/*
+	 * Release the packet loss probability table (BER).  The pointer is
+	 * cleared because the hookinfo lives on inside the node's private
+	 * data and parse_cfg() tests ber_p before freeing or reusing it.
+	 */
+	if (hinfo->ber_p) {
+		FREE(hinfo->ber_p, M_NG_PIPE);
+		hinfo->ber_p = NULL;
+	}
+
+	mtx_unlock(&ng_pipe_giant);
+
+	return (0);
+}
+
+/*
+ * Module event handler: on load create the descriptor zone, the global
+ * lock, the list of active hooks and start the polling callout; on unload
+ * stop the callout and destroy the zone and the lock.  Any other event
+ * yields EOPNOTSUPP.
+ */
+static int
+ngp_modevent(module_t mod, int type, void *unused)
+{
+	int error = 0;
+
+	switch (type) {
+	case MOD_LOAD:
+		/* One zone serves both ngp_hdr and ngp_fifo allocations */
+		ngp_zone = uma_zcreate("ng_pipe", max(sizeof(struct ngp_hdr),
+		    sizeof (struct ngp_fifo)), NULL, NULL, NULL, NULL,
+		    UMA_ALIGN_PTR, 0);
+		if (ngp_zone == NULL)
+			panic("ng_pipe: couldn't allocate descriptor zone");
+
+		mtx_init(&ng_pipe_giant, "ng_pipe_giant", NULL, MTX_DEF);
+		LIST_INIT(&active_head);
+		callout_init(&polling_timer, CALLOUT_MPSAFE);
+		callout_reset(&polling_timer, 1, &pipe_scheduler, NULL);
+		break;
+	case MOD_UNLOAD:
+		callout_drain(&polling_timer);
+		uma_zdestroy(ngp_zone);
+		mtx_destroy(&ng_pipe_giant);
+		break;
+	default:
+		error = EOPNOTSUPP;
+		break;
+	}
+
+	return (error);
+}
--- /dev/null 2008-02-27 21:11:00.000000000 +0100
+++ src/sys/netgraph/ng_pipe.h 2007-10-05 12:27:01.000000000 +0200
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2004, 2007 University of Zagreb
+ * Copyright (c) 2007 FreeBSD Foundation
+ *
+ * This software was developed by the University of Zagreb and the
+ * FreeBSD Foundation under sponsorship by the Stichting NLnet and the
+ * FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1.
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */
+
+#ifndef _NETGRAPH_PIPE_H_
+#define _NETGRAPH_PIPE_H_
+
+/* Node type name and magic cookie */
+#define NG_PIPE_NODE_TYPE	"pipe"
+#define NGM_PIPE_COOKIE		200708191
+
+/* Hook names */
+#define NG_PIPE_HOOK_UPPER	"upper"
+#define NG_PIPE_HOOK_LOWER	"lower"
+
+/*
+ * The per-hook BER table built by ng_pipe.c has MAX_FSIZE + MAX_OHSIZE
+ * entries, one per frame size in bytes.
+ */
+#define MAX_FSIZE 16384	/* Largest supported frame size, in bytes, for BER */
+#define MAX_OHSIZE 256	/* Largest supported dummy-framing size, in bytes */
+
+/*
+ * Statistics structure for one hook
+ *
+ * fwd_*      - frames taken off the delay queue and sent to the peer hook
+ * in_disc_*  - frames dropped because the inbound queue limit was exceeded
+ * out_disc_* - frames dropped by the BER lottery or because the outbound
+ *              (delay) queue limit was reached
+ */
+struct ng_pipe_hookstat {
+	u_int64_t		fwd_octets;
+	u_int64_t		fwd_frames;
+	u_int64_t		in_disc_octets;
+	u_int64_t		in_disc_frames;
+	u_int64_t		out_disc_octets;
+	u_int64_t		out_disc_frames;
+};
+
+/* Keep this in sync with the above structure definition */
+#define NG_PIPE_HOOKSTAT_INFO	{					\
+	{ "FwdOctets",		&ng_parse_uint64_type	},		\
+	{ "FwdFrames",		&ng_parse_uint64_type	},		\
+	{ "queueDropOctets",	&ng_parse_uint64_type	},		\
+	{ "queueDropFrames",	&ng_parse_uint64_type	},		\
+	{ "delayDropOctets",	&ng_parse_uint64_type	},		\
+	{ "delayDropFrames",	&ng_parse_uint64_type	},		\
+	{ NULL },							\
+}
+
+/*
+ * Statistics structure returned by NGM_PIPE_GET_STATS
+ *
+ * "downstream" is filled from the node's upper hook and "upstream" from
+ * its lower hook.
+ */
+struct ng_pipe_stats {
+	struct ng_pipe_hookstat	downstream;
+	struct ng_pipe_hookstat	upstream;
+};
+
+/* Keep this in sync with the above structure definition */
+#define NG_PIPE_STATS_INFO(hstype)	{				\
+	{ "downstream",		(hstype) },				\
+	{ "upstream",		(hstype) },				\
+	{ NULL },							\
+}
+
+/*
+ * Runtime structure for one hook
+ *
+ * fifo_queues - number of FIFO queues currently allocated on the hook
+ * qin_*       - octets / frames waiting in the inbound (bandwidth) queues
+ * qout_*      - octets / frames waiting in the outbound (delay) queue
+ */
+struct ng_pipe_hookrun {
+	u_int32_t		fifo_queues;
+	u_int32_t		qin_octets;
+	u_int32_t		qin_frames;
+	u_int32_t		qout_octets;
+	u_int32_t		qout_frames;
+};
+
+/* Keep this in sync with the above structure definition */
+#define NG_PIPE_HOOKRUN_INFO	{					\
+	{ "queues",		&ng_parse_uint32_type	},		\
+	{ "queuedOctets",	&ng_parse_uint32_type	},		\
+	{ "queuedFrames",	&ng_parse_uint32_type	},		\
+	{ "delayedOctets",	&ng_parse_uint32_type	},		\
+	{ "delayedFrames",	&ng_parse_uint32_type	},		\
+	{ NULL },							\
+}
+
+/* Runtime structure returned by NGM_PIPE_GET_RUN */
+struct ng_pipe_run {
+	struct ng_pipe_hookrun	downstream;
+	struct ng_pipe_hookrun	upstream;
+};
+
+/* Keep this in sync with the above structure definition */
+#define NG_PIPE_RUN_INFO(hstype)	{				\
+	{ "downstream",		(hstype) },				\
+	{ "upstream",		(hstype) },				\
+	{ NULL },							\
+}
+
+/*
+ * Config structure for one hook
+ *
+ * In a "setcfg" message a field left at 0 keeps the current setting and
+ * the value -1 resets a numeric field to 0; values outside the accepted
+ * range are ignored.
+ *
+ * bandwidth       - accepted range 100 .. 1000000000; the serialization
+ *                   delay is (frame + overhead bytes) * 8000000 / bandwidth
+ *                   microseconds
+ * ber             - accepted range 1 .. 1000000000000; a frame of N bytes
+ *                   survives with probability (1 - 1/ber)^(N*8)
+ * qin_size_limit  - inbound queue limit in frames (at least 5; a new hook
+ *                   starts with 50)
+ * qout_size_limit - delay queue limit in frames (at least 5); 0 disables
+ *                   the check
+ * duplicate       - accepted range 1 .. 50, compared against random() % 100
+ * fifo, drr, wfq  - queueing discipline, mutually exclusive; a drr value of
+ *                   32 or more is used as the DRR quantum, smaller non-zero
+ *                   values select a quantum of 2048
+ * droptail, drophead - which end of the longest queue loses a frame when
+ *                   the inbound limit is exceeded
+ */
+struct ng_pipe_hookcfg {
+	u_int64_t		bandwidth;
+	u_int64_t		ber;
+	u_int32_t		qin_size_limit;
+	u_int32_t		qout_size_limit;
+	u_int32_t		duplicate;
+	u_int32_t		fifo;
+	u_int32_t		drr;
+	u_int32_t		wfq;
+	u_int32_t		droptail;
+	u_int32_t		drophead;
+};
+
+/* Keep this in sync with the above structure definition */
+#define NG_PIPE_HOOKCFG_INFO	{					\
+	{ "bandwidth",		&ng_parse_uint64_type	},		\
+	{ "BER",		&ng_parse_uint64_type	},		\
+	{ "queuelen",		&ng_parse_uint32_type	},		\
+	{ "delaylen",		&ng_parse_uint32_type	},		\
+	{ "duplicate",		&ng_parse_uint32_type	},		\
+	{ "fifo",		&ng_parse_uint32_type	},		\
+	{ "drr",		&ng_parse_uint32_type	},		\
+	{ "wfq",		&ng_parse_uint32_type	},		\
+	{ "droptail",		&ng_parse_uint32_type	},		\
+	{ "drophead",		&ng_parse_uint32_type	},		\
+	{ NULL },							\
+}
+
+/*
+ * Config structure returned by NGM_PIPE_GET_CFG
+ *
+ * bandwidth     - when set, applied to both directions; "getcfg" reports
+ *                 it here (and zeroes the per-direction values) only when
+ *                 both directions use the same bandwidth
+ * delay         - propagation delay in microseconds, below 10000000
+ * header_offset - byte offset of the IP header used for flow hashing,
+ *                 below 64
+ * overhead      - bytes added to each frame length for the delay and BER
+ *                 calculations, below 256
+ * downstream    - settings of the upper hook
+ * upstream      - settings of the lower hook
+ */
+struct ng_pipe_cfg {
+	u_int64_t		bandwidth;
+	u_int64_t		delay;
+	u_int32_t		header_offset;
+	u_int32_t		overhead;
+	struct ng_pipe_hookcfg	downstream;
+	struct ng_pipe_hookcfg	upstream;
+};
+
+/* Keep this in sync with the above structure definition */
+#define NG_PIPE_CFG_INFO(hstype)	{				\
+	{ "bandwidth",		&ng_parse_uint64_type	},		\
+	{ "delay",		&ng_parse_uint64_type	},		\
+	{ "header_offset",	&ng_parse_uint32_type	},		\
+	{ "overhead",		&ng_parse_uint32_type	},		\
+	{ "downstream",		(hstype) },				\
+	{ "upstream",		(hstype) },				\
+	{ NULL },							\
+}
+
+/* Netgraph commands */
+enum {
+	NGM_PIPE_GET_STATS=1,		/* get stats */
+	NGM_PIPE_CLR_STATS,		/* clear stats */
+	NGM_PIPE_GETCLR_STATS,		/* atomically get and clear stats */
+	NGM_PIPE_GET_RUN,		/* get current runtime status */
+	NGM_PIPE_GET_CFG,		/* get configurable parameters */
+	NGM_PIPE_SET_CFG,		/* set configurable parameters */
+};
+
+#endif /* _NETGRAPH_PIPE_H_ */
--- /dev/null 2008-02-27 21:11:00.000000000 +0100
+++ src/sys/netgraph/ng_wormhole.c 2007-10-22 18:06:39.000000000 +0200
@@ -0,0 +1,451 @@
+/*-
+ * Copyright (c) 2007 University of Zagreb
+ * Copyright (c) 2007 FreeBSD Foundation
+ *
+ * This software was developed by the University of Zagreb and the
+ * FreeBSD Foundation under sponsorship by the Stichting NLnet and the
+ * FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * XXX RCS tag goes here
+ */
+
+/*
+ * A "worm" node can be used to establish a datapath between independent
+ * netgraph address spaces, i.e. between two virtual network stacks.
A + * wormhole path is defined by a pair of wormhole nodes each residing in + * a different stack instance. Each node accepts only a single + * arbitrarily named hook. Once a wormhole datapath is established, all + * data messages received on the local hook will be forwarded to the + * hook connected to the remote node, and vice versa. + * + * "worm" nodes understand two node-specific messages: "peer" and + * "status". The "peer" message is used to specify the remote + * endpoint in form of "remote_worm_node_name@remote_vnet_name", or + * to fetch the current peering configuration if invoked without + * arguments. Both involved nodes must configure their peerings before + * the datapath will be established. The "status" command can be used + * to check the current state of the wormhole path, which can be one of + * unconfigured, pending or active. + * + * NB while the vnet addressing space is currently flat, it is reasonable + * to expect that this could change in the nearest future, which may be + * reflected in the addressing model for ng_wormhole datapaths. 
+ * + * The following example shows how a netgraph path can be established + * between two network stack instances, "1" and "2": + * + * #!/bin/csh + * + * foreach vi (1 2) + * vimage -c $vi + * vimage $vi ngctl mkpeer eiface ether ether + * vimage $vi ngctl mkpeer ngeth0: worm ether ether + * vimage $vi ifconfig ngeth0 ether 40:0:0:0:0:$vi + * vimage $vi ifconfig ngeth0 10.0.0.$vi/24 + * end + * vimage 1 ngctl msg worm0: peer worm0@2 + * vimage 2 ngctl msg worm0: peer worm0@1 + * + */ + +#include "opt_vimage.h" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +struct ng_wormhole; +typedef struct ng_wormhole_priv *priv_p; + +#define NG_WORMHOLE_NODE_TYPE "worm" +#define NGM_WORMHOLE_COOKIE 20070806 + +static int ng_wormhole_mod_event(module_t, int, void *); +static ng_constructor_t ng_wormhole_constructor; +static ng_shutdown_t ng_wormhole_shutdown; +static ng_newhook_t ng_wormhole_newhook; +static ng_disconnect_t ng_wormhole_disconnect; +static ng_rcvdata_t ng_wormhole_rcvdata; +static ng_rcvmsg_t ng_wormhole_rcvmsg; +static vnet_attach_fn ng_wormhole_iattach; +static vnet_detach_fn ng_wormhole_idetach; +static void ng_wormhole_update_status(priv_p); +static ng_parse_t ng_wormhole_peer_parse; +static ng_unparse_t ng_wormhole_peer_unparse; +static ng_unparse_t ng_wormhole_status_unparse; + +/* Node state */ +enum { + NG_WORMHOLE_UNCONFIGURED = 0, + NG_WORMHOLE_PENDING, + NG_WORMHOLE_ACTIVE +}; + +/* Netgraph commands */ +enum { + NGM_WORMHOLE_PEER = 1, + NGM_WORMHOLE_STATUS +}; + +static const struct ng_parse_type ng_wormhole_peer_type = { + .parse = &ng_wormhole_peer_parse, + .unparse = &ng_wormhole_peer_unparse, +}; + +static const struct ng_parse_type ng_wormhole_status_type = { + .unparse = &ng_wormhole_status_unparse, +}; + +static const struct ng_cmdlist ng_wormhole_cmds[] = { + { + .cookie = NGM_WORMHOLE_COOKIE, + .cmd = NGM_WORMHOLE_PEER, + .name = "peer", + .mesgType = &ng_wormhole_peer_type, + 
.respType = &ng_wormhole_peer_type, + }, + { + .cookie = NGM_WORMHOLE_COOKIE, + .cmd = NGM_WORMHOLE_STATUS, + .name = "status", + .respType = &ng_wormhole_status_type, + }, + { 0 } +}; + +static struct ng_type typestruct = { + .version = NG_ABI_VERSION, + .name = NG_WORMHOLE_NODE_TYPE, + .mod_event = ng_wormhole_mod_event, + .constructor = ng_wormhole_constructor, + .rcvmsg = ng_wormhole_rcvmsg, + .shutdown = ng_wormhole_shutdown, + .newhook = ng_wormhole_newhook, + .rcvdata = ng_wormhole_rcvdata, + .disconnect = ng_wormhole_disconnect, + .cmdlist = ng_wormhole_cmds +}; +NETGRAPH_INIT(ng_wormhole, &typestruct); + +VNET_MOD_DECLARE_STATELESS(NG_WORMHOLE, ng_wormhole, ng_wormhole_iattach, + ng_wormhole_idetach, NETGRAPH) + +struct ng_wormhole_priv { + int status; + priv_p remote_priv; + struct vnet *vnet; + hook_p hook; + node_p node; + LIST_ENTRY(ng_wormhole_priv) all_wormholes_le; + int unit; +}; + +LIST_HEAD(, ng_wormhole_priv) all_wormholes_head; +/* XXX need a lock around the above list */ + +static int +ng_wormhole_constructor(node_p node) +{ + INIT_VNET_NETGRAPH(curvnet); + priv_p priv; + char buf[NG_NODESIZ]; + + MALLOC(priv, priv_p, sizeof(*priv), M_NETGRAPH, M_ZERO | M_NOWAIT); + if (priv == NULL) + return (ENOMEM); + + NG_NODE_SET_PRIVATE(node, priv); + priv->unit = alloc_unr(V_ng_wormhole_unit); + snprintf(buf, NG_NODESIZ, "%s%d", typestruct.name, priv->unit); + if (ng_name_node(node, buf) != 0) + log(LOG_WARNING, "%s: can't acquire netgraph name\n", buf); + priv->vnet = curvnet; + priv->node = node; + priv->hook = NULL; + priv->status = NG_WORMHOLE_UNCONFIGURED; + LIST_INSERT_HEAD(&all_wormholes_head, priv, all_wormholes_le); + return (0); +} + +static int +ng_wormhole_newhook(node_p node, hook_p hook, const char *name) +{ + priv_p priv = NG_NODE_PRIVATE(node); + + if (priv->hook) + return(EBUSY); + priv->hook = hook; + ng_wormhole_update_status(priv); + return (0); +} + +static int +ng_wormhole_disconnect(hook_p hook) +{ + priv_p priv = 
NG_NODE_PRIVATE(hook->hk_node); + + priv->hook = NULL; + ng_wormhole_update_status(priv); + return (0); +} + +static int +ng_wormhole_rcvmsg(node_p node, item_p item, hook_p lasthook) +{ + priv_p priv = NG_NODE_PRIVATE(node); + priv_p *remote_priv; + struct ng_mesg *resp = NULL; + struct ng_mesg *msg; + int error = 0; + + NGI_GET_MSG(item, msg); + switch (msg->header.typecookie) { + case NGM_WORMHOLE_COOKIE: + switch (msg->header.cmd) { + case NGM_WORMHOLE_PEER: + remote_priv = (priv_p *) &msg->data; + if (*remote_priv) { + if (*remote_priv == priv) + error = EINVAL; + else + priv->remote_priv = *remote_priv; + /* XXX drop all wormhole lock */ + ng_wormhole_update_status(priv); + } else { + NG_MKRESPONSE(resp, msg, + sizeof(priv->remote_priv), M_NOWAIT); + if (resp == NULL) + error = ENOMEM; + else + bcopy(&priv->remote_priv, resp->data, + sizeof(priv->remote_priv)); + } + + break; + case NGM_WORMHOLE_STATUS: + NG_MKRESPONSE(resp, msg, + sizeof(priv->status), M_NOWAIT); + if (resp == NULL) + error = ENOMEM; + else + bcopy(&priv->status, resp->data, + sizeof(priv->status)); + break; + default: + error = EINVAL; + break; + } + break; + default: + error = EINVAL; + break; + } + NG_RESPOND_MSG(error, node, item, resp); + NG_FREE_MSG(msg); + return (error); +} + +static int +ng_wormhole_peer_parse(const struct ng_parse_type *type, + const char *s, int *off, const u_char *const start, + u_char *const buf, int *buflen) +{ + char node_name_buf[NG_NODESIZ]; + char *t; + int len; + int error = 0; + priv_p *remote_priv = (priv_p *)buf; + + *buflen = sizeof(priv_p); + + while (isspace(s[*off])) + (*off)++; + if (strlen(&s[*off]) == 0) { + /* XXX to drop or not to drop the lock? 
*/ + *remote_priv = NULL; + return (error); + } + if ((t = index(s + *off, '@')) == NULL) + return (EINVAL); + if ((len = t - (s + *off)) > sizeof(node_name_buf) - 1) + return (EINVAL); + strncpy(node_name_buf, s + *off, len); + node_name_buf[len] = '\0'; + *off += len + 1; /* vnet name should be in &s[*off] now */ + + /* XXX should lock all wormhole list here */ + LIST_FOREACH(*remote_priv, &all_wormholes_head, all_wormholes_le) + if (strcmp((*remote_priv)->node->nd_name, node_name_buf) == 0 && + strcmp(vnet_name((*remote_priv)->vnet), &s[*off]) == 0) + break; + if (*remote_priv) { + /* XXX should return with the lock held, drop it in rcvmsg */ + } else { + error = ENOENT; + /* XXX should unlock the all wormholes list now */ + } + return (error); +} + +static int +ng_wormhole_peer_unparse(const struct ng_parse_type *type, + const u_char *data, int *off, char *cbuf, int cbuflen) +{ + const priv_p *remote_priv = (const priv_p *)(data + *off); + + if (*remote_priv) { + /* XXX lock all wormhole list; check whether remote exists */ + snprintf(cbuf, cbuflen, "%s@%s", + (*remote_priv)->node->nd_name, + vnet_name((*remote_priv)->vnet)); + *off += sizeof(*remote_priv); + } + return (0); +} + +static int +ng_wormhole_status_unparse(const struct ng_parse_type *type, + const u_char *data, int *off, char *cbuf, int cbuflen) +{ + const int *status = (const int *)(data + *off); + + switch (*status) { + case NG_WORMHOLE_UNCONFIGURED: + snprintf(cbuf, cbuflen, "unconfigured"); + break; + case NG_WORMHOLE_PENDING: + snprintf(cbuf, cbuflen, "pending"); + break; + case NG_WORMHOLE_ACTIVE: + snprintf(cbuf, cbuflen, "active"); + break; + default: + panic("unknown status %d", *status); + } + *off += sizeof(*status); + return (0); +} + +static void +ng_wormhole_update_status(priv_p priv) +{ + priv_p remote_priv; + + /* XXX lock / unlock the all wormhole list while doing this */ + remote_priv = priv->remote_priv; + if (remote_priv == NULL) + priv->status = NG_WORMHOLE_UNCONFIGURED; + else 
if (remote_priv->remote_priv != priv) + priv->status = NG_WORMHOLE_PENDING; + else if (remote_priv->hook == NULL || priv->hook == NULL) + priv->status = remote_priv->status = NG_WORMHOLE_PENDING; + else + priv->status = remote_priv->status = NG_WORMHOLE_ACTIVE; +} + +static int +ng_wormhole_rcvdata(hook_p hook, item_p item) +{ + priv_p priv = NG_NODE_PRIVATE(NG_HOOK_NODE(hook)); + int error = 0; + priv_p remote_priv = priv->remote_priv; + struct mbuf *m; + + if (priv->status != NG_WORMHOLE_ACTIVE) { + NG_FREE_ITEM(item); + error = ENOTCONN; + } else { + m = NGI_M(item); + m->m_flags |= M_REMOTE_VNET; + CURVNET_SET_QUIET(remote_priv->vnet); + NG_FWD_ITEM_HOOK(error, item, remote_priv->hook); + CURVNET_RESTORE(); + } + return (error); +} + +static int +ng_wormhole_shutdown(node_p node) +{ + priv_p priv = NG_NODE_PRIVATE(node); + INIT_VNET_NETGRAPH(priv->vnet); + + LIST_REMOVE(priv, all_wormholes_le); + free_unr(V_ng_wormhole_unit, priv->unit); + FREE(priv, M_NETGRAPH); + NG_NODE_SET_PRIVATE(node, NULL); + NG_NODE_UNREF(node); + return (0); +} + +static int +ng_wormhole_mod_event(module_t mod, int event, void *data) +{ + int error = 0; + + switch (event) { + case MOD_LOAD: + vnet_mod_register(&vnet_ng_wormhole_modinfo); + break; + case MOD_UNLOAD: + vnet_mod_deregister(&vnet_ng_wormhole_modinfo); + break; + default: + error = EOPNOTSUPP; + break; + } + return (error); +} + +static int ng_wormhole_iattach(const void *unused) +{ + INIT_VNET_NETGRAPH(curvnet); + + V_ng_wormhole_unit = new_unrhdr(0, 0xffff, NULL); + return (0); +} + +static int ng_wormhole_idetach(const void *unused) +{ + INIT_VNET_NETGRAPH(curvnet); + node_p node; + + do { + LIST_FOREACH(node, &V_ng_nodelist, nd_nodes) + if (node->nd_type == &typestruct) { + ng_rmnode_self(node); + break; + } + } while (node != NULL); + delete_unrhdr(V_ng_wormhole_unit); + return (0); +} --- /dev/null 2008-02-27 21:11:00.000000000 +0100 +++ src/sys/netgraph/vnetgraph.h 2007-10-05 12:27:01.000000000 +0200 @@ -0,0 +1,65 @@ 
+/*- + * Copyright (c) 2006 University of Zagreb + * Copyright (c) 2006 FreeBSD Foundation + * + * This software was developed by the University of Zagreb and the + * FreeBSD Foundation under sponsorship by the Stichting NLnet and the + * FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * XXX RCS tag goes here + */ + +#ifndef _NETGRAPH_VNETGRAPH_H_ +#define _NETGRAPH_VNETGRAPH_H_ + +#include + +#define INIT_VNET_NETGRAPH(vnet) \ + INIT_FROM_VNET(vnet, VNET_MOD_NETGRAPH, \ + struct vnet_netgraph, vnet_netgraph) + +#define VNET_NETGRAPH(sym) VSYM(vnet_netgraph, sym) + +#define NG_ID_HASH_SIZE 32 /* most systems won't need even this many */ + +#ifdef VIMAGE +struct vnet_netgraph { + LIST_HEAD(, ng_node) _ng_ID_hash[NG_ID_HASH_SIZE]; + LIST_HEAD(, ng_node) _ng_nodelist; + ng_ID_t _nextID; + struct unrhdr *_ng_iface_unit; + struct unrhdr *_ng_eiface_unit; + struct unrhdr *_ng_wormhole_unit; +}; +#endif + +/* Symbol translation macros */ +#define V_ng_ID_hash VNET_NETGRAPH(ng_ID_hash) +#define V_ng_nodelist VNET_NETGRAPH(ng_nodelist) +#define V_nextID VNET_NETGRAPH(nextID) +#define V_ng_iface_unit VNET_NETGRAPH(ng_iface_unit) +#define V_ng_eiface_unit VNET_NETGRAPH(ng_eiface_unit) +#define V_ng_wormhole_unit VNET_NETGRAPH(ng_wormhole_unit) + +#endif /* !_NETGRAPH_VNETGRAPH_H_ */ --- /u/marko/p4/head/src/sys/netinet/icmp_var.h 2007-08-31 03:47:59.000000000 +0200 +++ src/sys/netinet/icmp_var.h 2007-10-05 12:27:01.000000000 +0200 @@ -74,7 +74,9 @@ #ifdef _KERNEL SYSCTL_DECL(_net_inet_icmp); +#ifndef VIMAGE extern struct icmpstat icmpstat; /* icmp statistics */ +#endif extern int badport_bandlim(int); #define BANDLIM_UNLIMITED -1 #define BANDLIM_ICMP_UNREACH 0 --- /u/marko/p4/head/src/sys/netinet/if_ether.c 2008-01-04 13:50:39.000000000 +0100 +++ src/sys/netinet/if_ether.c 2008-01-14 19:23:52.000000000 +0100 @@ -41,6 +41,7 @@ #include "opt_inet.h" #include "opt_mac.h" #include "opt_carp.h" +#include "opt_vimage.h" #include #include @@ -49,9 +50,12 @@ #include #include #include +#include #include #include +#include +#include #include #include #include @@ -60,6 +64,7 @@ #include #include +#include #include #include #include @@ -80,10 +85,12 @@ SYSCTL_NODE(_net_link_ether, PF_INET, inet, CTLFLAG_RW, 0, ""); /* timer values */ -static int arpt_keep = 
(20*60); /* once resolved, good for 20 more minutes */ +#ifndef VIMAGE +static int arpt_keep; +#endif -SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_age, CTLFLAG_RW, - &arpt_keep, 0, "ARP entry lifetime in seconds"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_link_ether_inet, OID_AUTO, max_age, + CTLFLAG_RW, arpt_keep, 0, "ARP entry lifetime in seconds"); #define rt_expire rt_rmx.rmx_expire @@ -96,20 +103,25 @@ }; static struct ifqueue arpintrq; -static int arp_allocated; -static int arp_maxtries = 5; -static int useloopback = 1; /* use loopback interface for local traffic */ -static int arp_proxyall = 0; +#ifndef VIMAGE +static int arp_maxtries; +static int useloopback; +static int arp_proxyall; +#endif -SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxtries, CTLFLAG_RW, - &arp_maxtries, 0, "ARP resolution attempts before returning error"); -SYSCTL_INT(_net_link_ether_inet, OID_AUTO, useloopback, CTLFLAG_RW, - &useloopback, 0, "Use the loopback interface for local traffic"); -SYSCTL_INT(_net_link_ether_inet, OID_AUTO, proxyall, CTLFLAG_RW, - &arp_proxyall, 0, "Enable proxy ARP for all suitable requests"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_link_ether_inet, OID_AUTO, maxtries, + CTLFLAG_RW, arp_maxtries, 0, + "ARP resolution attempts before returning error"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_link_ether_inet, OID_AUTO, useloopback, + CTLFLAG_RW, useloopback, 0, + "Use the loopback interface for local traffic"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_link_ether_inet, OID_AUTO, proxyall, + CTLFLAG_RW, arp_proxyall, 0, + "Enable proxy ARP for all suitable requests"); static void arp_init(void); +static int arp_iattach(const void *); static void arp_rtrequest(int, struct rtentry *, struct rt_addrinfo *); static void arprequest(struct ifnet *, struct in_addr *, struct in_addr *, u_char *); @@ -121,6 +133,8 @@ static void in_arpinput(struct mbuf *); #endif +VNET_MOD_DECLARE_STATELESS(ARP, arp, arp_iattach, NULL, INET) + /* * Timeout routine. 
*/ @@ -138,7 +152,9 @@ */ RT_UNLOCK(rt); + CURVNET_SET(rt->rt_ifp->if_vnet); rtrequest(RTM_DELETE, rt_key(rt), NULL, rt_mask(rt), 0, NULL); + CURVNET_RESTORE(); } /* @@ -147,6 +163,8 @@ static void arp_rtrequest(int req, struct rtentry *rt, struct rt_addrinfo *info) { + INIT_VNET_NET(curvnet); + INIT_VNET_INET(curvnet); struct sockaddr *gate; struct llinfo_arp *la; static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK}; @@ -213,7 +231,6 @@ log(LOG_DEBUG, "%s: malloc failed\n", __func__); break; } - arp_allocated++; /* * We are storing a route entry outside of radix tree. So, * it can be found and accessed by other means than radix @@ -248,7 +265,7 @@ } #endif - TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) { + TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (ia->ia_ifp == rt->rt_ifp && SIN(rt_key(rt))->sin_addr.s_addr == (IA_SIN(ia))->sin_addr.s_addr) @@ -268,9 +285,9 @@ rt->rt_expire = 0; bcopy(IF_LLADDR(rt->rt_ifp), LLADDR(SDL(gate)), SDL(gate)->sdl_alen = rt->rt_ifp->if_addrlen); - if (useloopback) { - rt->rt_ifp = loif; - rt->rt_rmx.rmx_mtu = loif->if_mtu; + if (V_useloopback) { + rt->rt_ifp = V_loif; + rt->rt_rmx.rmx_mtu = V_loif->if_mtu; } /* @@ -358,6 +375,7 @@ arpresolve(struct ifnet *ifp, struct rtentry *rt0, struct mbuf *m, struct sockaddr *dst, u_char *desten) { + INIT_VNET_INET(ifp->if_vnet); struct llinfo_arp *la = NULL; struct rtentry *rt = NULL; struct sockaddr_dl *sdl; @@ -468,7 +486,7 @@ * if we have already sent arp_maxtries ARP requests. Retransmit the * ARP request, but not faster than one request per second. */ - if (la->la_asked < arp_maxtries) + if (la->la_asked < V_arp_maxtries) error = EWOULDBLOCK; /* First request. */ else error = (rt == rt0) ? EHOSTDOWN : EHOSTUNREACH; @@ -589,7 +607,8 @@ sin.sin_len = sizeof(struct sockaddr_in); sin.sin_family = AF_INET; sin.sin_addr.s_addr = 0; - + INIT_VNET_INET(ifp->if_vnet); + if (ifp->if_bridge) bridged = 1; @@ -644,7 +663,7 @@ /* * If bridging, fall back to using any inet address. 
*/ - if (!bridged || (ia = TAILQ_FIRST(&in_ifaddrhead)) == NULL) + if (!bridged || (ia = TAILQ_FIRST(&V_in_ifaddrhead)) == NULL) goto drop; match: if (!enaddr) @@ -780,11 +799,11 @@ th->rcf = trld->trld_rcf; } if (rt->rt_expire) { - rt->rt_expire = time_uptime + arpt_keep; - callout_reset(&la->la_timer, hz * arpt_keep, arptimer, rt); + rt->rt_expire = time_uptime + V_arpt_keep; + callout_reset(&la->la_timer, hz * V_arpt_keep, arptimer, rt); } la->la_asked = 0; - la->la_preempt = arp_maxtries; + la->la_preempt = V_arp_maxtries; hold = la->la_hold; la->la_hold = NULL; RT_UNLOCK(rt); @@ -803,7 +822,7 @@ if (rt == NULL) { struct sockaddr_in sin; - if (!arp_proxyall) + if (!V_arp_proxyall) goto drop; bzero(&sin, sizeof sin); @@ -971,12 +990,31 @@ ifa->ifa_flags |= RTF_CLONING; } +static int +arp_iattach(unused) + const void *unused; +{ + INIT_VNET_INET(curvnet); + + V_arpt_keep = (20*60); /* once resolved, good for 20 more minutes */ + V_arp_maxtries = 5; + V_useloopback = 1; /* use loopback interface for local traffic */ + V_arp_proxyall = 0; + + return 0; +} + static void arp_init(void) { - +#ifdef VIMAGE + vnet_mod_register(&vnet_arp_modinfo); +#else + arp_iattach(NULL); +#endif arpintrq.ifq_maxlen = 50; mtx_init(&arpintrq.ifq_mtx, "arp_inq", NULL, MTX_DEF); netisr_register(NETISR_ARP, arpintr, &arpintrq, NETISR_MPSAFE); } + SYSINIT(arp, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, arp_init, 0); --- /u/marko/p4/head/src/sys/netinet/igmp.c 2007-10-29 17:17:43.000000000 +0100 +++ src/sys/netinet/igmp.c 2007-12-10 11:26:11.000000000 +0100 @@ -48,6 +48,7 @@ __FBSDID("$FreeBSD: src/sys/netinet/igmp.c,v 1.56 2007/10/28 15:55:21 rwatson Exp $"); #include "opt_mac.h" +#include "opt_vimage.h" #include #include @@ -57,10 +58,13 @@ #include #include #include +#include +#include #include #include +#include #include #include #include @@ -79,10 +83,12 @@ static struct router_info *find_rti(struct ifnet *ifp); static void igmp_sendpkt(struct in_multi *, int, unsigned long); +#ifndef VIMAGE 
static struct igmpstat igmpstat; +#endif -SYSCTL_STRUCT(_net_inet_igmp, IGMPCTL_STATS, stats, CTLFLAG_RW, &igmpstat, - igmpstat, ""); +SYSCTL_V_STRUCT(V_NET, vnet_inet, _net_inet_igmp, IGMPCTL_STATS, + stats, CTLFLAG_RW, igmpstat, igmpstat, ""); /* * igmp_mtx protects all mutable global variables in igmp.c, as well as the @@ -92,7 +98,9 @@ * when accessed via an in_multi read-only. */ static struct mtx igmp_mtx; +#ifndef VIMAGE static SLIST_HEAD(, router_info) router_info_head; +#endif static int igmp_timers_are_running; /* @@ -115,8 +123,12 @@ void igmp_init(void) { + INIT_VNET_INET(curvnet); struct ipoption *ra; +#ifdef VIMAGE + if (IS_DEFAULT_VNET(curvnet)) { +#endif /* * To avoid byte-swapping the same value over and over again. */ @@ -138,17 +150,22 @@ router_alert->m_len = sizeof(ra->ipopt_dst) + ra->ipopt_list[1]; mtx_init(&igmp_mtx, "igmp_mtx", NULL, MTX_DEF); - SLIST_INIT(&router_info_head); +#ifdef VIMAGE + } +#endif + + SLIST_INIT(&V_router_info_head); } static struct router_info * find_rti(struct ifnet *ifp) { + INIT_VNET_INET(ifp->if_vnet); struct router_info *rti; mtx_assert(&igmp_mtx, MA_OWNED); IGMP_PRINTF("[igmp.c, _find_rti] --> entering \n"); - SLIST_FOREACH(rti, &router_info_head, rti_list) { + SLIST_FOREACH(rti, &V_router_info_head, rti_list) { if (rti->rti_ifp == ifp) { IGMP_PRINTF( "[igmp.c, _find_rti] --> found old entry \n"); @@ -163,7 +180,7 @@ rti->rti_ifp = ifp; rti->rti_type = IGMP_V2_ROUTER; rti->rti_time = 0; - SLIST_INSERT_HEAD(&router_info_head, rti, rti_list); + SLIST_INSERT_HEAD(&V_router_info_head, rti, rti_list); IGMP_PRINTF("[igmp.c, _find_rti] --> created an entry \n"); return (rti); } @@ -182,8 +199,9 @@ struct in_multistep step; struct router_info *rti; int timer; /** timer value in the igmp query header **/ + INIT_VNET_INET(ifp->if_vnet); - ++igmpstat.igps_rcv_total; + ++V_igmpstat.igps_rcv_total; ip = mtod(m, struct ip *); igmplen = ip->ip_len; @@ -192,14 +210,14 @@ * Validate lengths. 
*/ if (igmplen < IGMP_MINLEN) { - ++igmpstat.igps_rcv_tooshort; + ++V_igmpstat.igps_rcv_tooshort; m_freem(m); return; } minlen = iphlen + IGMP_MINLEN; if ((m->m_flags & M_EXT || m->m_len < minlen) && (m = m_pullup(m, minlen)) == 0) { - ++igmpstat.igps_rcv_tooshort; + ++V_igmpstat.igps_rcv_tooshort; return; } @@ -210,7 +228,7 @@ m->m_len -= iphlen; igmp = mtod(m, struct igmp *); if (in_cksum(m, igmplen)) { - ++igmpstat.igps_rcv_badsum; + ++V_igmpstat.igps_rcv_badsum; m_freem(m); return; } @@ -235,7 +253,7 @@ */ switch (igmp->igmp_type) { case IGMP_MEMBERSHIP_QUERY: - ++igmpstat.igps_rcv_queries; + ++V_igmpstat.igps_rcv_queries; if (ifp->if_flags & IFF_LOOPBACK) break; @@ -262,7 +280,7 @@ if (ip->ip_dst.s_addr != igmp_all_hosts_group || igmp->igmp_group.s_addr != 0) { - ++igmpstat.igps_rcv_badqueries; + ++V_igmpstat.igps_rcv_badqueries; m_freem(m); return; } @@ -273,7 +291,7 @@ if (igmp->igmp_group.s_addr != 0 && !IN_MULTICAST(ntohl(igmp->igmp_group.s_addr))) { - ++igmpstat.igps_rcv_badqueries; + ++V_igmpstat.igps_rcv_badqueries; m_freem(m); return; } @@ -321,13 +339,13 @@ ip->ip_src.s_addr == IA_SIN(ia)->sin_addr.s_addr) break; - ++igmpstat.igps_rcv_reports; + ++V_igmpstat.igps_rcv_reports; if (ifp->if_flags & IFF_LOOPBACK) break; if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr))) { - ++igmpstat.igps_rcv_badreports; + ++V_igmpstat.igps_rcv_badreports; m_freem(m); return; } @@ -354,7 +372,7 @@ IN_LOOKUP_MULTI(igmp->igmp_group, ifp, inm); if (inm != NULL) { inm->inm_timer = 0; - ++igmpstat.igps_rcv_ourreports; + ++V_igmpstat.igps_rcv_ourreports; inm->inm_state = IGMP_OTHERMEMBER; } IN_MULTI_UNLOCK(); @@ -422,6 +440,8 @@ IN_MULTI_LOCK(); igmp_timers_are_running = 0; + VNET_ITERLOOP_BEGIN(); + INIT_VNET_INET(vnet_iter); IN_FIRST_MULTI(step, inm); while (inm != NULL) { if (inm->inm_timer == 0) { @@ -434,6 +454,7 @@ } IN_NEXT_MULTI(step, inm); } + VNET_ITERLOOP_END(); IN_MULTI_UNLOCK(); } @@ -444,13 +465,16 @@ IGMP_PRINTF("[igmp.c,_slowtimo] -- > entering \n"); 
mtx_lock(&igmp_mtx); - SLIST_FOREACH(rti, &router_info_head, rti_list) { + VNET_ITERLOOP_BEGIN() + INIT_VNET_INET(vnet_iter); + SLIST_FOREACH(rti, &V_router_info_head, rti_list) { if (rti->rti_type == IGMP_V1_ROUTER) { rti->rti_time++; if (rti->rti_time >= IGMP_AGE_THRESHOLD) rti->rti_type = IGMP_V2_ROUTER; } } + VNET_ITERLOOP_END() mtx_unlock(&igmp_mtx); IGMP_PRINTF("[igmp.c,_slowtimo] -- > exiting \n"); } @@ -458,6 +482,8 @@ static void igmp_sendpkt(struct in_multi *inm, int type, unsigned long addr) { + INIT_VNET_NET(curvnet); + INIT_VNET_INET(curvnet); struct mbuf *m; struct igmp *igmp; struct ip *ip; @@ -469,7 +495,7 @@ if (m == NULL) return; - m->m_pkthdr.rcvif = loif; + m->m_pkthdr.rcvif = V_loif; #ifdef MAC mac_netinet_igmp_send(inm->inm_ifp, m); #endif @@ -501,12 +527,12 @@ * Request loopback of the report if we are acting as a multicast * router, so that the process-level routing daemon can hear it. */ - imo.imo_multicast_loop = (ip_mrouter != NULL); + imo.imo_multicast_loop = (V_ip_mrouter != NULL); /* * XXX: Do we have to worry about reentrancy here? Don't think so. 
*/ ip_output(m, router_alert, &igmprt, 0, &imo, NULL); - ++igmpstat.igps_snd_reports; + ++V_igmpstat.igps_snd_reports; } --- /u/marko/p4/head/src/sys/netinet/in.c 2008-01-28 23:53:52.000000000 +0100 +++ src/sys/netinet/in.c 2008-02-27 11:49:02.000000000 +0100 @@ -34,6 +34,7 @@ __FBSDID("$FreeBSD: src/sys/netinet/in.c,v 1.103 2008/01/24 08:14:38 bz Exp $"); #include "opt_carp.h" +#include "opt_vimage.h" #include #include @@ -43,11 +44,14 @@ #include #include #include +#include +#include #include #include #include +#include #include #include #include @@ -65,16 +69,19 @@ struct in_ifaddr *, struct sockaddr_in *, int); static void in_purgemaddrs(struct ifnet *); -static int subnetsarelocal = 0; -SYSCTL_INT(_net_inet_ip, OID_AUTO, subnets_are_local, CTLFLAG_RW, - &subnetsarelocal, 0, "Treat all subnets as directly connected"); -static int sameprefixcarponly = 0; -SYSCTL_INT(_net_inet_ip, OID_AUTO, same_prefix_carp_only, CTLFLAG_RW, - &sameprefixcarponly, 0, - "Refuse to create same prefixes on different interfaces"); - +#ifndef VIMAGE +static int subnetsarelocal; +static int sameprefixcarponly; extern struct inpcbinfo ripcbinfo; extern struct inpcbinfo udbinfo; +#endif + +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, subnets_are_local, + CTLFLAG_RW, subnetsarelocal, 0, + "Treat all subnets as directly connected"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, same_prefix_carp_only, + CTLFLAG_RW, sameprefixcarponly, 0, + "Refuse to create same prefixes on different interfaces"); /* * Return 1 if an internet address is for a ``local'' host @@ -85,15 +92,16 @@ int in_localaddr(struct in_addr in) { + INIT_VNET_INET(curvnet); register u_long i = ntohl(in.s_addr); register struct in_ifaddr *ia; - if (subnetsarelocal) { - TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) + if (V_subnetsarelocal) { + TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) if ((i & ia->ia_netmask) == ia->ia_net) return (1); } else { - TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) + 
TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) if ((i & ia->ia_subnetmask) == ia->ia_subnet) return (1); } @@ -107,6 +115,7 @@ int in_localip(struct in_addr in) { + INIT_VNET_INET(curvnet); struct in_ifaddr *ia; LIST_FOREACH(ia, INADDR_HASH(in.s_addr), ia_hash) { @@ -199,6 +208,7 @@ in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { + INIT_VNET_INET(curvnet); /* so and ifp can be 0 ! */ register struct ifreq *ifr = (struct ifreq *)data; register struct in_ifaddr *ia = 0, *iap; register struct ifaddr *ifa; @@ -328,7 +338,7 @@ } ia->ia_ifp = ifp; - TAILQ_INSERT_TAIL(&in_ifaddrhead, ia, ia_link); + TAILQ_INSERT_TAIL(&V_in_ifaddrhead, ia, ia_link); splx(s); iaIsNew = 1; } @@ -492,7 +502,7 @@ */ s = splnet(); TAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifa_link); - TAILQ_REMOVE(&in_ifaddrhead, ia, ia_link); + TAILQ_REMOVE(&V_in_ifaddrhead, ia, ia_link); if (ia->ia_addr.sin_family == AF_INET) { LIST_REMOVE(ia, ia_hash); /* @@ -707,6 +717,7 @@ in_ifinit(struct ifnet *ifp, struct in_ifaddr *ia, struct sockaddr_in *sin, int scrub) { + INIT_VNET_INET(ifp->if_vnet); register u_long i = ntohl(sin->sin_addr.s_addr); struct sockaddr_in oldaddr; int s = splimp(), flags = RTF_UP, error = 0; @@ -801,6 +812,7 @@ static int in_addprefix(struct in_ifaddr *target, int flags) { + INIT_VNET_INET(curvnet); struct in_ifaddr *ia; struct in_addr prefix, mask, p, m; int error; @@ -814,7 +826,7 @@ prefix.s_addr &= mask.s_addr; } - TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) { + TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (rtinitflags(ia)) { p = ia->ia_addr.sin_addr; @@ -835,7 +847,7 @@ * interface address, we are done here. 
*/ if (ia->ia_flags & IFA_ROUTE) { - if (sameprefixcarponly && + if (V_sameprefixcarponly && target->ia_ifp->if_type != IFT_CARP && ia->ia_ifp->if_type != IFT_CARP) return (EEXIST); @@ -861,6 +873,7 @@ static int in_scrubprefix(struct in_ifaddr *target) { + INIT_VNET_INET(curvnet); struct in_ifaddr *ia; struct in_addr prefix, mask, p; int error; @@ -876,7 +889,7 @@ prefix.s_addr &= mask.s_addr; } - TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) { + TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (rtinitflags(ia)) p = ia->ia_dstaddr.sin_addr; else { @@ -967,6 +980,8 @@ static void in_purgemaddrs(struct ifnet *ifp) { + INIT_VNET_INET(ifp->if_vnet); + struct in_multi *inm; struct in_multi *oinm; @@ -975,7 +990,7 @@ #endif IFF_LOCKGIANT(ifp); IN_MULTI_LOCK(); - LIST_FOREACH_SAFE(inm, &in_multihead, inm_link, oinm) { + LIST_FOREACH_SAFE(inm, &V_in_multihead, inm_link, oinm) { if (inm->inm_ifp == ifp) in_delmulti_locked(inm); } @@ -989,8 +1004,9 @@ void in_ifdetach(struct ifnet *ifp) { + INIT_VNET_INET(ifp->if_vnet); - in_pcbpurgeif0(&ripcbinfo, ifp); - in_pcbpurgeif0(&udbinfo, ifp); + in_pcbpurgeif0(&V_ripcbinfo, ifp); + in_pcbpurgeif0(&V_udbinfo, ifp); in_purgemaddrs(ifp); } --- /u/marko/p4/head/src/sys/netinet/in_gif.c 2007-10-16 13:53:37.000000000 +0200 +++ src/sys/netinet/in_gif.c 2007-10-22 18:06:40.000000000 +0200 @@ -35,6 +35,7 @@ #include "opt_mrouting.h" #include "opt_inet.h" #include "opt_inet6.h" +#include "opt_vimage.h" #include #include @@ -45,12 +46,13 @@ #include #include #include - #include +#include #include #include +#include #include #include #include @@ -85,13 +87,16 @@ .pr_usrreqs = &rip_usrreqs }; -static int ip_gif_ttl = GIF_TTL; -SYSCTL_INT(_net_inet_ip, IPCTL_GIF_TTL, gifttl, CTLFLAG_RW, - &ip_gif_ttl, 0, ""); +#ifndef VIMAGE +int ip_gif_ttl; +#endif +SYSCTL_V_INT(V_NET, vnet_gif, _net_inet_ip, IPCTL_GIF_TTL, gifttl, + CTLFLAG_RW, ip_gif_ttl, 0, ""); int in_gif_output(struct ifnet *ifp, int family, struct mbuf *m) { + INIT_VNET_GIF(ifp->if_vnet); 
struct gif_softc *sc = ifp->if_softc; struct sockaddr_in *dst = (struct sockaddr_in *)&sc->gif_ro.ro_dst; struct sockaddr_in *sin_src = (struct sockaddr_in *)sc->gif_psrc; @@ -176,7 +181,7 @@ } iphdr.ip_p = proto; /* version will be set in ip_output() */ - iphdr.ip_ttl = ip_gif_ttl; + iphdr.ip_ttl = V_ip_gif_ttl; iphdr.ip_len = m->m_pkthdr.len + sizeof(struct ip); ip_ecn_ingress((ifp->if_flags & IFF_LINK1) ? ECN_ALLOWED : ECN_NOCARE, &iphdr.ip_tos, &tos); @@ -239,6 +244,7 @@ void in_gif_input(struct mbuf *m, int off) { + INIT_VNET_INET(curvnet); struct ifnet *gifp = NULL; struct gif_softc *sc; struct ip *ip; @@ -252,14 +258,14 @@ sc = (struct gif_softc *)encap_getarg(m); if (sc == NULL) { m_freem(m); - ipstat.ips_nogif++; + V_ipstat.ips_nogif++; return; } gifp = GIF2IFP(sc); if (gifp == NULL || (gifp->if_flags & IFF_UP) == 0) { m_freem(m); - ipstat.ips_nogif++; + V_ipstat.ips_nogif++; return; } @@ -319,7 +325,7 @@ break; default: - ipstat.ips_nogif++; + V_ipstat.ips_nogif++; m_freem(m); return; } @@ -333,6 +339,7 @@ static int gif_validate4(const struct ip *ip, struct gif_softc *sc, struct ifnet *ifp) { + INIT_VNET_INET(curvnet); struct sockaddr_in *src, *dst; struct in_ifaddr *ia4; @@ -352,7 +359,7 @@ return 0; } /* reject packets with broadcast on source */ - TAILQ_FOREACH(ia4, &in_ifaddrhead, ia_link) { + TAILQ_FOREACH(ia4, &V_in_ifaddrhead, ia_link) { if ((ia4->ia_ifa.ifa_ifp->if_flags & IFF_BROADCAST) == 0) continue; if (ip->ip_src.s_addr == ia4->ia_broadaddr.sin_addr.s_addr) --- /u/marko/p4/head/src/sys/netinet/in_gif.h 2007-08-31 03:48:00.000000000 +0200 +++ src/sys/netinet/in_gif.h 2007-10-05 12:27:02.000000000 +0200 @@ -35,6 +35,9 @@ #define GIF_TTL 30 +#ifndef VIMAGE +extern int ip_gif_ttl; +#endif struct gif_softc; void in_gif_input(struct mbuf *, int); int in_gif_output(struct ifnet *, int, struct mbuf *); --- /u/marko/p4/head/src/sys/netinet/in_mcast.c 2007-11-07 23:37:16.000000000 +0100 +++ src/sys/netinet/in_mcast.c 2007-10-22 18:06:41.000000000 
+0200 @@ -39,6 +39,8 @@ #include __FBSDID("$FreeBSD: src/sys/netinet/in_mcast.c,v 1.3 2007/08/06 22:06:36 csjp Exp $"); +#include "opt_vimage.h" + #include #include #include @@ -48,11 +50,14 @@ #include #include #include +#include #include #include #include +#include +#include #include #include #include @@ -85,7 +90,9 @@ * ip_output() to send IGMP packets while holding the lock; this probably is * not quite desirable. */ +#ifndef VIMAGE struct in_multihead in_multihead; /* XXX BSS initialization */ +#endif struct mtx in_multi_mtx; MTX_SYSINIT(in_multi_mtx, &in_multi_mtx, "in_multi_mtx", MTX_DEF | MTX_RECURSE); @@ -312,6 +319,7 @@ struct in_multi * in_addmulti(struct in_addr *ap, struct ifnet *ifp) { + INIT_VNET_INET(ifp->if_vnet); struct in_multi *inm; inm = NULL; @@ -373,7 +381,7 @@ ninm->inm_ifma = ifma; ninm->inm_refcount = 1; ifma->ifma_protospec = ninm; - LIST_INSERT_HEAD(&in_multihead, ninm, inm_link); + LIST_INSERT_HEAD(&V_in_multihead, ninm, inm_link); igmp_joingroup(ninm); @@ -464,6 +472,8 @@ static int inp_change_source_filter(struct inpcb *inp, struct sockopt *sopt) { + INIT_VNET_NET(curvnet); + INIT_VNET_INET(curvnet); struct group_source_req gsr; sockunion_t *gsa, *ssa; struct ifnet *ifp; @@ -532,7 +542,7 @@ ssa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); - if (gsr.gsr_interface == 0 || if_index < gsr.gsr_interface) + if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(gsr.gsr_interface); @@ -753,6 +763,7 @@ static int inp_get_source_filters(struct inpcb *inp, struct sockopt *sopt) { + INIT_VNET_NET(curvnet); struct __msfilterreq msfr; sockunion_t *gsa; struct ifnet *ifp; @@ -776,7 +787,7 @@ if (error) return (error); - if (msfr.msfr_ifindex == 0 || if_index < msfr.msfr_ifindex) + if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex) return (EINVAL); ifp = ifnet_byindex(msfr.msfr_ifindex); @@ -850,6 +861,7 @@ int inp_getmoptions(struct inpcb *inp, struct sockopt *sopt) { + 
INIT_VNET_INET(curvnet); struct ip_mreqn mreqn; struct ip_moptions *imo; struct ifnet *ifp; @@ -956,6 +968,8 @@ static int inp_join_group(struct inpcb *inp, struct sockopt *sopt) { + INIT_VNET_NET(curvnet); + INIT_VNET_INET(curvnet); struct group_source_req gsr; sockunion_t *gsa, *ssa; struct ifnet *ifp; @@ -1036,7 +1050,7 @@ } else { struct in_ifaddr *ia; struct ifnet *mfp = NULL; - TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) { + TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { mfp = ia->ia_ifp; if (!(mfp->if_flags & IFF_LOOPBACK) && (mfp->if_flags & IFF_MULTICAST)) { @@ -1089,7 +1103,7 @@ /* * Obtain the ifp. */ - if (gsr.gsr_interface == 0 || if_index < gsr.gsr_interface) + if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(gsr.gsr_interface); @@ -1211,6 +1225,8 @@ static int inp_leave_group(struct inpcb *inp, struct sockopt *sopt) { + INIT_VNET_NET(curvnet); + INIT_VNET_INET(curvnet); struct group_source_req gsr; struct ip_mreq_source mreqs; sockunion_t *gsa, *ssa; @@ -1298,7 +1314,7 @@ return (EINVAL); } - if (gsr.gsr_interface == 0 || if_index < gsr.gsr_interface) + if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(gsr.gsr_interface); @@ -1399,6 +1415,7 @@ static int inp_set_multicast_if(struct inpcb *inp, struct sockopt *sopt) { + INIT_VNET_NET(curvnet); struct in_addr addr; struct ip_mreqn mreqn; struct ifnet *ifp; @@ -1415,7 +1432,7 @@ if (error) return (error); - if (mreqn.imr_ifindex < 0 || if_index < mreqn.imr_ifindex) + if (mreqn.imr_ifindex < 0 || V_if_index < mreqn.imr_ifindex) return (EINVAL); if (mreqn.imr_ifindex == 0) { @@ -1467,6 +1484,7 @@ static int inp_set_source_filters(struct inpcb *inp, struct sockopt *sopt) { + INIT_VNET_NET(curvnet); struct __msfilterreq msfr; sockunion_t *gsa; struct ifnet *ifp; @@ -1496,7 +1514,7 @@ gsa->sin.sin_port = 0; /* ignore port */ - if (msfr.msfr_ifindex == 0 || if_index < msfr.msfr_ifindex) + if 
(msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex) return (EADDRNOTAVAIL); ifp = ifnet_byindex(msfr.msfr_ifindex); @@ -1829,12 +1847,14 @@ static struct ifnet * ip_multicast_if(struct in_addr *a) { + INIT_VNET_NET(curvnet); + INIT_VNET_INET(curvnet); int ifindex; struct ifnet *ifp; if (ntohl(a->s_addr) >> 24 == 0) { ifindex = ntohl(a->s_addr) & 0xffffff; - if (ifindex < 0 || if_index < ifindex) + if (ifindex < 0 || V_if_index < ifindex) return NULL; ifp = ifnet_byindex(ifindex); } else --- /u/marko/p4/head/src/sys/netinet/in_pcb.c 2007-12-27 19:32:24.000000000 +0100 +++ src/sys/netinet/in_pcb.c 2008-01-14 19:23:52.000000000 +0100 @@ -38,6 +38,7 @@ #include "opt_ipsec.h" #include "opt_inet6.h" #include "opt_mac.h" +#include "opt_vimage.h" #include #include @@ -52,6 +53,7 @@ #include #include #include +#include #ifdef DDB #include @@ -59,10 +61,12 @@ #include +#include #include #include #include +#include #include #include #include @@ -74,7 +78,7 @@ #include #include #endif /* INET6 */ - +#include #ifdef IPSEC #include @@ -83,50 +87,60 @@ #include +#ifndef VIMAGE /* * These configure the range of local port addresses assigned to * "unspecified" outgoing connections/packets/whatever. */ -int ipport_lowfirstauto = IPPORT_RESERVED - 1; /* 1023 */ -int ipport_lowlastauto = IPPORT_RESERVEDSTART; /* 600 */ -int ipport_firstauto = IPPORT_HIFIRSTAUTO; /* 49152 */ -int ipport_lastauto = IPPORT_HILASTAUTO; /* 65535 */ -int ipport_hifirstauto = IPPORT_HIFIRSTAUTO; /* 49152 */ -int ipport_hilastauto = IPPORT_HILASTAUTO; /* 65535 */ +int ipport_lowfirstauto; +int ipport_lowlastauto; +int ipport_firstauto; +int ipport_lastauto; +int ipport_hifirstauto; +int ipport_hilastauto; /* * Reserved ports accessible only to root. There are significant * security considerations that must be accounted for when changing these, * but the security benefits can be great. Please be careful. 
*/ -int ipport_reservedhigh = IPPORT_RESERVED - 1; /* 1023 */ -int ipport_reservedlow = 0; +int ipport_reservedhigh; +int ipport_reservedlow; /* Variables dealing with random ephemeral port allocation. */ -int ipport_randomized = 1; /* user controlled via sysctl */ -int ipport_randomcps = 10; /* user controlled via sysctl */ -int ipport_randomtime = 45; /* user controlled via sysctl */ -int ipport_stoprandom = 0; /* toggled by ipport_tick */ +int ipport_randomized; +int ipport_randomcps; +int ipport_randomtime; +int ipport_stoprandom; int ipport_tcpallocs; int ipport_tcplastcount; +#endif #define RANGECHK(var, min, max) \ if ((var) < (min)) { (var) = (min); } \ else if ((var) > (max)) { (var) = (max); } static int +#ifndef VIMAGE sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS) +#else +sysctl_net_ipport_check(SYSCTL_HANDLER_V_ARGS) +#endif { +#ifdef VIMAGE + INIT_VNET_INET(curvnet); + SYSCTL_RESOLVE_V_ARG1(); +#endif int error; - error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); + error = sysctl_handle_int(oidp, arg1, arg2, req); if (error == 0) { - RANGECHK(ipport_lowfirstauto, 1, IPPORT_RESERVED - 1); - RANGECHK(ipport_lowlastauto, 1, IPPORT_RESERVED - 1); - RANGECHK(ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX); - RANGECHK(ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX); - RANGECHK(ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX); - RANGECHK(ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX); + RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1); + RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1); + RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX); + RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX); + RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX); + RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX); } return (error); } @@ -135,30 +149,37 @@ SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0, "IP Ports"); -SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, CTLTYPE_INT|CTLFLAG_RW, - 
&ipport_lowfirstauto, 0, &sysctl_net_ipport_check, "I", ""); -SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, CTLTYPE_INT|CTLFLAG_RW, - &ipport_lowlastauto, 0, &sysctl_net_ipport_check, "I", ""); -SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, CTLTYPE_INT|CTLFLAG_RW, - &ipport_firstauto, 0, &sysctl_net_ipport_check, "I", ""); -SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, CTLTYPE_INT|CTLFLAG_RW, - &ipport_lastauto, 0, &sysctl_net_ipport_check, "I", ""); -SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, CTLTYPE_INT|CTLFLAG_RW, - &ipport_hifirstauto, 0, &sysctl_net_ipport_check, "I", ""); -SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, CTLTYPE_INT|CTLFLAG_RW, - &ipport_hilastauto, 0, &sysctl_net_ipport_check, "I", ""); -SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh, - CTLFLAG_RW|CTLFLAG_SECURE, &ipport_reservedhigh, 0, ""); -SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow, - CTLFLAG_RW|CTLFLAG_SECURE, &ipport_reservedlow, 0, ""); -SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized, CTLFLAG_RW, - &ipport_randomized, 0, "Enable random port allocation"); -SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomcps, CTLFLAG_RW, - &ipport_randomcps, 0, "Maximum number of random port " - "allocations before switching to a sequental one"); -SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtime, CTLFLAG_RW, - &ipport_randomtime, 0, "Minimum time to keep sequental port " - "allocation before switching to a random one"); +SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, + lowfirst, CTLTYPE_INT|CTLFLAG_RW, ipport_lowfirstauto, 0, + &sysctl_net_ipport_check, "I", ""); +SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, + lowlast, CTLTYPE_INT|CTLFLAG_RW, ipport_lowlastauto, 0, + &sysctl_net_ipport_check, "I", ""); +SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, + first, CTLTYPE_INT|CTLFLAG_RW, ipport_firstauto, 0, + &sysctl_net_ipport_check, "I", ""); +SYSCTL_V_PROC(V_NET, 
vnet_inet, _net_inet_ip_portrange, OID_AUTO, + last, CTLTYPE_INT|CTLFLAG_RW, ipport_lastauto, 0, + &sysctl_net_ipport_check, "I", ""); +SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, + hifirst, CTLTYPE_INT|CTLFLAG_RW, ipport_hifirstauto, 0, + &sysctl_net_ipport_check, "I", ""); +SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, + hilast, CTLTYPE_INT|CTLFLAG_RW, ipport_hilastauto, 0, + &sysctl_net_ipport_check, "I", ""); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, + reservedhigh, CTLFLAG_RW|CTLFLAG_SECURE, ipport_reservedhigh, 0, ""); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, reservedlow, + CTLFLAG_RW|CTLFLAG_SECURE, ipport_reservedlow, 0, ""); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, randomized, + CTLFLAG_RW, ipport_randomized, 0, "Enable random port allocation"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, randomcps, + CTLFLAG_RW, ipport_randomcps, 0, "Maximum number of random port " + "allocations before switching to a sequental one"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, randomtime, + CTLFLAG_RW, ipport_randomtime, 0, + "Minimum time to keep sequental port " + "allocation before switching to a random one"); /* * in_pcb.c: manage the Protocol Control Blocks. 
@@ -175,6 +196,9 @@ int in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) { +#ifdef INET6 + INIT_VNET_INET6(curvnet); +#endif struct inpcb *inp; int error; @@ -207,7 +231,7 @@ #ifdef INET6 if (INP_SOCKAF(so) == AF_INET6) { inp->inp_vflag |= INP_IPV6PROTO; - if (ip6_v6only) + if (V_ip6_v6only) inp->inp_flags |= IN6P_IPV6_V6ONLY; } #endif @@ -215,7 +239,7 @@ pcbinfo->ipi_count++; so->so_pcb = (caddr_t)inp; #ifdef INET6 - if (ip6_auto_flowlabel) + if (V_ip6_auto_flowlabel) inp->inp_flags |= IN6P_AUTOFLOWLABEL; #endif INP_LOCK(inp); @@ -268,6 +292,7 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, u_short *lportp, struct ucred *cred) { + INIT_VNET_INET(inp->inp_vnet); struct socket *so = inp->inp_socket; unsigned short *lastport; struct sockaddr_in *sin; @@ -281,7 +306,7 @@ INP_INFO_WLOCK_ASSERT(pcbinfo); INP_LOCK_ASSERT(inp); - if (TAILQ_EMPTY(&in_ifaddrhead)) /* XXX broken! */ + if (TAILQ_EMPTY(&V_in_ifaddrhead)) /* XXX broken! */ return (EADDRNOTAVAIL); laddr.s_addr = *laddrp; if (nam != NULL && laddr.s_addr != INADDR_ANY) @@ -332,8 +357,8 @@ struct tcptw *tw; /* GROSS */ - if (ntohs(lport) <= ipport_reservedhigh && - ntohs(lport) >= ipport_reservedlow && + if (ntohs(lport) <= V_ipport_reservedhigh && + ntohs(lport) >= V_ipport_reservedlow && priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0)) return (EACCES); @@ -401,20 +426,20 @@ return (EINVAL); if (inp->inp_flags & INP_HIGHPORT) { - first = ipport_hifirstauto; /* sysctl */ - last = ipport_hilastauto; + first = V_ipport_hifirstauto; /* sysctl */ + last = V_ipport_hilastauto; lastport = &pcbinfo->ipi_lasthi; } else if (inp->inp_flags & INP_LOWPORT) { error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0); if (error) return error; - first = ipport_lowfirstauto; /* 1023 */ - last = ipport_lowlastauto; /* 600 */ + first = V_ipport_lowfirstauto; /* 1023 */ + last = V_ipport_lowlastauto; /* 600 */ lastport = &pcbinfo->ipi_lastlow; } else { - first = ipport_firstauto; /* 
sysctl */ - last = ipport_lastauto; + first = V_ipport_firstauto; /* sysctl */ + last = V_ipport_lastauto; lastport = &pcbinfo->ipi_lastport; } /* @@ -423,8 +448,8 @@ * use random port allocation only if the user allows it AND * ipport_tick() allows it. */ - if (ipport_randomized && - (!ipport_stoprandom || pcbinfo == &udbinfo)) + if (V_ipport_randomized && + (!V_ipport_stoprandom || pcbinfo == &V_udbinfo)) dorandom = 1; else dorandom = 0; @@ -435,8 +460,8 @@ if (first == last) dorandom = 0; /* Make sure to not include UDP packets in the count. */ - if (pcbinfo != &udbinfo) - ipport_tcpallocs++; + if (pcbinfo != &V_udbinfo) + V_ipport_tcpallocs++; /* * Simple check to ensure all ports are not used up causing * a deadlock here. @@ -556,6 +581,7 @@ in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp, struct inpcb **oinpp, struct ucred *cred) { + INIT_VNET_INET(inp->inp_vnet); struct sockaddr_in *sin = (struct sockaddr_in *)nam; struct in_ifaddr *ia; struct sockaddr_in sa; @@ -591,7 +617,7 @@ if (error) return (error); } - if (!TAILQ_EMPTY(&in_ifaddrhead)) { + if (!TAILQ_EMPTY(&V_in_ifaddrhead)) { /* * If the destination address is INADDR_ANY, * use the primary local address. @@ -600,12 +626,12 @@ * choose the broadcast address for that interface. 
*/ if (faddr.s_addr == INADDR_ANY) - faddr = IA_SIN(TAILQ_FIRST(&in_ifaddrhead))->sin_addr; + faddr = IA_SIN(TAILQ_FIRST(&V_in_ifaddrhead))->sin_addr; else if (faddr.s_addr == (u_long)INADDR_BROADCAST && - (TAILQ_FIRST(&in_ifaddrhead)->ia_ifp->if_flags & + (TAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags & IFF_BROADCAST)) faddr = satosin(&TAILQ_FIRST( - &in_ifaddrhead)->ia_broadaddr)->sin_addr; + &V_in_ifaddrhead)->ia_broadaddr)->sin_addr; } if (laddr.s_addr == INADDR_ANY) { ia = (struct in_ifaddr *)0; @@ -650,7 +676,7 @@ imo = inp->inp_moptions; if (imo->imo_multicast_ifp != NULL) { ifp = imo->imo_multicast_ifp; - TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) + TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) if (ia->ia_ifp == ifp) break; if (ia == 0) @@ -1213,13 +1239,15 @@ void ipport_tick(void *xtp) { - - if (ipport_tcpallocs <= ipport_tcplastcount + ipport_randomcps) { - if (ipport_stoprandom > 0) - ipport_stoprandom--; + VNET_ITERLOOP_BEGIN(); + INIT_VNET_INET(curvnet); + if (V_ipport_tcpallocs <= V_ipport_tcplastcount + V_ipport_randomcps) { + if (V_ipport_stoprandom > 0) + V_ipport_stoprandom--; } else - ipport_stoprandom = ipport_randomtime; - ipport_tcplastcount = ipport_tcpallocs; + V_ipport_stoprandom = V_ipport_randomtime; + V_ipport_tcplastcount = V_ipport_tcpallocs; + VNET_ITERLOOP_END(); callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL); } --- /u/marko/p4/head/src/sys/netinet/in_pcb.h 2007-12-27 19:32:25.000000000 +0100 +++ src/sys/netinet/in_pcb.h 2008-01-14 19:23:52.000000000 +0100 @@ -189,6 +189,8 @@ #define in6p_lport inp_lport /* for KAME src sync over BSD*'s */ #define in6p_fport inp_fport /* for KAME src sync over BSD*'s */ #define in6p_ppcb inp_ppcb /* for KAME src sync over BSD*'s */ + +#define inp_vnet inp_pcbinfo->ipi_vnet }; /* * The range of the generation count, as used in this implementation, is 9e19. 
@@ -270,7 +272,8 @@ * vimage 1 * general use 1 */ - void *ipi_pspare[2]; + struct vnet *ipi_vnet; + void *ipi_pspare[1]; }; #define INP_LOCK_INIT(inp, d, t) \ @@ -355,6 +358,7 @@ #define INP_CHECK_SOCKAF(so, af) (INP_SOCKAF(so) == af) #ifdef _KERNEL +#ifndef VIMAGE extern int ipport_reservedhigh; extern int ipport_reservedlow; extern int ipport_lowfirstauto; @@ -363,6 +367,11 @@ extern int ipport_lastauto; extern int ipport_hifirstauto; extern int ipport_hilastauto; +extern int ipport_randomized; +extern int ipport_randomcps; +extern int ipport_randomtime; +extern int ipport_stoprandom; +#endif extern struct callout ipport_tick_callout; void in_pcbpurgeif0(struct inpcbinfo *, struct ifnet *); --- /u/marko/p4/head/src/sys/netinet/in_proto.c 2007-10-16 13:53:37.000000000 +0200 +++ src/sys/netinet/in_proto.c 2007-10-22 18:06:41.000000000 +0200 @@ -39,6 +39,7 @@ #include "opt_pf.h" #include "opt_carp.h" #include "opt_sctp.h" +#include "opt_vimage.h" #include #include @@ -120,6 +121,9 @@ .pr_ctlinput = udp_ctlinput, .pr_ctloutput = ip_ctloutput, .pr_init = udp_init, +#ifdef VIMAGE + .pr_destroy = udp_destroy, +#endif .pr_usrreqs = &udp_usrreqs }, { @@ -131,6 +135,9 @@ .pr_ctlinput = tcp_ctlinput, .pr_ctloutput = tcp_ctloutput, .pr_init = tcp_init, +#ifdef VIMAGE + .pr_destroy = tcp_destroy, +#endif .pr_slowtimo = tcp_slowtimo, .pr_drain = tcp_drain, .pr_usrreqs = &tcp_usrreqs @@ -341,11 +348,15 @@ .pr_input = rip_input, .pr_ctloutput = rip_ctloutput, .pr_init = rip_init, +#ifdef VIMAGE + .pr_destroy = rip_destroy, +#endif .pr_usrreqs = &rip_usrreqs }, }; extern int in_inithead(void **, int); +extern int in_detachhead(void **, int); struct domain inetdomain = { .dom_family = AF_INET, @@ -353,6 +364,9 @@ .dom_protosw = inetsw, .dom_protoswNPROTOSW = &inetsw[sizeof(inetsw)/sizeof(inetsw[0])], .dom_rtattach = in_inithead, +#ifdef VIMAGE + .dom_rtdetach = in_detachhead, +#endif .dom_rtoffset = 32, .dom_maxrtkey = sizeof(struct sockaddr_in) }; --- 
/u/marko/p4/head/src/sys/netinet/in_rmx.c 2008-02-27 18:29:07.000000000 +0100 +++ src/sys/netinet/in_rmx.c 2008-02-27 17:59:33.000000000 +0100 @@ -43,6 +43,8 @@ #include __FBSDID("$FreeBSD: src/sys/netinet/in_rmx.c,v 1.58 2008/02/07 11:26:52 glebius Exp $"); +#include "opt_vimage.h" + #include #include #include @@ -51,14 +53,20 @@ #include #include #include +#include +#include #include #include +#include #include #include #include -extern int in_inithead(void **head, int off); +int in_inithead(void **head, int off); +#ifdef VIMAGE +int in_detachhead(void **head, int off); +#endif #define RTPRF_OURS RTF_PROTO3 /* set on routes we manage */ @@ -149,18 +157,23 @@ return rn; } -static int rtq_reallyold = 60*60; /* one hour is "really old" */ -SYSCTL_INT(_net_inet_ip, IPCTL_RTEXPIRE, rtexpire, CTLFLAG_RW, - &rtq_reallyold, 0, "Default expiration time on dynamically learned routes"); +#ifndef VIMAGE +static int rtq_reallyold; +static int rtq_minreallyold; +static int rtq_toomany; +#endif -static int rtq_minreallyold = 10; /* never automatically crank down to less */ -SYSCTL_INT(_net_inet_ip, IPCTL_RTMINEXPIRE, rtminexpire, CTLFLAG_RW, - &rtq_minreallyold, 0, +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_RTEXPIRE, rtexpire, + CTLFLAG_RW, rtq_reallyold, 0, + "Default expiration time on dynamically learned routes"); + +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_RTMINEXPIRE, + rtminexpire, CTLFLAG_RW, rtq_minreallyold, 0, "Minimum time to attempt to hold onto dynamically learned routes"); -static int rtq_toomany = 128; /* 128 cached routes is "too many" */ -SYSCTL_INT(_net_inet_ip, IPCTL_RTMAXCACHE, rtmaxcache, CTLFLAG_RW, - &rtq_toomany, 0, "Upper limit on dynamically learned routes"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_RTMAXCACHE, + rtmaxcache, CTLFLAG_RW, rtq_toomany, 0, + "Upper limit on dynamically learned routes"); /* * On last reference drop, mark the route as belong to us so that it can be @@ -169,6 +182,7 @@ static void 
in_clsroute(struct radix_node *rn, struct radix_node_head *head) { + INIT_VNET_INET(curvnet); struct rtentry *rt = (struct rtentry *)rn; RT_LOCK_ASSERT(rt); @@ -189,9 +203,9 @@ * If rtq_reallyold is 0, just delete the route without * waiting for a timeout cycle to kill it. */ - if (rtq_reallyold != 0) { + if (V_rtq_reallyold != 0) { rt->rt_flags |= RTPRF_OURS; - rt->rt_rmx.rmx_expire = time_uptime + rtq_reallyold; + rt->rt_rmx.rmx_expire = time_uptime + V_rtq_reallyold; } else { rtexpunge(rt); } @@ -214,6 +228,7 @@ static int in_rtqkill(struct radix_node *rn, void *rock) { + INIT_VNET_INET(curvnet); struct rtqk_arg *ap = rock; struct rtentry *rt = (struct rtentry *)rn; int err; @@ -237,9 +252,9 @@ } else { if (ap->updating && (rt->rt_rmx.rmx_expire - time_uptime > - rtq_reallyold)) { + V_rtq_reallyold)) { rt->rt_rmx.rmx_expire = - time_uptime + rtq_reallyold; + time_uptime + V_rtq_reallyold; } ap->nextstop = lmin(ap->nextstop, rt->rt_rmx.rmx_expire); @@ -250,20 +265,25 @@ } #define RTQ_TIMEOUT 60*10 /* run no less than once every ten minutes */ -static int rtq_timeout = RTQ_TIMEOUT; +#ifndef VIMAGE +static int rtq_timeout; static struct callout rtq_timer; +#endif static void in_rtqtimo(void *rock) { - struct radix_node_head *rnh = rock; + CURVNET_SET_QUIET((struct vnet *) rock); + INIT_VNET_NET((struct vnet *) rock); + INIT_VNET_INET((struct vnet *) rock); + struct radix_node_head *rnh = V_rt_tables[AF_INET]; struct rtqk_arg arg; struct timeval atv; static time_t last_adjusted_timeout = 0; arg.found = arg.killed = 0; arg.rnh = rnh; - arg.nextstop = time_uptime + rtq_timeout; + arg.nextstop = time_uptime + V_rtq_timeout; arg.draining = arg.updating = 0; RADIX_NODE_HEAD_LOCK(rnh); rnh->rnh_walktree(rnh, in_rtqkill, &arg); @@ -277,18 +297,18 @@ * than once in rtq_timeout seconds, to keep from cranking down too * hard. 
*/ - if ((arg.found - arg.killed > rtq_toomany) && - (time_uptime - last_adjusted_timeout >= rtq_timeout) && - rtq_reallyold > rtq_minreallyold) { - rtq_reallyold = 2 * rtq_reallyold / 3; - if (rtq_reallyold < rtq_minreallyold) { - rtq_reallyold = rtq_minreallyold; + if ((arg.found - arg.killed > V_rtq_toomany) && + (time_uptime - last_adjusted_timeout >= V_rtq_timeout) && + V_rtq_reallyold > V_rtq_minreallyold) { + V_rtq_reallyold = 2 * V_rtq_reallyold / 3; + if (V_rtq_reallyold < V_rtq_minreallyold) { + V_rtq_reallyold = V_rtq_minreallyold; } last_adjusted_timeout = time_uptime; #ifdef DIAGNOSTIC log(LOG_DEBUG, "in_rtqtimo: adjusted rtq_reallyold to %d\n", - rtq_reallyold); + V_rtq_reallyold); #endif arg.found = arg.killed = 0; arg.updating = 1; @@ -299,13 +319,16 @@ atv.tv_usec = 0; atv.tv_sec = arg.nextstop - time_uptime; - callout_reset(&rtq_timer, tvtohz(&atv), in_rtqtimo, rock); + callout_reset(&V_rtq_timer, tvtohz(&atv), in_rtqtimo, rock); + CURVNET_RESTORE(); } void in_rtqdrain(void) { - struct radix_node_head *rnh = rt_tables[AF_INET]; + VNET_ITERLOOP_BEGIN(); + INIT_VNET_NET(vnet_iter); + struct radix_node_head *rnh = V_rt_tables[AF_INET]; struct rtqk_arg arg; arg.found = arg.killed = 0; @@ -316,6 +339,7 @@ RADIX_NODE_HEAD_LOCK(rnh); rnh->rnh_walktree(rnh, in_rtqkill, &arg); RADIX_NODE_HEAD_UNLOCK(rnh); + VNET_ITERLOOP_END(); } /* @@ -324,23 +348,40 @@ int in_inithead(void **head, int off) { + INIT_VNET_NET(curvnet); + INIT_VNET_INET(curvnet); struct radix_node_head *rnh; if (!rn_inithead(head, off)) return 0; - if (head != (void **)&rt_tables[AF_INET]) /* BOGUS! */ + if (head != (void **)&V_rt_tables[AF_INET]) /* BOGUS! 
*/ return 1; /* only do this for the real routing table */ + V_rtq_reallyold = 60*60; /* one hour is "really old" */ + V_rtq_minreallyold = 10; /* never automatically crank down to less */ + V_rtq_toomany = 128; /* 128 cached routes is "too many" */ + V_rtq_timeout = RTQ_TIMEOUT; rnh = *head; rnh->rnh_addaddr = in_addroute; rnh->rnh_matchaddr = in_matroute; rnh->rnh_close = in_clsroute; - callout_init(&rtq_timer, CALLOUT_MPSAFE); - in_rtqtimo(rnh); /* kick off timeout first time */ + callout_init(&V_rtq_timer, CALLOUT_MPSAFE); + in_rtqtimo(curvnet); /* kick off timeout first time */ return 1; } +#ifdef VIMAGE +int +in_detachhead(void **head, int off) +{ + INIT_VNET_INET(curvnet); + + callout_drain(&V_rtq_timer); + return 1; +} +#endif + /* * This zaps old routes when the interface goes down or interface * address is deleted. In the latter case, it deletes static routes @@ -382,13 +423,14 @@ int in_ifadown(struct ifaddr *ifa, int delete) { + INIT_VNET_NET(curvnet); struct in_ifadown_arg arg; struct radix_node_head *rnh; if (ifa->ifa_addr->sa_family != AF_INET) return 1; - rnh = rt_tables[AF_INET]; + rnh = V_rt_tables[AF_INET]; arg.ifa = ifa; arg.del = delete; RADIX_NODE_HEAD_LOCK(rnh); --- /u/marko/p4/head/src/sys/netinet/in_var.h 2007-08-31 03:48:00.000000000 +0200 +++ src/sys/netinet/in_var.h 2007-10-05 12:27:02.000000000 +0200 @@ -84,20 +84,33 @@ /* * Hash table for IP addresses. 
*/ -extern LIST_HEAD(in_ifaddrhashhead, in_ifaddr) *in_ifaddrhashtbl; -extern TAILQ_HEAD(in_ifaddrhead, in_ifaddr) in_ifaddrhead; +LIST_HEAD(in_ifaddrhashhead, in_ifaddr); +TAILQ_HEAD(in_ifaddrhead, in_ifaddr); +#ifndef VIMAGE +extern struct in_ifaddrhashhead *in_ifaddrhashtbl; +extern struct in_ifaddrhead in_ifaddrhead; extern u_long in_ifaddrhmask; /* mask for hash table */ +#endif -#define INADDR_NHASH_LOG2 9 -#define INADDR_NHASH (1 << INADDR_NHASH_LOG2) -#define INADDR_HASHVAL(x) fnv_32_buf((&(x)), sizeof(x), FNV1_32_INIT) -#define INADDR_HASH(x) \ - (&in_ifaddrhashtbl[INADDR_HASHVAL(x) & in_ifaddrhmask]) +/* + * IP datagram reassembly. + */ +#define IPREASS_NHASH_LOG2 6 +#define IPREASS_NHASH (1 << IPREASS_NHASH_LOG2) +#define IPREASS_HMASK (IPREASS_NHASH - 1) +#define IPREASS_HASH(x,y) \ + (((((x) & 0xF) | ((((x) >> 8) & 0xF) << 4)) ^ (y)) & IPREASS_HMASK) /* * Macro for finding the internet address structure (in_ifaddr) * corresponding to one of our IP addresses (in_addr). */ +#define INADDR_NHASH_LOG2 9 +#define INADDR_NHASH (1 << INADDR_NHASH_LOG2) +#define INADDR_HASHVAL(x) fnv_32_buf((&(x)), sizeof(x), FNV1_32_INIT) +#define INADDR_HASH(x) \ + (&V_in_ifaddrhashtbl[INADDR_HASHVAL(x) & V_in_ifaddrhmask]) + #define INADDR_TO_IFADDR(addr, ia) \ /* struct in_addr addr; */ \ /* struct in_ifaddr *ia; */ \ @@ -130,7 +143,7 @@ /* struct ifnet *ifp; */ \ /* struct in_ifaddr *ia; */ \ { \ - for ((ia) = TAILQ_FIRST(&in_ifaddrhead); \ + for ((ia) = TAILQ_FIRST(&V_in_ifaddrhead); \ (ia) != NULL && (ia)->ia_ifp != (ifp); \ (ia) = TAILQ_NEXT((ia), ia_link)) \ continue; \ @@ -218,7 +231,11 @@ SYSCTL_DECL(_net_inet_raw); #endif -extern LIST_HEAD(in_multihead, in_multi) in_multihead; +LIST_HEAD(in_multihead, in_multi); + +#ifndef VIMAGE +extern struct in_multihead in_multihead; +#endif /* * Lock macros for IPv4 layer multicast address lists. 
IPv4 lock goes @@ -283,7 +300,7 @@ /* struct in_multi *inm; */ \ do { \ IN_MULTI_LOCK_ASSERT(); \ - (step).i_inm = LIST_FIRST(&in_multihead); \ + (step).i_inm = LIST_FIRST(&V_in_multihead); \ IN_NEXT_MULTI((step), (inm)); \ } while(0) --- /u/marko/p4/head/src/sys/netinet/ip6.h 2007-08-31 03:48:00.000000000 +0200 +++ src/sys/netinet/ip6.h 2007-10-05 12:27:02.000000000 +0200 @@ -275,24 +275,24 @@ if (((m)->m_flags & M_LOOP) && \ ((m)->m_len < (off) + (hlen)) && \ (((m) = m_pullup((m), (off) + (hlen))) == NULL)) { \ - ip6stat.ip6s_exthdrtoolong++; \ + V_ip6stat.ip6s_exthdrtoolong++; \ return ret; \ } else if ((m)->m_flags & M_EXT) { \ if ((m)->m_len < (off) + (hlen)) { \ - ip6stat.ip6s_exthdrtoolong++; \ + V_ip6stat.ip6s_exthdrtoolong++; \ m_freem(m); \ return ret; \ } \ } else { \ if ((m)->m_len < (off) + (hlen)) { \ - ip6stat.ip6s_exthdrtoolong++; \ + V_ip6stat.ip6s_exthdrtoolong++; \ m_freem(m); \ return ret; \ } \ } \ } else { \ if ((m)->m_len < (off) + (hlen)) { \ - ip6stat.ip6s_tooshort++; \ + V_ip6stat.ip6s_tooshort++; \ in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated); \ m_freem(m); \ return ret; \ --- /u/marko/p4/head/src/sys/netinet/ip_fw.h 2008-01-28 23:53:52.000000000 +0100 +++ src/sys/netinet/ip_fw.h 2008-02-27 11:49:06.000000000 +0100 @@ -28,6 +28,9 @@ #ifndef _IPFW2_H #define _IPFW2_H +#include +#include + /* * The kernel representation of ipfw rules is made of a list of * 'instructions' (for all practical purposes equivalent to BPF @@ -546,6 +549,34 @@ */ #ifdef _KERNEL +/* + * Data structure to cache our ucred related + * information. This structure only gets used if + * the user specified UID/GID based constraints in + * a firewall rule. 
+ */ +struct ip_fw_ugid { + gid_t fw_groups[NGROUPS]; + int fw_ngroups; + uid_t fw_uid; + int fw_prid; +}; + +#define IPFW_TABLES_MAX 128 +struct ip_fw_chain { + struct ip_fw *rules; /* list of rules */ + struct ip_fw *reap; /* list of rules to reap */ + LIST_HEAD(, cfg_nat) nat; /* list of nat entries */ + struct radix_node_head *tables[IPFW_TABLES_MAX]; + struct rwlock rwmtx; +}; + +struct table_entry { + struct radix_node rn[2]; + struct sockaddr_in addr, mask; + u_int32_t value; +}; + /* Return values from ipfw_chk() */ enum { IP_FW_PASS = 0, @@ -615,16 +646,103 @@ typedef int ip_fw_ctl_t(struct sockopt *); extern ip_fw_ctl_t *ip_fw_ctl_ptr; + +#ifndef VIMAGE extern int fw_one_pass; extern int fw_enable; #ifdef INET6 extern int fw6_enable; #endif +#endif /* For kernel ipfw_ether and ipfw_bridge. */ typedef int ip_fw_chk_t(struct ip_fw_args *args); extern ip_fw_chk_t *ip_fw_chk_ptr; #define IPFW_LOADED (ip_fw_chk_ptr != NULL) +/* + * Stack virtualization support. + */ +#ifdef VIMAGE +struct vnet_ipfw { + int _fw_one_pass; + int _fw_enable; + int _fw6_enable; + + u_int32_t _set_disable; + int _fw_deny_unknown_exthdrs; + int _fw_verbose; + int _verbose_limit; + int _fw_debug; + int _autoinc_step; + + ipfw_dyn_rule **_ipfw_dyn_v; + struct ip_fw_chain _layer3_chain; + u_int32_t _dyn_buckets; + u_int32_t _curr_dyn_buckets; + + u_int32_t _dyn_ack_lifetime; + u_int32_t _dyn_syn_lifetime; + u_int32_t _dyn_fin_lifetime; + u_int32_t _dyn_rst_lifetime; + u_int32_t _dyn_udp_lifetime; + u_int32_t _dyn_short_lifetime; + u_int32_t _dyn_keepalive_interval; + u_int32_t _dyn_keepalive_period; + u_int32_t _dyn_keepalive; + u_int32_t _static_count; + u_int32_t _static_len; + u_int32_t _dyn_count; + u_int32_t _dyn_max; + + u_int64_t _norule_counter; + + struct callout _ipfw_timeout; +}; +#endif + +/* + * Symbol translation macros + */ + +#define INIT_VNET_IPFW(vnet) \ + INIT_FROM_VNET(vnet, VNET_MOD_IPFW, struct vnet_ipfw, vnet_ipfw) + +#define VNET_IPFW(sym) VSYM(vnet_ipfw, sym) + 
+#define V_fw_one_pass VNET_IPFW(fw_one_pass) +#define V_fw_enable VNET_IPFW(fw_enable) +#define V_fw6_enable VNET_IPFW(fw6_enable) + +#define V_set_disable VNET_IPFW(set_disable) +#define V_fw_deny_unknown_exthdrs VNET_IPFW(fw_deny_unknown_exthdrs) +#define V_fw_verbose VNET_IPFW(fw_verbose) +#define V_verbose_limit VNET_IPFW(verbose_limit) + +#define V_fw_debug VNET_IPFW(fw_debug) +#define V_autoinc_step VNET_IPFW(autoinc_step) + +#define V_ipfw_dyn_v VNET_IPFW(ipfw_dyn_v) +#define V_layer3_chain VNET_IPFW(layer3_chain) +#define V_dyn_buckets VNET_IPFW(dyn_buckets) +#define V_curr_dyn_buckets VNET_IPFW(curr_dyn_buckets) + +#define V_dyn_ack_lifetime VNET_IPFW(dyn_ack_lifetime) +#define V_dyn_syn_lifetime VNET_IPFW(dyn_syn_lifetime) +#define V_dyn_fin_lifetime VNET_IPFW(dyn_fin_lifetime) +#define V_dyn_rst_lifetime VNET_IPFW(dyn_rst_lifetime) +#define V_dyn_udp_lifetime VNET_IPFW(dyn_udp_lifetime) +#define V_dyn_short_lifetime VNET_IPFW(dyn_short_lifetime) +#define V_dyn_keepalive_interval VNET_IPFW(dyn_keepalive_interval) +#define V_dyn_keepalive_period VNET_IPFW(dyn_keepalive_period) +#define V_dyn_keepalive VNET_IPFW(dyn_keepalive) +#define V_static_count VNET_IPFW(static_count) +#define V_static_len VNET_IPFW(static_len) +#define V_dyn_count VNET_IPFW(dyn_count) +#define V_dyn_max VNET_IPFW(dyn_max) + +#define V_norule_counter VNET_IPFW(norule_counter) +#define V_ipfw_timeout VNET_IPFW(ipfw_timeout) + #endif /* _KERNEL */ #endif /* _IPFW2_H */ --- /u/marko/p4/head/src/sys/netinet/ip_divert.c 2007-10-29 17:17:43.000000000 +0100 +++ src/sys/netinet/ip_divert.c 2007-12-10 11:26:11.000000000 +0100 @@ -34,6 +34,7 @@ #include "opt_inet.h" #include "opt_ipfw.h" #include "opt_mac.h" +#include "opt_vimage.h" #ifndef INET #error "IPDIVERT requires INET." 
#endif @@ -61,6 +62,7 @@ #include +#include #include #include #include --- /u/marko/p4/head/src/sys/netinet/ip_dummynet.c 2008-02-27 18:29:08.000000000 +0100 +++ src/sys/netinet/ip_dummynet.c 2008-01-14 19:23:52.000000000 +0100 @@ -26,7 +26,7 @@ */ #include -__FBSDID("$FreeBSD: src/sys/netinet/ip_dummynet.c,v 1.115 2008/02/27 13:52:33 dwmalone Exp $"); +__FBSDID("$FreeBSD: src/sys/netinet/ip_dummynet.c,v 1.114 2007/12/25 09:36:51 oleg Exp $"); #define DUMMYNET_DEBUG @@ -98,9 +98,6 @@ static int pipe_expire = 1 ; /* expire queue if empty */ static int dn_max_ratio = 16 ; /* max queues/buckets ratio */ -static long pipe_slot_limit = 100; /* Foot shooting limit for pipe queues. */ -static long pipe_byte_limit = 1024 * 1024; - static int red_lookup_depth = 256; /* RED - default lookup table depth */ static int red_avg_pkt_size = 512; /* RED - default medium packet size */ static int red_max_pkt_size = 1500; /* RED - default max packet size */ @@ -201,10 +198,6 @@ SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_drop, CTLFLAG_RD, &io_pkt_drop, 0, "Number of packets dropped by dummynet."); -SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_slot_limit, - CTLFLAG_RW, &pipe_slot_limit, 0, "Upper limit in slots for pipe queue."); -SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_byte_limit, - CTLFLAG_RW, &pipe_byte_limit, 0, "Upper limit in bytes for pipe queue."); #endif #ifdef DUMMYNET_DEBUG @@ -1699,12 +1692,12 @@ x->plr = src->plr; x->flow_mask = src->flow_mask; if (x->flags_fs & DN_QSIZE_IS_BYTES) { - if (x->qsize > pipe_byte_limit) + if (x->qsize > 1024 * 1024) x->qsize = 1024 * 1024; } else { if (x->qsize == 0) x->qsize = 50; - if (x->qsize > pipe_slot_limit) + if (x->qsize > 100) x->qsize = 50; } /* Configuring RED. 
*/ --- /u/marko/p4/head/src/sys/netinet/ip_fastfwd.c 2007-10-16 13:53:37.000000000 +0200 +++ src/sys/netinet/ip_fastfwd.c 2007-10-22 18:06:41.000000000 +0200 @@ -78,6 +78,7 @@ #include "opt_ipfw.h" #include "opt_ipstealth.h" +#include "opt_vimage.h" #include #include @@ -87,7 +88,9 @@ #include #include #include +#include +#include #include #include #include @@ -95,6 +98,7 @@ #include #include +#include #include #include #include @@ -105,13 +109,16 @@ #include -static int ipfastforward_active = 0; -SYSCTL_INT(_net_inet_ip, OID_AUTO, fastforwarding, CTLFLAG_RW, - &ipfastforward_active, 0, "Enable fast IP forwarding"); +#ifndef VIMAGE +static int ipfastforward_active; +#endif +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, fastforwarding, + CTLFLAG_RW, ipfastforward_active, 0, "Enable fast IP forwarding"); static struct sockaddr_in * ip_findroute(struct route *ro, struct in_addr dest, struct mbuf *m) { + INIT_VNET_INET(curvnet); struct sockaddr_in *dst; struct rtentry *rt; @@ -135,8 +142,8 @@ if (rt->rt_flags & RTF_GATEWAY) dst = (struct sockaddr_in *)rt->rt_gateway; } else { - ipstat.ips_noroute++; - ipstat.ips_cantforward++; + V_ipstat.ips_noroute++; + V_ipstat.ips_cantforward++; if (rt) RTFREE(rt); icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0); @@ -155,6 +162,7 @@ struct mbuf * ip_fastforward(struct mbuf *m) { + INIT_VNET_INET(curvnet); struct ip *ip; struct mbuf *m0 = NULL; struct route ro; @@ -171,7 +179,7 @@ /* * Are we active and forwarding packets? */ - if (!ipfastforward_active || !ipforwarding) + if (!V_ipfastforward_active || !V_ipforwarding) return m; M_ASSERTVALID(m); @@ -187,7 +195,7 @@ * Is entire packet big enough? 
*/ if (m->m_pkthdr.len < sizeof(struct ip)) { - ipstat.ips_tooshort++; + V_ipstat.ips_tooshort++; goto drop; } @@ -196,7 +204,7 @@ */ if (m->m_len < sizeof (struct ip) && (m = m_pullup(m, sizeof (struct ip))) == NULL) { - ipstat.ips_toosmall++; + V_ipstat.ips_toosmall++; return NULL; /* mbuf already free'd */ } @@ -206,7 +214,7 @@ * Is it IPv4? */ if (ip->ip_v != IPVERSION) { - ipstat.ips_badvers++; + V_ipstat.ips_badvers++; goto drop; } @@ -215,12 +223,12 @@ */ hlen = ip->ip_hl << 2; if (hlen < sizeof(struct ip)) { /* minimum header length */ - ipstat.ips_badlen++; + V_ipstat.ips_badlen++; goto drop; } if (hlen > m->m_len) { if ((m = m_pullup(m, hlen)) == NULL) { - ipstat.ips_badhlen++; + V_ipstat.ips_badhlen++; return NULL; /* mbuf already free'd */ } ip = mtod(m, struct ip *); @@ -238,7 +246,7 @@ sum = in_cksum(m, hlen); } if (sum) { - ipstat.ips_badsum++; + V_ipstat.ips_badsum++; goto drop; } @@ -253,7 +261,7 @@ * Is IP length longer than packet we have got? */ if (m->m_pkthdr.len < ip_len) { - ipstat.ips_tooshort++; + V_ipstat.ips_tooshort++; goto drop; } @@ -273,7 +281,7 @@ */ if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { - ipstat.ips_badaddr++; + V_ipstat.ips_badaddr++; goto drop; } @@ -331,7 +339,7 @@ if (in_localip(ip->ip_dst)) return m; - ipstat.ips_total++; + V_ipstat.ips_total++; /* * Step 3: incoming packet firewall processing @@ -513,7 +521,7 @@ */ if ((ifp->if_snd.ifq_len + ip->ip_len / ifp->if_mtu + 1) >= ifp->if_snd.ifq_maxlen) { - ipstat.ips_odropped++; + V_ipstat.ips_odropped++; /* would send source quench here but that is depreciated */ goto drop; } @@ -552,7 +560,7 @@ * Handle EMSGSIZE with icmp reply needfrag for TCP MTU discovery */ if (ip->ip_off & IP_DF) { - ipstat.ips_cantfrag++; + V_ipstat.ips_cantfrag++; icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0, mtu); goto consumed; @@ -590,16 +598,16 @@ m_freem(m); } } else - 
ipstat.ips_fragmented++; + V_ipstat.ips_fragmented++; } } if (error != 0) - ipstat.ips_odropped++; + V_ipstat.ips_odropped++; else { ro.ro_rt->rt_rmx.rmx_pksent++; - ipstat.ips_forward++; - ipstat.ips_fastforward++; + V_ipstat.ips_forward++; + V_ipstat.ips_fastforward++; } consumed: RTFREE(ro.ro_rt); --- /u/marko/p4/head/src/sys/netinet/ip_fw2.c 2008-02-27 18:29:08.000000000 +0100 +++ src/sys/netinet/ip_fw2.c 2008-02-27 17:59:48.000000000 +0100 @@ -45,6 +45,7 @@ #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" +#include "opt_vimage.h" #include #include @@ -64,6 +65,9 @@ #include #include #include +#include + +#include #include #include #include @@ -71,6 +75,7 @@ #define IPFW_INTERNAL /* Access to protected data structures in ip_fw.h. */ +#include #include #include #include @@ -110,6 +115,11 @@ #include +static int vnet_ipfw_iattach(const void *); +static int vnet_ipfw_idetach(const void *); + +VNET_MOD_DECLARE(IPFW, ipfw, vnet_ipfw_iattach, vnet_ipfw_idetach, INET, NULL) + /* * set_disable contains one bit per set value (0..31). * If the bit is set, all rules with the corresponding set @@ -118,36 +128,18 @@ * and CANNOT be disabled. * Rules in set RESVD_SET can only be deleted explicitly. */ +#ifndef VIMAGE static u_int32_t set_disable; static int fw_verbose; static int verbose_limit; static struct callout ipfw_timeout; +#endif + static uma_zone_t ipfw_dyn_rule_zone; #define IPFW_DEFAULT_RULE 65535 -/* - * Data structure to cache our ucred related - * information. This structure only gets used if - * the user specified UID/GID based constraints in - * a firewall rule. 
- */ -struct ip_fw_ugid { - gid_t fw_groups[NGROUPS]; - int fw_ngroups; - uid_t fw_uid; - int fw_prid; -}; - -#define IPFW_TABLES_MAX 128 -struct ip_fw_chain { - struct ip_fw *rules; /* list of rules */ - struct ip_fw *reap; /* list of rules to reap */ - LIST_HEAD(, cfg_nat) nat; /* list of nat entries */ - struct radix_node_head *tables[IPFW_TABLES_MAX]; - struct rwlock rwmtx; -}; #define IPFW_LOCK_INIT(_chain) \ rw_init(&(_chain)->rwmtx, "IPFW static rules") #define IPFW_LOCK_DESTROY(_chain) rw_destroy(&(_chain)->rwmtx) @@ -161,40 +153,42 @@ /* * list of rules for layer 3 */ +#ifndef VIMAGE static struct ip_fw_chain layer3_chain; +#endif MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's"); MALLOC_DEFINE(M_IPFW_TBL, "ipfw_tbl", "IpFw tables"); -struct table_entry { - struct radix_node rn[2]; - struct sockaddr_in addr, mask; - u_int32_t value; -}; - -static int fw_debug = 1; -static int autoinc_step = 100; /* bounded to 1..1000 in add_rule() */ +#ifndef VIMAGE +static int fw_debug; +static int autoinc_step; +#endif +#ifdef VIMAGE +extern int ipfw_chg_hook(SYSCTL_HANDLER_V_ARGS); +#else extern int ipfw_chg_hook(SYSCTL_HANDLER_ARGS); +#endif #ifdef SYSCTL_NODE SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall"); -SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, enable, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &fw_enable, 0, +SYSCTL_V_PROC(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, enable, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, fw_enable, 0, ipfw_chg_hook, "I", "Enable ipfw"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step, CTLFLAG_RW, - &autoinc_step, 0, "Rule number autincrement step"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, one_pass, - CTLFLAG_RW | CTLFLAG_SECURE3, - &fw_one_pass, 0, +SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, autoinc_step, + CTLFLAG_RW, autoinc_step, 0, "Rule number autincrement step"); +SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, one_pass, + CTLFLAG_RW | CTLFLAG_SECURE3, fw_one_pass, 0, 
"Only do a single pass through ipfw when using dummynet(4)"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, debug, CTLFLAG_RW, - &fw_debug, 0, "Enable printing of debug ip_fw statements"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose, +SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, debug, CTLFLAG_RW, + fw_debug, 0, "Enable printing of debug ip_fw statements"); +SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, verbose, CTLFLAG_RW | CTLFLAG_SECURE3, - &fw_verbose, 0, "Log matches to ipfw rules"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_RW, - &verbose_limit, 0, "Set upper limit of matches of ipfw rules logged"); + fw_verbose, 0, "Log matches to ipfw rules"); +SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, verbose_limit, + CTLFLAG_RW, + verbose_limit, 0, "Set upper limit of matches of ipfw rules logged"); /* * Description of dynamic rules. @@ -232,9 +226,11 @@ * obey the 'randomized match', and we do not do multiple * passes through the firewall. XXX check the latter!!! */ +#ifndef VIMAGE static ipfw_dyn_rule **ipfw_dyn_v = NULL; -static u_int32_t dyn_buckets = 256; /* must be power of 2 */ -static u_int32_t curr_dyn_buckets = 256; /* must be power of 2 */ +static u_int32_t dyn_buckets; +static u_int32_t curr_dyn_buckets; +#endif static struct mtx ipfw_dyn_mtx; /* mutex guarding dynamic rules */ #define IPFW_DYN_LOCK_INIT() \ @@ -247,12 +243,14 @@ /* * Timeouts for various events in handing dynamic rules. 
*/ -static u_int32_t dyn_ack_lifetime = 300; -static u_int32_t dyn_syn_lifetime = 20; -static u_int32_t dyn_fin_lifetime = 1; -static u_int32_t dyn_rst_lifetime = 1; -static u_int32_t dyn_udp_lifetime = 10; -static u_int32_t dyn_short_lifetime = 5; +#ifndef VIMAGE +static u_int32_t dyn_ack_lifetime; +static u_int32_t dyn_syn_lifetime; +static u_int32_t dyn_fin_lifetime; +static u_int32_t dyn_rst_lifetime; +static u_int32_t dyn_udp_lifetime; +static u_int32_t dyn_short_lifetime; +#endif /* * Keepalives are sent if dyn_keepalive is set. They are sent every @@ -261,57 +259,68 @@ * dyn_rst_lifetime and dyn_fin_lifetime should be strictly lower * than dyn_keepalive_period. */ - -static u_int32_t dyn_keepalive_interval = 20; -static u_int32_t dyn_keepalive_period = 5; -static u_int32_t dyn_keepalive = 1; /* do send keepalives */ +#ifndef VIMAGE +static u_int32_t dyn_keepalive_interval; +static u_int32_t dyn_keepalive_period; +static u_int32_t dyn_keepalive; static u_int32_t static_count; /* # of static rules */ static u_int32_t static_len; /* size in bytes of static rules */ -static u_int32_t dyn_count; /* # of dynamic rules */ -static u_int32_t dyn_max = 4096; /* max # of dynamic rules */ +static u_int32_t dyn_count; /* # of dynamic rules */ +static u_int32_t dyn_max; /* max # of dynamic rules */ +#endif -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_buckets, CTLFLAG_RW, - &dyn_buckets, 0, "Number of dyn. buckets"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets, CTLFLAG_RD, - &curr_dyn_buckets, 0, "Current Number of dyn. buckets"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_count, CTLFLAG_RD, - &dyn_count, 0, "Number of dyn. rules"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_max, CTLFLAG_RW, - &dyn_max, 0, "Max number of dyn. rules"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count, CTLFLAG_RD, - &static_count, 0, "Number of static rules"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, CTLFLAG_RW, - &dyn_ack_lifetime, 0, "Lifetime of dyn. 
rules for acks"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, CTLFLAG_RW, - &dyn_syn_lifetime, 0, "Lifetime of dyn. rules for syn"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, CTLFLAG_RW, - &dyn_fin_lifetime, 0, "Lifetime of dyn. rules for fin"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, CTLFLAG_RW, - &dyn_rst_lifetime, 0, "Lifetime of dyn. rules for rst"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, CTLFLAG_RW, - &dyn_udp_lifetime, 0, "Lifetime of dyn. rules for UDP"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, CTLFLAG_RW, - &dyn_short_lifetime, 0, "Lifetime of dyn. rules for other situations"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, CTLFLAG_RW, - &dyn_keepalive, 0, "Enable keepalives for dyn. rules"); +SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, dyn_buckets, + CTLFLAG_RW, dyn_buckets, 0, "Number of dyn. buckets"); +SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, curr_dyn_buckets, + CTLFLAG_RD, curr_dyn_buckets, 0, "Current Number of dyn. buckets"); +SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, dyn_count, + CTLFLAG_RD, dyn_count, 0, "Number of dyn. rules"); +SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, dyn_max, + CTLFLAG_RW, dyn_max, 0, "Max number of dyn. rules"); +SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, static_count, + CTLFLAG_RD, static_count, 0, "Number of static rules"); +SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, + CTLFLAG_RW, dyn_ack_lifetime, 0, "Lifetime of dyn. rules for acks"); +SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, + CTLFLAG_RW, dyn_syn_lifetime, 0, "Lifetime of dyn. rules for syn"); +SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, + CTLFLAG_RW, dyn_fin_lifetime, 0, "Lifetime of dyn. 
rules for fin"); +SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, + CTLFLAG_RW, dyn_rst_lifetime, 0, "Lifetime of dyn. rules for rst"); +SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, + CTLFLAG_RW, dyn_udp_lifetime, 0, "Lifetime of dyn. rules for UDP"); +SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, + CTLFLAG_RW, dyn_short_lifetime, 0, + "Lifetime of dyn. rules for other situations"); +SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, dyn_keepalive, + CTLFLAG_RW, dyn_keepalive, 0, "Enable keepalives for dyn. rules"); + +#ifndef VIMAGE +static int fw_deny_unknown_exthdrs; +#endif #ifdef INET6 /* * IPv6 specific variables */ -SYSCTL_DECL(_net_inet6_ip6); -static struct sysctl_ctx_list ip6_fw_sysctl_ctx; -static struct sysctl_oid *ip6_fw_sysctl_tree; +SYSCTL_DECL(_net_inet6_ip6); +SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, fw, CTLFLAG_RW | CTLFLAG_SECURE, + 0, "Firewall"); +SYSCTL_V_PROC(V_NET, vnet_ipfw, _net_inet6_ip6_fw, OID_AUTO, enable, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, fw6_enable, 0, + ipfw_chg_hook, "I", "Enable ipfw+6"); +SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet6_ip6_fw, OID_AUTO, + deny_unknown_exthdrs, CTLFLAG_RW | CTLFLAG_SECURE, + fw_deny_unknown_exthdrs, 0, + "Deny packets with unknown IPv6 Extension Headers"); #endif /* INET6 */ #endif /* SYSCTL_NODE */ #ifdef IPFIREWALL_NAT MODULE_DEPEND(ipfw, libalias, 1, 1, 1); #endif -static int fw_deny_unknown_exthdrs = 1; - /* * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T @@ -581,12 +590,13 @@ static int search_ip6_addr_net (struct in6_addr * ip6_addr) { + INIT_VNET_NET(curvnet); struct ifnet *mdc; struct ifaddr *mdc2; struct in6_ifaddr *fdm; struct in6_addr copia; - TAILQ_FOREACH(mdc, &ifnet, if_link) + TAILQ_FOREACH(mdc, &V_ifnet, if_link) TAILQ_FOREACH(mdc2, &mdc->if_addrlist, ifa_list) { if (mdc2->ifa_addr->sa_family == AF_INET6) { fdm = (struct in6_ifaddr *)mdc2; @@ -647,6 +657,7 @@ 
return 1; } + static __inline int hash_packet6(struct ipfw_flow_id *id) { @@ -757,7 +768,9 @@ #endif /* INET6 */ +#ifndef VIMAGE static u_int64_t norule_counter; /* counter for ipfw_log(NULL...) */ +#endif #define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0 #define SNP(buf) buf, sizeof(buf) @@ -771,6 +784,7 @@ struct mbuf *m, struct ifnet *oif, u_short offset, uint32_t tablearg, struct ip *ip) { + INIT_VNET_IPFW(curvnet); struct ether_header *eh = args->eh; char *action; int limit_reached = 0; @@ -780,11 +794,11 @@ proto[0] = '\0'; if (f == NULL) { /* bogus pkt */ - if (verbose_limit != 0 && norule_counter >= verbose_limit) + if (V_verbose_limit != 0 && V_norule_counter >= V_verbose_limit) return; - norule_counter++; - if (norule_counter == verbose_limit) - limit_reached = verbose_limit; + V_norule_counter++; + if (V_norule_counter == V_verbose_limit) + limit_reached = V_verbose_limit; action = "Refuse"; } else { /* O_LOG is the first action, find the real one */ ipfw_insn *cmd = ACTION_PTR(f); @@ -1037,6 +1051,7 @@ static __inline int hash_packet(struct ipfw_flow_id *id) { + INIT_VNET_IPFW(curvnet); u_int32_t i; #ifdef INET6 @@ -1045,7 +1060,7 @@ else #endif /* INET6 */ i = (id->dst_ip) ^ (id->src_ip) ^ (id->dst_port) ^ (id->src_port); - i &= (curr_dyn_buckets - 1); + i &= (V_curr_dyn_buckets - 1); return i; } @@ -1063,12 +1078,12 @@ q->parent->count--; \ DEB(printf("ipfw: unlink entry 0x%08x %d -> 0x%08x %d, %d left\n",\ (q->id.src_ip), (q->id.src_port), \ - (q->id.dst_ip), (q->id.dst_port), dyn_count-1 ); ) \ + (q->id.dst_ip), (q->id.dst_port), V_dyn_count-1 ); ) \ if (prev != NULL) \ prev->next = q = q->next; \ else \ head = q = q->next; \ - dyn_count--; \ + V_dyn_count--; \ uma_zfree(ipfw_dyn_rule_zone, old_q); } #define TIME_LEQ(a,b) ((int)((a)-(b)) <= 0) @@ -1088,6 +1103,7 @@ static void remove_dyn_rule(struct ip_fw *rule, ipfw_dyn_rule *keep_me) { + INIT_VNET_IPFW(curvnet); static u_int32_t last_remove = 0; #define FORCE (keep_me 
== NULL) @@ -1097,7 +1113,7 @@ IPFW_DYN_LOCK_ASSERT(); - if (ipfw_dyn_v == NULL || dyn_count == 0) + if (V_ipfw_dyn_v == NULL || V_dyn_count == 0) return; /* do not expire more than once per second, it is useless */ if (!FORCE && last_remove == time_uptime) @@ -1110,8 +1126,8 @@ * them in a second pass. */ next_pass: - for (i = 0 ; i < curr_dyn_buckets ; i++) { - for (prev=NULL, q = ipfw_dyn_v[i] ; q ; ) { + for (i = 0 ; i < V_curr_dyn_buckets ; i++) { + for (prev=NULL, q = V_ipfw_dyn_v[i] ; q ; ) { /* * Logic can become complex here, so we split tests. */ @@ -1138,7 +1154,7 @@ goto next; } if (q->dyn_type != O_LIMIT_PARENT || !q->count) { - UNLINK_DYN_RULE(prev, ipfw_dyn_v[i], q); + UNLINK_DYN_RULE(prev, V_ipfw_dyn_v[i], q); continue; } next: @@ -1158,6 +1174,7 @@ lookup_dyn_rule_locked(struct ipfw_flow_id *pkt, int *match_direction, struct tcphdr *tcp) { + INIT_VNET_IPFW(curvnet); /* * stateful ipfw extensions. * Lookup into dynamic session queue @@ -1171,14 +1188,14 @@ IPFW_DYN_LOCK_ASSERT(); - if (ipfw_dyn_v == NULL) + if (V_ipfw_dyn_v == NULL) goto done; /* not found */ i = hash_packet( pkt ); - for (prev=NULL, q = ipfw_dyn_v[i] ; q != NULL ; ) { + for (prev=NULL, q = V_ipfw_dyn_v[i] ; q != NULL ; ) { if (q->dyn_type == O_LIMIT_PARENT && q->count) goto next; if (TIME_LEQ( q->expire, time_uptime)) { /* expire entry */ - UNLINK_DYN_RULE(prev, ipfw_dyn_v[i], q); + UNLINK_DYN_RULE(prev, V_ipfw_dyn_v[i], q); continue; } if (pkt->proto == q->id.proto && @@ -1228,8 +1245,8 @@ if ( prev != NULL) { /* found and not in front */ prev->next = q->next; - q->next = ipfw_dyn_v[i]; - ipfw_dyn_v[i] = q; + q->next = V_ipfw_dyn_v[i]; + V_ipfw_dyn_v[i] = q; } if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */ u_char flags = pkt->flags & (TH_FIN|TH_SYN|TH_RST); @@ -1239,7 +1256,7 @@ q->state |= (dir == MATCH_FORWARD ) ? 
flags : (flags << 8); switch (q->state) { case TH_SYN: /* opening */ - q->expire = time_uptime + dyn_syn_lifetime; + q->expire = time_uptime + V_dyn_syn_lifetime; break; case BOTH_SYN: /* move to established */ @@ -1262,13 +1279,13 @@ } } } - q->expire = time_uptime + dyn_ack_lifetime; + q->expire = time_uptime + V_dyn_ack_lifetime; break; case BOTH_SYN | BOTH_FIN: /* both sides closed */ - if (dyn_fin_lifetime >= dyn_keepalive_period) - dyn_fin_lifetime = dyn_keepalive_period - 1; - q->expire = time_uptime + dyn_fin_lifetime; + if (V_dyn_fin_lifetime >= V_dyn_keepalive_period) + V_dyn_fin_lifetime = V_dyn_keepalive_period - 1; + q->expire = time_uptime + V_dyn_fin_lifetime; break; default: @@ -1280,16 +1297,16 @@ if ( (q->state & ((TH_RST << 8)|TH_RST)) == 0) printf("invalid state: 0x%x\n", q->state); #endif - if (dyn_rst_lifetime >= dyn_keepalive_period) - dyn_rst_lifetime = dyn_keepalive_period - 1; - q->expire = time_uptime + dyn_rst_lifetime; + if (V_dyn_rst_lifetime >= V_dyn_keepalive_period) + V_dyn_rst_lifetime = V_dyn_keepalive_period - 1; + q->expire = time_uptime + V_dyn_rst_lifetime; break; } } else if (pkt->proto == IPPROTO_UDP) { - q->expire = time_uptime + dyn_udp_lifetime; + q->expire = time_uptime + V_dyn_udp_lifetime; } else { /* other protocols */ - q->expire = time_uptime + dyn_short_lifetime; + q->expire = time_uptime + V_dyn_short_lifetime; } done: if (match_direction) @@ -1314,6 +1331,7 @@ static void realloc_dynamic_table(void) { + INIT_VNET_IPFW(curvnet); IPFW_DYN_LOCK_ASSERT(); /* @@ -1322,21 +1340,21 @@ * default to 1024. 
*/ - if (dyn_buckets > 65536) - dyn_buckets = 1024; - if ((dyn_buckets & (dyn_buckets-1)) != 0) { /* not a power of 2 */ - dyn_buckets = curr_dyn_buckets; /* reset */ + if (V_dyn_buckets > 65536) + V_dyn_buckets = 1024; + if ((V_dyn_buckets & (V_dyn_buckets-1)) != 0) { /* not a power of 2 */ + V_dyn_buckets = V_curr_dyn_buckets; /* reset */ return; } - curr_dyn_buckets = dyn_buckets; - if (ipfw_dyn_v != NULL) - free(ipfw_dyn_v, M_IPFW); + V_curr_dyn_buckets = V_dyn_buckets; + if (V_ipfw_dyn_v != NULL) + free(V_ipfw_dyn_v, M_IPFW); for (;;) { - ipfw_dyn_v = malloc(curr_dyn_buckets * sizeof(ipfw_dyn_rule *), + V_ipfw_dyn_v = malloc(V_curr_dyn_buckets * sizeof(ipfw_dyn_rule *), M_IPFW, M_NOWAIT | M_ZERO); - if (ipfw_dyn_v != NULL || curr_dyn_buckets <= 2) + if (V_ipfw_dyn_v != NULL || V_curr_dyn_buckets <= 2) break; - curr_dyn_buckets /= 2; + V_curr_dyn_buckets /= 2; } } @@ -1353,15 +1371,16 @@ static ipfw_dyn_rule * add_dyn_rule(struct ipfw_flow_id *id, u_int8_t dyn_type, struct ip_fw *rule) { + INIT_VNET_IPFW(curvnet); ipfw_dyn_rule *r; int i; IPFW_DYN_LOCK_ASSERT(); - if (ipfw_dyn_v == NULL || - (dyn_count == 0 && dyn_buckets != curr_dyn_buckets)) { + if (V_ipfw_dyn_v == NULL || + (V_dyn_count == 0 && V_dyn_buckets != V_curr_dyn_buckets)) { realloc_dynamic_table(); - if (ipfw_dyn_v == NULL) + if (V_ipfw_dyn_v == NULL) return NULL; /* failed ! 
*/ } i = hash_packet(id); @@ -1383,21 +1402,21 @@ } r->id = *id; - r->expire = time_uptime + dyn_syn_lifetime; + r->expire = time_uptime + V_dyn_syn_lifetime; r->rule = rule; r->dyn_type = dyn_type; r->pcnt = r->bcnt = 0; r->count = 0; r->bucket = i; - r->next = ipfw_dyn_v[i]; - ipfw_dyn_v[i] = r; - dyn_count++; + r->next = V_ipfw_dyn_v[i]; + V_ipfw_dyn_v[i] = r; + V_dyn_count++; DEB(printf("ipfw: add dyn entry ty %d 0x%08x %d -> 0x%08x %d, total %d\n", dyn_type, (r->id.src_ip), (r->id.src_port), (r->id.dst_ip), (r->id.dst_port), - dyn_count ); ) + V_dyn_count ); ) return r; } @@ -1408,15 +1427,16 @@ static ipfw_dyn_rule * lookup_dyn_parent(struct ipfw_flow_id *pkt, struct ip_fw *rule) { + INIT_VNET_IPFW(curvnet); ipfw_dyn_rule *q; int i; IPFW_DYN_LOCK_ASSERT(); - if (ipfw_dyn_v) { + if (V_ipfw_dyn_v) { int is_v6 = IS_IP6_FLOW_ID(pkt); i = hash_packet( pkt ); - for (q = ipfw_dyn_v[i] ; q != NULL ; q=q->next) + for (q = V_ipfw_dyn_v[i] ; q != NULL ; q=q->next) if (q->dyn_type == O_LIMIT_PARENT && rule== q->rule && pkt->proto == q->id.proto && @@ -1433,7 +1453,7 @@ pkt->dst_ip == q->id.dst_ip) ) ) { - q->expire = time_uptime + dyn_short_lifetime; + q->expire = time_uptime + V_dyn_short_lifetime; DEB(printf("ipfw: lookup_dyn_parent found 0x%p\n",q);) return q; } @@ -1451,6 +1471,7 @@ install_state(struct ip_fw *rule, ipfw_insn_limit *cmd, struct ip_fw_args *args, uint32_t tablearg) { + INIT_VNET_IPFW(curvnet); static int last_log; ipfw_dyn_rule *q; struct in_addr da; @@ -1480,11 +1501,11 @@ return (0); } - if (dyn_count >= dyn_max) + if (V_dyn_count >= V_dyn_max) /* Run out of slots, try to remove any expired rule. */ remove_dyn_rule(NULL, (ipfw_dyn_rule *)1); - if (dyn_count >= dyn_max) { + if (V_dyn_count >= V_dyn_max) { if (last_log != time_uptime) { last_log = time_uptime; printf("ipfw: %s: Too many dynamic rules\n", __func__); @@ -1545,7 +1566,7 @@ /* See if we can remove some expired rule. 
*/ remove_dyn_rule(rule, parent); if (parent->count >= conn_limit) { - if (fw_verbose && last_log != time_uptime) { + if (V_fw_verbose && last_log != time_uptime) { last_log = time_uptime; #ifdef INET6 /* @@ -1611,6 +1632,7 @@ send_pkt(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t seq, u_int32_t ack, int flags) { + INIT_VNET_INET(curvnet); struct mbuf *m; struct ip *ip; struct tcphdr *tcp; @@ -1687,7 +1709,7 @@ /* * now fill fields left out earlier */ - ip->ip_ttl = ip_defttl; + ip->ip_ttl = V_ip_defttl; ip->ip_len = m->m_pkthdr.len; m->m_flags |= M_SKIP_FIREWALL; return (m); @@ -1777,6 +1799,7 @@ add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, uint8_t mlen, uint32_t value) { + INIT_VNET_IPFW(curvnet); struct radix_node_head *rnh; struct table_entry *ent; @@ -1790,14 +1813,14 @@ ent->addr.sin_len = ent->mask.sin_len = 8; ent->mask.sin_addr.s_addr = htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0); ent->addr.sin_addr.s_addr = addr & ent->mask.sin_addr.s_addr; - IPFW_WLOCK(&layer3_chain); + IPFW_WLOCK(&V_layer3_chain); if (rnh->rnh_addaddr(&ent->addr, &ent->mask, rnh, (void *)ent) == NULL) { - IPFW_WUNLOCK(&layer3_chain); + IPFW_WUNLOCK(&V_layer3_chain); free(ent, M_IPFW_TBL); return (EEXIST); } - IPFW_WUNLOCK(&layer3_chain); + IPFW_WUNLOCK(&V_layer3_chain); return (0); } @@ -1981,6 +2004,7 @@ u_int16_t src_port, struct ip_fw_ugid *ugp, int *lookup, struct inpcb *inp) { + INIT_VNET_INET(curvnet); struct inpcbinfo *pi; int wildcard; struct inpcb *pcb; @@ -2008,10 +2032,10 @@ return (0); if (proto == IPPROTO_TCP) { wildcard = 0; - pi = &tcbinfo; + pi = &V_tcbinfo; } else if (proto == IPPROTO_UDP) { wildcard = INPLOOKUP_WILDCARD; - pi = &udbinfo; + pi = &V_udbinfo; } else return 0; match = 0; @@ -2069,9 +2093,9 @@ struct cfg_nat *ptr; struct ifaddr *ifa; - IPFW_WLOCK(&layer3_chain); + IPFW_WLOCK(&V_layer3_chain); /* Check every nat entry... 
*/ - LIST_FOREACH(ptr, &layer3_chain.nat, _next) { + LIST_FOREACH(ptr, &V_layer3_chain.nat, _next) { /* ...using nic 'ifp->if_xname' as dynamic alias address. */ if (strncmp(ptr->if_name, ifp->if_xname, IF_NAMESIZE) == 0) { mtx_lock(&ifp->if_addr_mtx); @@ -2087,7 +2111,7 @@ mtx_unlock(&ifp->if_addr_mtx); } } - IPFW_WUNLOCK(&layer3_chain); + IPFW_WUNLOCK(&V_layer3_chain); } static void @@ -2095,8 +2119,8 @@ { struct ip_fw *rule; - IPFW_WLOCK_ASSERT(&layer3_chain); - for (rule = layer3_chain.rules; rule; rule = rule->next) { + IPFW_WLOCK_ASSERT(&V_layer3_chain); + for (rule = V_layer3_chain.rules; rule; rule = rule->next) { ipfw_insn_nat *cmd = (ipfw_insn_nat *)ACTION_PTR(rule); if (cmd->o.opcode != O_NAT) continue; @@ -2110,19 +2134,19 @@ { struct cfg_nat *ptr; - LIST_FOREACH(ptr, &layer3_chain.nat, _next) + LIST_FOREACH(ptr, &V_layer3_chain.nat, _next) if (ptr->id == i) return(ptr); return (NULL); } #define HOOK_NAT(b, p) do { \ - IPFW_WLOCK_ASSERT(&layer3_chain); \ + IPFW_WLOCK_ASSERT(&V_layer3_chain); \ LIST_INSERT_HEAD(b, p, _next); \ } while (0) #define UNHOOK_NAT(p) do { \ - IPFW_WLOCK_ASSERT(&layer3_chain); \ + IPFW_WLOCK_ASSERT(&V_layer3_chain); \ LIST_REMOVE(p, _next); \ } while (0) @@ -2276,6 +2300,9 @@ int ipfw_chk(struct ip_fw_args *args) { + INIT_VNET_INET(curvnet); + INIT_VNET_IPFW(curvnet); + /* * Local variables holding state during the processing of a packet: * @@ -2378,7 +2405,7 @@ */ int dyn_dir = MATCH_UNKNOWN; ipfw_dyn_rule *q = NULL; - struct ip_fw_chain *chain = &layer3_chain; + struct ip_fw_chain *chain = &V_layer3_chain; struct m_tag *mtag; /* @@ -2481,7 +2508,7 @@ printf("IPFW2: IPV6 - Unknown Routing " "Header type(%d)\n", ((struct ip6_rthdr *)ulp)->ip6r_type); - if (fw_deny_unknown_exthdrs) + if (V_fw_deny_unknown_exthdrs) return (IP_FW_DENY); break; } @@ -2505,7 +2532,7 @@ if (offset == 0) { printf("IPFW2: IPV6 - Invalid Fragment " "Header\n"); - if (fw_deny_unknown_exthdrs) + if (V_fw_deny_unknown_exthdrs) return (IP_FW_DENY); break; } 
@@ -2577,7 +2604,7 @@ default: printf("IPFW2: IPV6 - Unknown Extension " "Header(%d), ext_hd=%x\n", proto, ext_hd); - if (fw_deny_unknown_exthdrs) + if (V_fw_deny_unknown_exthdrs) return (IP_FW_DENY); PULLUP_TO(hlen, ulp, struct ip6_ext); break; @@ -2658,7 +2685,7 @@ * XXX should not happen here, but optimized out in * the caller. */ - if (fw_one_pass) { + if (V_fw_one_pass) { IPFW_RUNLOCK(chain); return (IP_FW_PASS); } @@ -2703,7 +2730,7 @@ int l, cmdlen, skip_or; /* skip rest of OR block */ again: - if (set_disable & (1 << f->set) ) + if (V_set_disable & (1 << f->set) ) continue; skip_or = 0; @@ -3089,7 +3116,7 @@ } case O_LOG: - if (fw_verbose) + if (V_fw_verbose) ipfw_log(f, hlen, args, m, oif, offset, tablearg, ip); match = 1; @@ -3689,7 +3716,7 @@ return (retval); pullup_failed: - if (fw_verbose) + if (V_fw_verbose) printf("ipfw: pullup failed\n"); return (IP_FW_DENY); } @@ -3717,6 +3744,7 @@ static int add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule) { + INIT_VNET_IPFW(curvnet); struct ip_fw *rule, *f, *prev; int l = RULESIZE(input_rule); @@ -3747,10 +3775,10 @@ * If rulenum is 0, find highest numbered rule before the * default rule, and add autoinc_step */ - if (autoinc_step < 1) - autoinc_step = 1; - else if (autoinc_step > 1000) - autoinc_step = 1000; + if (V_autoinc_step < 1) + V_autoinc_step = 1; + else if (V_autoinc_step > 1000) + V_autoinc_step = 1000; if (rule->rulenum == 0) { /* * locate the highest numbered rule before default @@ -3760,8 +3788,8 @@ break; rule->rulenum = f->rulenum; } - if (rule->rulenum < IPFW_DEFAULT_RULE - autoinc_step) - rule->rulenum += autoinc_step; + if (rule->rulenum < IPFW_DEFAULT_RULE - V_autoinc_step) + rule->rulenum += V_autoinc_step; input_rule->rulenum = rule->rulenum; } @@ -3782,11 +3810,11 @@ } flush_rule_ptrs(chain); done: - static_count++; - static_len += l; + V_static_count++; + V_static_len += l; IPFW_WUNLOCK(chain); DEB(printf("ipfw: installed rule %d, static count now %d\n", - rule->rulenum, 
static_count);) + rule->rulenum, V_static_count);) return (0); } @@ -3802,6 +3830,7 @@ remove_rule(struct ip_fw_chain *chain, struct ip_fw *rule, struct ip_fw *prev) { + INIT_VNET_IPFW(curvnet); struct ip_fw *n; int l = RULESIZE(rule); @@ -3815,8 +3844,8 @@ chain->rules = n; else prev->next = n; - static_count--; - static_len -= l; + V_static_count--; + V_static_len -= l; rule->next = chain->reap; chain->reap = rule; @@ -4016,6 +4045,7 @@ static int zero_entry(struct ip_fw_chain *chain, u_int32_t arg, int log_only) { + INIT_VNET_IPFW(curvnet); struct ip_fw *rule; char *msg; @@ -4030,7 +4060,7 @@ IPFW_WLOCK(chain); if (rulenum == 0) { - norule_counter = 0; + V_norule_counter = 0; for (rule = chain->rules; rule; rule = rule->next) { /* Skip rules from another set. */ if (cmd == 1 && rule->set != set) @@ -4064,7 +4094,7 @@ } IPFW_WUNLOCK(chain); - if (fw_verbose) + if (V_fw_verbose) log(LOG_SECURITY | LOG_NOTICE, msg, rulenum); return (0); } @@ -4365,6 +4395,7 @@ static size_t ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space) { + INIT_VNET_IPFW(curvnet); char *bp = buf; char *ep = bp + space; struct ip_fw *rule; @@ -4389,20 +4420,21 @@ * in a wild attempt to keep the ABI the same. * Why do we do this on EVERY rule? 
*/ - bcopy(&set_disable, &(((struct ip_fw *)bp)->next_rule), - sizeof(set_disable)); + bcopy(&V_set_disable, + &(((struct ip_fw *)bp)->next_rule), + sizeof(V_set_disable)); if (((struct ip_fw *)bp)->timestamp) ((struct ip_fw *)bp)->timestamp += boot_seconds; bp += i; } } IPFW_RUNLOCK(chain); - if (ipfw_dyn_v) { + if (V_ipfw_dyn_v) { ipfw_dyn_rule *p, *last = NULL; IPFW_DYN_LOCK(); - for (i = 0 ; i < curr_dyn_buckets; i++) - for (p = ipfw_dyn_v[i] ; p != NULL; p = p->next) { + for (i = 0 ; i < V_curr_dyn_buckets; i++) + for (p = V_ipfw_dyn_v[i] ; p != NULL; p = p->next) { if (bp + sizeof *p <= ep) { ipfw_dyn_rule *dst = (ipfw_dyn_rule *)bp; @@ -4445,6 +4477,7 @@ static int ipfw_ctl(struct sockopt *sopt) { + INIT_VNET_IPFW(curvnet); #define RULE_MAXSIZE (256*sizeof(u_int32_t)) int error; size_t size; @@ -4481,9 +4514,9 @@ * change between calculating the size and returning the * data in which case we'll just return what fits. */ - size = static_len; /* size of static rules */ - if (ipfw_dyn_v) /* add size of dyn.rules */ - size += (dyn_count * sizeof(ipfw_dyn_rule)); + size = V_static_len; /* size of static rules */ + if (V_ipfw_dyn_v) /* add size of dyn.rules */ + size += (V_dyn_count * sizeof(ipfw_dyn_rule)); /* * XXX todo: if the user passes a short length just to know @@ -4492,7 +4525,7 @@ */ buf = malloc(size, M_TEMP, M_WAITOK); error = sooptcopyout(sopt, buf, - ipfw_getrules(&layer3_chain, buf, size)); + ipfw_getrules(&V_layer3_chain, buf, size)); free(buf, M_TEMP); break; @@ -4510,12 +4543,12 @@ * the old list without the need for a lock. 
*/ - IPFW_WLOCK(&layer3_chain); - layer3_chain.reap = NULL; - free_chain(&layer3_chain, 0 /* keep default rule */); - rule = layer3_chain.reap; - layer3_chain.reap = NULL; - IPFW_WUNLOCK(&layer3_chain); + IPFW_WLOCK(&V_layer3_chain); + V_layer3_chain.reap = NULL; + free_chain(&V_layer3_chain, 0 /* keep default rule */); + rule = V_layer3_chain.reap; + V_layer3_chain.reap = NULL; + IPFW_WUNLOCK(&V_layer3_chain); if (rule != NULL) reap_rules(rule); break; @@ -4527,7 +4560,7 @@ if (error == 0) error = check_ipfw_struct(rule, sopt->sopt_valsize); if (error == 0) { - error = add_rule(&layer3_chain, rule); + error = add_rule(&V_layer3_chain, rule); size = RULESIZE(rule); if (!error && sopt->sopt_dir == SOPT_GET) error = sooptcopyout(sopt, rule, size); @@ -4554,10 +4587,10 @@ break; size = sopt->sopt_valsize; if (size == sizeof(u_int32_t)) /* delete or reassign */ - error = del_entry(&layer3_chain, rulenum[0]); + error = del_entry(&V_layer3_chain, rulenum[0]); else if (size == 2*sizeof(u_int32_t)) /* set enable/disable */ - set_disable = - (set_disable | rulenum[0]) & ~rulenum[1] & + V_set_disable = + (V_set_disable | rulenum[0]) & ~rulenum[1] & ~(1<sopt_name == IP_FW_RESETLOG); break; @@ -4584,7 +4617,7 @@ sizeof(ent), sizeof(ent)); if (error) break; - error = add_table_entry(&layer3_chain, ent.tbl, + error = add_table_entry(&V_layer3_chain, ent.tbl, ent.addr, ent.masklen, ent.value); } break; @@ -4597,7 +4630,7 @@ sizeof(ent), sizeof(ent)); if (error) break; - error = del_table_entry(&layer3_chain, ent.tbl, + error = del_table_entry(&V_layer3_chain, ent.tbl, ent.addr, ent.masklen); } break; @@ -4610,9 +4643,9 @@ sizeof(tbl), sizeof(tbl)); if (error) break; - IPFW_WLOCK(&layer3_chain); - error = flush_table(&layer3_chain, tbl); - IPFW_WUNLOCK(&layer3_chain); + IPFW_WLOCK(&V_layer3_chain); + error = flush_table(&V_layer3_chain, tbl); + IPFW_WUNLOCK(&V_layer3_chain); } break; @@ -4623,9 +4656,9 @@ if ((error = sooptcopyin(sopt, &tbl, sizeof(tbl), sizeof(tbl)))) break; - 
IPFW_RLOCK(&layer3_chain); - error = count_table(&layer3_chain, tbl, &cnt); - IPFW_RUNLOCK(&layer3_chain); + IPFW_RLOCK(&V_layer3_chain); + error = count_table(&V_layer3_chain, tbl, &cnt); + IPFW_RUNLOCK(&V_layer3_chain); if (error) break; error = sooptcopyout(sopt, &cnt, sizeof(cnt)); @@ -4649,9 +4682,9 @@ } tbl->size = (size - sizeof(*tbl)) / sizeof(ipfw_table_entry); - IPFW_RLOCK(&layer3_chain); - error = dump_table(&layer3_chain, tbl); - IPFW_RUNLOCK(&layer3_chain); + IPFW_RLOCK(&V_layer3_chain); + error = dump_table(&V_layer3_chain, tbl); + IPFW_RUNLOCK(&V_layer3_chain); if (error) { free(tbl, M_TEMP); break; @@ -4675,20 +4708,20 @@ /* * Find/create nat rule. */ - IPFW_WLOCK(&layer3_chain); + IPFW_WLOCK(&V_layer3_chain); ptr = lookup_nat(ser_n->id); if (ptr == NULL) { /* New rule: allocate and init new instance. */ ptr = malloc(sizeof(struct cfg_nat), M_IPFW, M_NOWAIT | M_ZERO); if (ptr == NULL) { - IPFW_WUNLOCK(&layer3_chain); + IPFW_WUNLOCK(&V_layer3_chain); free(buf, M_IPFW); return (ENOSPC); } ptr->lib = LibAliasInit(NULL); if (ptr->lib == NULL) { - IPFW_WUNLOCK(&layer3_chain); + IPFW_WUNLOCK(&V_layer3_chain); free(ptr, M_IPFW); free(buf, M_IPFW); return (EINVAL); @@ -4699,7 +4732,7 @@ UNHOOK_NAT(ptr); flush_nat_ptrs(ser_n->id); } - IPFW_WUNLOCK(&layer3_chain); + IPFW_WUNLOCK(&V_layer3_chain); /* * Basic nat configuration. @@ -4725,9 +4758,9 @@ /* Add new entries. 
*/ add_redir_spool_cfg(&buf[(sizeof(struct cfg_nat))], ptr); free(buf, M_IPFW); - IPFW_WLOCK(&layer3_chain); - HOOK_NAT(&layer3_chain.nat, ptr); - IPFW_WUNLOCK(&layer3_chain); + IPFW_WLOCK(&V_layer3_chain); + HOOK_NAT(&V_layer3_chain.nat, ptr); + IPFW_WUNLOCK(&V_layer3_chain); } break; @@ -4737,16 +4770,16 @@ int i; error = sooptcopyin(sopt, &i, sizeof i, sizeof i); - IPFW_WLOCK(&layer3_chain); + IPFW_WLOCK(&V_layer3_chain); ptr = lookup_nat(i); if (ptr == NULL) { error = EINVAL; - IPFW_WUNLOCK(&layer3_chain); + IPFW_WUNLOCK(&V_layer3_chain); break; } UNHOOK_NAT(ptr); flush_nat_ptrs(i); - IPFW_WUNLOCK(&layer3_chain); + IPFW_WUNLOCK(&V_layer3_chain); del_redir_spool_cfg(ptr, &ptr->redir_chain); LibAliasUninit(ptr->lib); free(ptr, M_IPFW); @@ -4765,9 +4798,9 @@ off = sizeof(nat_cnt); data = malloc(NAT_BUF_LEN, M_IPFW, M_WAITOK | M_ZERO); - IPFW_RLOCK(&layer3_chain); + IPFW_RLOCK(&V_layer3_chain); /* Serialize all the data. */ - LIST_FOREACH(n, &layer3_chain.nat, _next) { + LIST_FOREACH(n, &V_layer3_chain.nat, _next) { nat_cnt++; if (off + SOF_NAT < NAT_BUF_LEN) { bcopy(n, &data[off], SOF_NAT); @@ -4796,12 +4829,12 @@ goto nospace; } bcopy(&nat_cnt, data, sizeof(nat_cnt)); - IPFW_RUNLOCK(&layer3_chain); + IPFW_RUNLOCK(&V_layer3_chain); error = sooptcopyout(sopt, data, NAT_BUF_LEN); free(data, M_IPFW); break; nospace: - IPFW_RUNLOCK(&layer3_chain); + IPFW_RUNLOCK(&V_layer3_chain); printf("serialized data buffer not big enough:" "please increase NAT_BUF_LEN\n"); free(data, M_IPFW); @@ -4818,16 +4851,16 @@ sof = LIBALIAS_BUF_SIZE; cnt = 0; - IPFW_RLOCK(&layer3_chain); + IPFW_RLOCK(&V_layer3_chain); size = i = 0; - LIST_FOREACH(ptr, &layer3_chain.nat, _next) { + LIST_FOREACH(ptr, &V_layer3_chain.nat, _next) { if (ptr->lib->logDesc == NULL) continue; cnt++; size = cnt * (sof + sizeof(int)); data = realloc(data, size, M_IPFW, M_NOWAIT | M_ZERO); if (data == NULL) { - IPFW_RUNLOCK(&layer3_chain); + IPFW_RUNLOCK(&V_layer3_chain); return (ENOSPC); } bcopy(&ptr->id, &data[i], 
sizeof(int)); @@ -4835,7 +4868,7 @@ bcopy(ptr->lib->logDesc, &data[i], sof); i += sof; } - IPFW_RUNLOCK(&layer3_chain); + IPFW_RUNLOCK(&V_layer3_chain); error = sooptcopyout(sopt, data, size); free(data, M_IPFW); } @@ -4865,13 +4898,16 @@ * every dyn_keepalive_period */ static void -ipfw_tick(void * __unused unused) +ipfw_tick(void *arg) { +#ifdef VIMAGE + struct vnet_ipfw *vnet_ipfw = arg; +#endif struct mbuf *m0, *m, *mnext, **mtailp; int i; ipfw_dyn_rule *q; - if (dyn_keepalive == 0 || ipfw_dyn_v == NULL || dyn_count == 0) + if (V_dyn_keepalive == 0 || V_ipfw_dyn_v == NULL || V_dyn_count == 0) goto done; /* @@ -4883,15 +4919,15 @@ m0 = NULL; mtailp = &m0; IPFW_DYN_LOCK(); - for (i = 0 ; i < curr_dyn_buckets ; i++) { - for (q = ipfw_dyn_v[i] ; q ; q = q->next ) { + for (i = 0 ; i < V_curr_dyn_buckets ; i++) { + for (q = V_ipfw_dyn_v[i] ; q ; q = q->next ) { if (q->dyn_type == O_LIMIT_PARENT) continue; if (q->id.proto != IPPROTO_TCP) continue; if ( (q->state & BOTH_SYN) != BOTH_SYN) continue; - if (TIME_LEQ( time_uptime+dyn_keepalive_interval, + if (TIME_LEQ( time_uptime + V_dyn_keepalive_interval, q->expire)) continue; /* too early */ if (TIME_LEQ(q->expire, time_uptime)) @@ -4914,37 +4950,40 @@ ip_output(m, NULL, NULL, 0, NULL, NULL); } done: - callout_reset(&ipfw_timeout, dyn_keepalive_period*hz, ipfw_tick, NULL); + callout_reset(&V_ipfw_timeout, V_dyn_keepalive_period * hz, + ipfw_tick, arg); } -int -ipfw_init(void) +static int vnet_ipfw_iattach(const void *unused) { + INIT_VNET_IPFW(curvnet); struct ip_fw default_rule; int error; -#ifdef INET6 - /* Setup IPv6 fw sysctl tree. 
*/ - sysctl_ctx_init(&ip6_fw_sysctl_ctx); - ip6_fw_sysctl_tree = SYSCTL_ADD_NODE(&ip6_fw_sysctl_ctx, - SYSCTL_STATIC_CHILDREN(_net_inet6_ip6), OID_AUTO, "fw", - CTLFLAG_RW | CTLFLAG_SECURE, 0, "Firewall"); - SYSCTL_ADD_PROC(&ip6_fw_sysctl_ctx, SYSCTL_CHILDREN(ip6_fw_sysctl_tree), - OID_AUTO, "enable", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, - &fw6_enable, 0, ipfw_chg_hook, "I", "Enable ipfw+6"); - SYSCTL_ADD_INT(&ip6_fw_sysctl_ctx, SYSCTL_CHILDREN(ip6_fw_sysctl_tree), - OID_AUTO, "deny_unknown_exthdrs", CTLFLAG_RW | CTLFLAG_SECURE, - &fw_deny_unknown_exthdrs, 0, - "Deny packets with unknown IPv6 Extension Headers"); -#endif + V_fw_debug = 1; + V_autoinc_step = 100; /* bounded to 1..1000 in add_rule() */ + V_dyn_buckets = 256; /* must be power of 2 */ + V_curr_dyn_buckets = 256; /* must be power of 2 */ + V_dyn_ack_lifetime = 300; + V_dyn_syn_lifetime = 20; + V_dyn_fin_lifetime = 1; + V_dyn_rst_lifetime = 1; + V_dyn_udp_lifetime = 10; + V_dyn_short_lifetime = 5; + V_dyn_keepalive_interval = 20; + V_dyn_keepalive_period = 5; + V_dyn_keepalive = 1; /* do send keepalives */ + V_dyn_max = 4096; /* max # of dynamic rules */ + V_fw_deny_unknown_exthdrs = 1; - layer3_chain.rules = NULL; - IPFW_LOCK_INIT(&layer3_chain); + V_layer3_chain.rules = NULL; + IPFW_LOCK_INIT(&V_layer3_chain); +#if 0 /* XXX Marko fix this! 
*/ ipfw_dyn_rule_zone = uma_zcreate("IPFW dynamic rule", sizeof(ipfw_dyn_rule), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); - IPFW_DYN_LOCK_INIT(); - callout_init(&ipfw_timeout, CALLOUT_MPSAFE); +#endif + callout_init(&V_ipfw_timeout, CALLOUT_MPSAFE); bzero(&default_rule, sizeof default_rule); @@ -4960,17 +4999,66 @@ #endif O_DENY; - error = add_rule(&layer3_chain, &default_rule); + error = add_rule(&V_layer3_chain, &default_rule); if (error != 0) { printf("ipfw2: error %u initializing default rule " "(support disabled)\n", error); IPFW_DYN_LOCK_DESTROY(); - IPFW_LOCK_DESTROY(&layer3_chain); + IPFW_LOCK_DESTROY(&V_layer3_chain); uma_zdestroy(ipfw_dyn_rule_zone); return (error); } - ip_fw_default_rule = layer3_chain.rules; + ip_fw_default_rule = V_layer3_chain.rules; + +#ifdef IPFIREWALL_VERBOSE + V_fw_verbose = 1; +#endif +#ifdef IPFIREWALL_VERBOSE_LIMIT + V_verbose_limit = IPFIREWALL_VERBOSE_LIMIT; +#endif + + error = init_tables(&V_layer3_chain); + if (error) { + panic("init_tables"); /* XXX Marko fix this ! */ + } +#ifdef VIMAGE + callout_reset(&V_ipfw_timeout, hz, ipfw_tick, (void *) vnet_ipfw); +#else + callout_reset(&V_ipfw_timeout, hz, ipfw_tick, NULL); +#endif + +#ifdef IPFIREWALL_NAT + LIST_INIT(&V_layer3_chain.nat); +#endif + + return 0; +} + +int +ipfw_init(void) +{ + ipfw_dyn_rule_zone = uma_zcreate("IPFW dynamic rule", + sizeof(ipfw_dyn_rule), NULL, NULL, NULL, NULL, + UMA_ALIGN_PTR, 0); + IPFW_DYN_LOCK_INIT(); + +#if 0 /* MARKO XXX */ + /* error = init_tables(&V_layer3_chain); moved to _iattach() */ + if (error) { + IPFW_DYN_LOCK_DESTROY(); + IPFW_LOCK_DESTROY(&V_layer3_chain); + uma_zdestroy(ipfw_dyn_rule_zone); + return (error); + } +#endif + +#ifdef VIMAGE + vnet_mod_register(&vnet_ipfw_modinfo); +#else + vnet_ipfw_iattach(NULL); +#endif + printf("ipfw2 " #ifdef INET6 "(+ipv6) " @@ -4988,78 +5076,82 @@ #else "loadable", #endif - default_rule.cmd[0].opcode == O_ACCEPT ? 
"accept" : "deny"); +#ifdef IPFIREWALL_DEFAULT_TO_ACCEPT + "accept" +#else + "deny" +#endif + ); #ifdef IPFIREWALL_VERBOSE - fw_verbose = 1; -#endif -#ifdef IPFIREWALL_VERBOSE_LIMIT - verbose_limit = IPFIREWALL_VERBOSE_LIMIT; -#endif - if (fw_verbose == 0) printf("disabled\n"); - else if (verbose_limit == 0) +#else +# ifndef IPFIREWALL_VERBOSE_LIMIT printf("unlimited\n"); - else +# else printf("limited to %d packets/entry by default\n", - verbose_limit); + IPFIREWALL_VERBOSE_LIMIT); +# endif +#endif - error = init_tables(&layer3_chain); - if (error) { - IPFW_DYN_LOCK_DESTROY(); - IPFW_LOCK_DESTROY(&layer3_chain); - uma_zdestroy(ipfw_dyn_rule_zone); - return (error); - } ip_fw_ctl_ptr = ipfw_ctl; ip_fw_chk_ptr = ipfw_chk; - callout_reset(&ipfw_timeout, hz, ipfw_tick, NULL); #ifdef IPFIREWALL_NAT - LIST_INIT(&layer3_chain.nat); ifaddr_event_tag = EVENTHANDLER_REGISTER(ifaddr_event, ifaddr_change, NULL, EVENTHANDLER_PRI_ANY); #endif return (0); } -void -ipfw_destroy(void) +static int vnet_ipfw_idetach(const void *unused) { + INIT_VNET_IPFW(curvnet); struct ip_fw *reap; #ifdef IPFIREWALL_NAT struct cfg_nat *ptr, *ptr_temp; #endif - ip_fw_chk_ptr = NULL; - ip_fw_ctl_ptr = NULL; - callout_drain(&ipfw_timeout); - IPFW_WLOCK(&layer3_chain); - flush_tables(&layer3_chain); + callout_drain(&V_ipfw_timeout); + IPFW_WLOCK(&V_layer3_chain); + flush_tables(&V_layer3_chain); #ifdef IPFIREWALL_NAT - LIST_FOREACH_SAFE(ptr, &layer3_chain.nat, _next, ptr_temp) { + LIST_FOREACH_SAFE(ptr, &V_layer3_chain.nat, _next, ptr_temp) { LIST_REMOVE(ptr, _next); del_redir_spool_cfg(ptr, &ptr->redir_chain); LibAliasUninit(ptr->lib); free(ptr, M_IPFW); } - EVENTHANDLER_DEREGISTER(ifaddr_event, ifaddr_event_tag); #endif - layer3_chain.reap = NULL; - free_chain(&layer3_chain, 1 /* kill default rule */); - reap = layer3_chain.reap, layer3_chain.reap = NULL; - IPFW_WUNLOCK(&layer3_chain); + V_layer3_chain.reap = NULL; + free_chain(&V_layer3_chain, 1 /* kill default rule */); + reap = 
V_layer3_chain.reap, V_layer3_chain.reap = NULL; + IPFW_WUNLOCK(&V_layer3_chain); if (reap != NULL) reap_rules(reap); - IPFW_DYN_LOCK_DESTROY(); - uma_zdestroy(ipfw_dyn_rule_zone); - if (ipfw_dyn_v != NULL) - free(ipfw_dyn_v, M_IPFW); - IPFW_LOCK_DESTROY(&layer3_chain); + IPFW_LOCK_DESTROY(&V_layer3_chain); + if (V_ipfw_dyn_v != NULL) + free(V_ipfw_dyn_v, M_IPFW); -#ifdef INET6 - /* Free IPv6 fw sysctl tree. */ - sysctl_ctx_free(&ip6_fw_sysctl_ctx); + return 0; +} + +void +ipfw_destroy(void) +{ + ip_fw_chk_ptr = NULL; + ip_fw_ctl_ptr = NULL; + +#ifdef VIMAGE + vnet_mod_deregister(&vnet_ipfw_modinfo); +#else + vnet_ipfw_idetach(NULL); +#endif + +#ifdef IPFIREWALL_NAT + EVENTHANDLER_DEREGISTER(ifaddr_event, ifaddr_event_tag); #endif + IPFW_DYN_LOCK_DESTROY(); + uma_zdestroy(ipfw_dyn_rule_zone); printf("IP firewall unloaded\n"); } --- /u/marko/p4/head/src/sys/netinet/ip_fw_pfil.c 2007-11-13 02:49:10.000000000 +0100 +++ src/sys/netinet/ip_fw_pfil.c 2007-12-10 11:26:11.000000000 +0100 @@ -36,6 +36,7 @@ #endif /* INET */ #endif /* KLD_MODULE */ #include "opt_inet6.h" +#include "opt_vimage.h" #include #include @@ -46,7 +47,9 @@ #include #include #include +#include #include +#include #include #include @@ -65,12 +68,18 @@ #include +#ifndef VIMAGE int fw_enable = 1; -#ifdef INET6 +# ifdef INET6 int fw6_enable = 1; +# endif #endif +#ifdef VIMAGE +int ipfw_chg_hook(SYSCTL_HANDLER_V_ARGS); +#else int ipfw_chg_hook(SYSCTL_HANDLER_ARGS); +#endif /* Dummynet hooks. 
*/ ip_dn_ruledel_t *ip_dn_ruledel_ptr = NULL; @@ -484,8 +493,16 @@ #endif /* INET6 */ int +#ifdef VIMAGE +ipfw_chg_hook(SYSCTL_HANDLER_V_ARGS) +#else ipfw_chg_hook(SYSCTL_HANDLER_ARGS) +#endif { +#ifdef VIMAGE + INIT_VNET_IPFW(curvnet); + SYSCTL_RESOLVE_V_ARG1(); +#endif int enable = *(int *)arg1; int error; @@ -498,14 +515,14 @@ if (enable == *(int *)arg1) return (0); - if (arg1 == &fw_enable) { + if (arg1 == &V_fw_enable) { if (enable) error = ipfw_hook(); else error = ipfw_unhook(); } #ifdef INET6 - if (arg1 == &fw6_enable) { + if (arg1 == &V_fw6_enable) { if (enable) error = ipfw6_hook(); else --- /u/marko/p4/head/src/sys/netinet/ip_icmp.c 2007-10-29 17:17:43.000000000 +0100 +++ src/sys/netinet/ip_icmp.c 2007-12-10 11:26:11.000000000 +0100 @@ -34,6 +34,7 @@ #include "opt_ipsec.h" #include "opt_mac.h" +#include "opt_vimage.h" #include #include @@ -43,11 +44,14 @@ #include #include #include +#include +#include #include #include #include +#include #include #include #include @@ -76,9 +80,11 @@ * host table maintenance routines. */ +#ifndef VIMAGE struct icmpstat icmpstat; -SYSCTL_STRUCT(_net_inet_icmp, ICMPCTL_STATS, stats, CTLFLAG_RW, - &icmpstat, icmpstat, ""); +#endif +SYSCTL_V_STRUCT(V_NET, vnet_inet, _net_inet_icmp, ICMPCTL_STATS, stats, + CTLFLAG_RW, icmpstat, icmpstat, ""); static int icmpmaskrepl = 0; SYSCTL_INT(_net_inet_icmp, ICMPCTL_MASKREPL, maskrepl, CTLFLAG_RW, @@ -143,6 +149,7 @@ void icmp_error(struct mbuf *n, int type, int code, n_long dest, int mtu) { + INIT_VNET_INET(curvnet); register struct ip *oip = mtod(n, struct ip *), *nip; register unsigned oiphlen = oip->ip_hl << 2; register struct icmp *icp; @@ -155,7 +162,7 @@ printf("icmp_error(%p, %x, %d)\n", oip, type, code); #endif if (type != ICMP_REDIRECT) - icmpstat.icps_error++; + V_icmpstat.icps_error++; /* * Don't send error: * if the original packet was encrypted. 
@@ -172,7 +179,7 @@ if (oip->ip_p == IPPROTO_ICMP && type != ICMP_REDIRECT && n->m_len >= oiphlen + ICMP_MINLEN && !ICMP_INFOTYPE(((struct icmp *)((caddr_t)oip + oiphlen))->icmp_type)) { - icmpstat.icps_oldicmp++; + V_icmpstat.icps_oldicmp++; goto freeit; } /* Drop if IP header plus 8 bytes is not contignous in first mbuf. */ @@ -228,7 +235,7 @@ m->m_len = ICMP_MINLEN + icmplen; icp = mtod(m, struct icmp *); - icmpstat.icps_outhist[type]++; + V_icmpstat.icps_outhist[type]++; icp->icmp_type = type; if (type == ICMP_REDIRECT) icp->icmp_gwaddr.s_addr = dest; @@ -287,6 +294,7 @@ void icmp_input(struct mbuf *m, int off) { + INIT_VNET_INET(curvnet); struct icmp *icp; struct in_ifaddr *ia; struct ip *ip = mtod(m, struct ip *); @@ -309,12 +317,12 @@ } #endif if (icmplen < ICMP_MINLEN) { - icmpstat.icps_tooshort++; + V_icmpstat.icps_tooshort++; goto freeit; } i = hlen + min(icmplen, ICMP_ADVLENMIN); if (m->m_len < i && (m = m_pullup(m, i)) == 0) { - icmpstat.icps_tooshort++; + V_icmpstat.icps_tooshort++; return; } ip = mtod(m, struct ip *); @@ -322,7 +330,7 @@ m->m_data += hlen; icp = mtod(m, struct icmp *); if (in_cksum(m, icmplen)) { - icmpstat.icps_checksum++; + V_icmpstat.icps_checksum++; goto freeit; } m->m_len += hlen; @@ -364,7 +372,7 @@ icmpgw.sin_len = sizeof(struct sockaddr_in); icmpgw.sin_family = AF_INET; - icmpstat.icps_inhist[icp->icmp_type]++; + V_icmpstat.icps_inhist[icp->icmp_type]++; code = icp->icmp_code; switch (icp->icmp_type) { @@ -429,7 +437,7 @@ */ if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) || icp->icmp_ip.ip_hl < (sizeof(struct ip) >> 2)) { - icmpstat.icps_badlen++; + V_icmpstat.icps_badlen++; goto freeit; } icp->icmp_ip.ip_len = ntohs(icp->icmp_ip.ip_len); @@ -452,13 +460,13 @@ break; badcode: - icmpstat.icps_badcode++; + V_icmpstat.icps_badcode++; break; case ICMP_ECHO: if (!icmpbmcastecho && (m->m_flags & (M_MCAST | M_BCAST)) != 0) { - icmpstat.icps_bmcastecho++; + V_icmpstat.icps_bmcastecho++; break; } icp->icmp_type = 
ICMP_ECHOREPLY; @@ -470,11 +478,11 @@ case ICMP_TSTAMP: if (!icmpbmcastecho && (m->m_flags & (M_MCAST | M_BCAST)) != 0) { - icmpstat.icps_bmcasttstamp++; + V_icmpstat.icps_bmcasttstamp++; break; } if (icmplen < ICMP_TSLEN) { - icmpstat.icps_badlen++; + V_icmpstat.icps_badlen++; break; } icp->icmp_type = ICMP_TSTAMPREPLY; @@ -523,8 +531,8 @@ } reflect: ip->ip_len += hlen; /* since ip_input deducts this */ - icmpstat.icps_reflect++; - icmpstat.icps_outhist[icp->icmp_type]++; + V_icmpstat.icps_reflect++; + V_icmpstat.icps_outhist[icp->icmp_type]++; icmp_reflect(m); return; @@ -548,13 +556,13 @@ * RFC1812 says we must ignore ICMP redirects if we * are acting as router. */ - if (drop_redirect || ipforwarding) + if (drop_redirect || V_ipforwarding) break; if (code > 3) goto badcode; if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) || icp->icmp_ip.ip_hl < (sizeof(struct ip) >> 2)) { - icmpstat.icps_badlen++; + V_icmpstat.icps_badlen++; break; } /* @@ -614,6 +622,7 @@ static void icmp_reflect(struct mbuf *m) { + INIT_VNET_INET(curvnet); struct ip *ip = mtod(m, struct ip *); struct ifaddr *ifa; struct ifnet *ifn; @@ -626,7 +635,7 @@ ((ntohl(ip->ip_src.s_addr) & IN_CLASSA_NET) != (IN_LOOPBACKNET << IN_CLASSA_NSHIFT))) { m_freem(m); /* Bad return address */ - icmpstat.icps_badaddr++; + V_icmpstat.icps_badaddr++; goto done; /* Ip_output() will check for broadcast */ } t = ip->ip_dst; @@ -694,7 +703,7 @@ ia = ip_rtaddr(ip->ip_dst); if (ia == NULL) { m_freem(m); - icmpstat.icps_noroute++; + V_icmpstat.icps_noroute++; goto done; } match: @@ -703,7 +712,7 @@ #endif t = IA_SIN(ia)->sin_addr; ip->ip_src = t; - ip->ip_ttl = ip_defttl; + ip->ip_ttl = V_ip_defttl; if (optlen > 0) { register u_char *cp; --- /u/marko/p4/head/src/sys/netinet/ip_input.c 2007-12-03 11:00:09.000000000 +0100 +++ src/sys/netinet/ip_input.c 2007-12-10 11:26:11.000000000 +0100 @@ -38,6 +38,7 @@ #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_carp.h" +#include "opt_vimage.h" #include #include 
@@ -51,7 +52,9 @@ #include #include #include +#include +#include #include #include #include @@ -60,6 +63,7 @@ #include #include +#include #include #include #include @@ -84,34 +88,38 @@ #include -int rsvp_on = 0; +#ifndef VIMAGE +int rsvp_on; +int ipforwarding; +static int ipsendredirects; +int ip_defttl; +static int ip_keepfaith; +static int ip_sendsourcequench; +int ip_do_randomid; +static int ip_checkinterface; +#endif -int ipforwarding = 0; -SYSCTL_INT(_net_inet_ip, IPCTL_FORWARDING, forwarding, CTLFLAG_RW, - &ipforwarding, 0, "Enable IP forwarding between interfaces"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_FORWARDING, + forwarding, CTLFLAG_RW, ipforwarding, 0, + "Enable IP forwarding between interfaces"); -static int ipsendredirects = 1; /* XXX */ -SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_RW, - &ipsendredirects, 0, "Enable sending IP redirects"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_SENDREDIRECTS, + redirect, CTLFLAG_RW, ipsendredirects, 0, + "Enable sending IP redirects"); -int ip_defttl = IPDEFTTL; -SYSCTL_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_RW, - &ip_defttl, 0, "Maximum TTL on IP packets"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_DEFTTL, + ttl, CTLFLAG_RW, ip_defttl, 0, "Maximum TTL on IP packets"); -static int ip_keepfaith = 0; -SYSCTL_INT(_net_inet_ip, IPCTL_KEEPFAITH, keepfaith, CTLFLAG_RW, - &ip_keepfaith, 0, +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_KEEPFAITH, + keepfaith, CTLFLAG_RW, ip_keepfaith, 0, "Enable packet capture for FAITH IPv4->IPv6 translater daemon"); -static int ip_sendsourcequench = 0; -SYSCTL_INT(_net_inet_ip, OID_AUTO, sendsourcequench, CTLFLAG_RW, - &ip_sendsourcequench, 0, +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, + sendsourcequench, CTLFLAG_RW, ip_sendsourcequench, 0, "Enable the transmission of source quench packets"); -int ip_do_randomid = 0; -SYSCTL_INT(_net_inet_ip, OID_AUTO, random_id, CTLFLAG_RW, - &ip_do_randomid, 0, - "Assign 
random ip_id values"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, random_id, + CTLFLAG_RW, ip_do_randomid, 0, "Assign random ip_id values"); /* * XXX - Setting ip_checkinterface mostly implements the receive side of @@ -126,9 +134,9 @@ * to the loopback interface instead of the interface where the * packets for those addresses are received. */ -static int ip_checkinterface = 0; -SYSCTL_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_RW, - &ip_checkinterface, 0, "Verify packet arrives on correct interface"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, + check_interface, CTLFLAG_RW, ip_checkinterface, 0, + "Verify packet arrives on correct interface"); struct pfil_head inet_pfil_hook; /* Packet filter hooks */ @@ -138,9 +146,11 @@ extern struct domain inetdomain; extern struct protosw inetsw[]; u_char ip_protox[IPPROTO_MAX]; +#ifndef VIMAGE struct in_ifaddrhead in_ifaddrhead; /* first inet address */ struct in_ifaddrhashhead *in_ifaddrhashtbl; /* inet addr hash table */ u_long in_ifaddrhmask; /* mask for hash table */ +#endif SYSCTL_INT(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_queue_maxlen, CTLFLAG_RW, &ipintrq.ifq_maxlen, 0, "Maximum size of the IP input queue"); @@ -148,22 +158,20 @@ &ipintrq.ifq_drops, 0, "Number of packets dropped from the IP input queue"); +#ifndef VIMAGE struct ipstat ipstat; -SYSCTL_STRUCT(_net_inet_ip, IPCTL_STATS, stats, CTLFLAG_RW, - &ipstat, ipstat, "IP statistics (struct ipstat, netinet/ip_var.h)"); - -/* - * IP datagram reassembly. 
- */ -#define IPREASS_NHASH_LOG2 6 -#define IPREASS_NHASH (1 << IPREASS_NHASH_LOG2) -#define IPREASS_HMASK (IPREASS_NHASH - 1) -#define IPREASS_HASH(x,y) \ - (((((x) & 0xF) | ((((x) >> 8) & 0xF) << 4)) ^ (y)) & IPREASS_HMASK) +#endif +SYSCTL_V_STRUCT(V_NET, vnet_inet, _net_inet_ip, IPCTL_STATS, stats, CTLFLAG_RW, + ipstat, ipstat, "IP statistics (struct ipstat, netinet/ip_var.h)"); -static uma_zone_t ipq_zone; -static TAILQ_HEAD(ipqhead, ipq) ipq[IPREASS_NHASH]; static struct mtx ipqlock; +#ifndef VIMAGE +static TAILQ_HEAD(ipqhead, ipq) ipq[IPREASS_NHASH]; +static uma_zone_t ipq_zone; +static int nipq; +static int maxnipq; +static int maxfragsperpacket; +#endif #define IPQ_LOCK() mtx_lock(&ipqlock) #define IPQ_UNLOCK() mtx_unlock(&ipqlock) @@ -173,14 +181,12 @@ static void maxnipq_update(void); static void ipq_zone_change(void *); -static int maxnipq; /* Administrative limit on # reass queues. */ -static int nipq = 0; /* Total # of reass queues */ -SYSCTL_INT(_net_inet_ip, OID_AUTO, fragpackets, CTLFLAG_RD, - &nipq, 0, "Current number of IPv4 fragment reassembly queue entries"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, fragpackets, + CTLFLAG_RD, nipq, 0, + "Current number of IPv4 fragment reassembly queue entries"); -static int maxfragsperpacket; -SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_RW, - &maxfragsperpacket, 0, +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, maxfragsperpacket, + CTLFLAG_RW, maxfragsperpacket, 0, "Maximum number of IPv4 fragments allowed per packet"); struct callout ipport_tick_callout; @@ -191,9 +197,11 @@ #endif #ifdef IPSTEALTH -int ipstealth = 0; -SYSCTL_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW, - &ipstealth, 0, "IP stealth mode, no TTL decrementation on forwarding"); +#ifndef VIMAGE +int ipstealth; +#endif +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW, + ipstealth, 0, "IP stealth mode, no TTL decrementation on forwarding"); #endif /* @@ -206,6 +214,19 @@ 
static void ip_freef(struct ipqhead *, struct ipq *); +#ifdef VIMAGE +static void vnet_inet_register(void); + +VNET_MOD_DECLARE(INET, inet, NULL, NULL, NET, NULL) + +static void vnet_inet_register() +{ + vnet_mod_register(&vnet_inet_modinfo); +} + +SYSINIT(inet, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST, vnet_inet_register, 0); +#endif + /* * IP initialization: fill in IP protocol switch table. * All protocols not implemented in kernel go to raw IP protocol handler. @@ -213,11 +234,58 @@ void ip_init(void) { + INIT_VNET_INET(curvnet); struct protosw *pr; int i; - TAILQ_INIT(&in_ifaddrhead); - in_ifaddrhashtbl = hashinit(INADDR_NHASH, M_IFADDR, &in_ifaddrhmask); + TAILQ_INIT(&V_in_ifaddrhead); + V_in_ifaddrhashtbl = hashinit(INADDR_NHASH, M_IFADDR, + &V_in_ifaddrhmask); + + /* Initialize IP reassembly queue. */ + for (i = 0; i < IPREASS_NHASH; i++) + TAILQ_INIT(&V_ipq[i]); + V_nipq = 0; + V_maxnipq = nmbclusters / 32; + V_maxfragsperpacket = 16; + V_ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL, + NULL, UMA_ALIGN_PTR, 0); + maxnipq_update(); + + V_ipsendredirects = 1; /* XXX */ + V_ip_defttl = IPDEFTTL; +#ifdef IPSTEALTH + V_ipstealth = 0; +#endif + + V_ipport_lowfirstauto = IPPORT_RESERVED - 1; /* 1023 */ + V_ipport_lowlastauto = IPPORT_RESERVEDSTART; /* 600 */ + V_ipport_firstauto = IPPORT_HIFIRSTAUTO; /* 49152 */ + V_ipport_lastauto = IPPORT_HILASTAUTO; /* 65535 */ + V_ipport_hifirstauto = IPPORT_HIFIRSTAUTO; /* 49152 */ + V_ipport_hilastauto = IPPORT_HILASTAUTO; /* 65535 */ + V_ipport_reservedhigh = IPPORT_RESERVED - 1; /* 1023 */ + V_ipport_reservedlow = 0; + V_ipport_randomized = 1; /* user controlled via sysctl */ + V_ipport_randomcps = 10; /* user controlled via sysctl */ + V_ipport_randomtime = 45; /* user controlled via sysctl */ + V_ipport_stoprandom = 0; /* toggled by ipport_tick */ + + V_rsvp_on = 0; + V_ipforwarding = 0; + V_ipsendredirects = 1; /* XXX */ + V_ip_defttl = IPDEFTTL; + V_ip_keepfaith = 0; + V_ip_sendsourcequench = 0; + 
V_ip_do_randomid = 0; + V_ip_checkinterface = 0; + +#ifdef VIMAGE + /* Skip initialization of globals for non-default instances. */ + if (!IS_DEFAULT_VNET(curvnet)) + return; +#endif + pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) panic("ip_init: PF_INET not found"); @@ -245,26 +313,17 @@ printf("%s: WARNING: unable to register pfil hook, " "error %d\n", __func__, i); - /* Initialize IP reassembly queue. */ - IPQ_LOCK_INIT(); - for (i = 0; i < IPREASS_NHASH; i++) - TAILQ_INIT(&ipq[i]); - maxnipq = nmbclusters / 32; - maxfragsperpacket = 16; - ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL, - NULL, UMA_ALIGN_PTR, 0); - maxnipq_update(); - /* Start ipport_tick. */ callout_init(&ipport_tick_callout, CALLOUT_MPSAFE); - ipport_tick(NULL); + callout_reset(&ipport_tick_callout, 1, ipport_tick, NULL); EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL, SHUTDOWN_PRI_DEFAULT); EVENTHANDLER_REGISTER(nmbclusters_change, ipq_zone_change, NULL, EVENTHANDLER_PRI_ANY); /* Initialize various other remaining things. 
*/ - ip_id = time_second & 0xffff; + IPQ_LOCK_INIT(); + V_ip_id = time_second & 0xffff; ipintrq.ifq_maxlen = ipqmaxlen; mtx_init(&ipintrq.ifq_mtx, "ip_inq", NULL, MTX_DEF); netisr_register(NETISR_IP, ip_input, &ipintrq, NETISR_MPSAFE); @@ -284,6 +343,7 @@ void ip_input(struct mbuf *m) { + INIT_VNET_INET(curvnet); struct ip *ip = NULL; struct in_ifaddr *ia = NULL; struct ifaddr *ifa; @@ -306,31 +366,31 @@ goto ours; } - ipstat.ips_total++; + V_ipstat.ips_total++; if (m->m_pkthdr.len < sizeof(struct ip)) goto tooshort; if (m->m_len < sizeof (struct ip) && (m = m_pullup(m, sizeof (struct ip))) == NULL) { - ipstat.ips_toosmall++; + V_ipstat.ips_toosmall++; return; } ip = mtod(m, struct ip *); if (ip->ip_v != IPVERSION) { - ipstat.ips_badvers++; + V_ipstat.ips_badvers++; goto bad; } hlen = ip->ip_hl << 2; if (hlen < sizeof(struct ip)) { /* minimum header length */ - ipstat.ips_badhlen++; + V_ipstat.ips_badhlen++; goto bad; } if (hlen > m->m_len) { if ((m = m_pullup(m, hlen)) == NULL) { - ipstat.ips_badhlen++; + V_ipstat.ips_badhlen++; return; } ip = mtod(m, struct ip *); @@ -340,7 +400,7 @@ if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { if ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) == 0) { - ipstat.ips_badaddr++; + V_ipstat.ips_badaddr++; goto bad; } } @@ -355,7 +415,7 @@ } } if (sum) { - ipstat.ips_badsum++; + V_ipstat.ips_badsum++; goto bad; } @@ -370,7 +430,7 @@ */ ip->ip_len = ntohs(ip->ip_len); if (ip->ip_len < hlen) { - ipstat.ips_badlen++; + V_ipstat.ips_badlen++; goto bad; } ip->ip_off = ntohs(ip->ip_off); @@ -383,7 +443,7 @@ */ if (m->m_pkthdr.len < ip->ip_len) { tooshort: - ipstat.ips_tooshort++; + V_ipstat.ips_tooshort++; goto bad; } if (m->m_pkthdr.len > ip->ip_len) { @@ -455,7 +515,7 @@ * anywhere else. Also checks if the rsvp daemon is running before * grabbing the packet. 
*/ - if (rsvp_on && ip->ip_p==IPPROTO_RSVP) + if (V_rsvp_on && ip->ip_p==IPPROTO_RSVP) goto ours; /* @@ -464,7 +524,7 @@ * we receive might be for us (and let the upper layers deal * with it). */ - if (TAILQ_EMPTY(&in_ifaddrhead) && + if (TAILQ_EMPTY(&V_in_ifaddrhead) && (m->m_flags & (M_MCAST|M_BCAST)) == 0) goto ours; @@ -486,7 +546,7 @@ * insert a workaround. If the packet got here, we already * checked with carp_iamatch() and carp_forus(). */ - checkif = ip_checkinterface && (ipforwarding == 0) && + checkif = V_ip_checkinterface && (V_ipforwarding == 0) && m->m_pkthdr.rcvif != NULL && ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) == 0) && #ifdef DEV_CARP @@ -534,13 +594,13 @@ } /* RFC 3927 2.7: Do not forward datagrams for 169.254.0.0/16. */ if (IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))) { - ipstat.ips_cantforward++; + V_ipstat.ips_cantforward++; m_freem(m); return; } if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { struct in_multi *inm; - if (ip_mrouter) { + if (V_ip_mrouter) { /* * If we are acting as a multicast router, all * incoming multicast packets are passed to the @@ -551,7 +611,7 @@ */ if (ip_mforward && ip_mforward(ip, m->m_pkthdr.rcvif, m, 0) != 0) { - ipstat.ips_cantforward++; + V_ipstat.ips_cantforward++; m_freem(m); return; } @@ -563,7 +623,7 @@ */ if (ip->ip_p == IPPROTO_IGMP) goto ours; - ipstat.ips_forward++; + V_ipstat.ips_forward++; } /* * See if we belong to the destination multicast group on the @@ -573,7 +633,7 @@ IN_LOOKUP_MULTI(ip->ip_dst, m->m_pkthdr.rcvif, inm); IN_MULTI_UNLOCK(); if (inm == NULL) { - ipstat.ips_notmember++; + V_ipstat.ips_notmember++; m_freem(m); return; } @@ -588,7 +648,7 @@ * FAITH(Firewall Aided Internet Translator) */ if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type == IFT_FAITH) { - if (ip_keepfaith) { + if (V_ip_keepfaith) { if (ip->ip_p == IPPROTO_TCP || ip->ip_p == IPPROTO_ICMP) goto ours; } @@ -599,8 +659,8 @@ /* * Not for us; forward if possible and desirable. 
*/ - if (ipforwarding == 0) { - ipstat.ips_cantforward++; + if (V_ipforwarding == 0) { + V_ipstat.ips_cantforward++; m_freem(m); } else { #ifdef IPSEC @@ -660,7 +720,7 @@ /* * Switch out to protocol's input routine. */ - ipstat.ips_delivered++; + V_ipstat.ips_delivered++; (*inetsw[ip_protox[ip->ip_p]].pr_input)(m, hlen); return; @@ -676,32 +736,34 @@ static void maxnipq_update(void) { + INIT_VNET_INET(curvnet); /* * -1 for unlimited allocation. */ - if (maxnipq < 0) - uma_zone_set_max(ipq_zone, 0); + if (V_maxnipq < 0) + uma_zone_set_max(V_ipq_zone, 0); /* * Positive number for specific bound. */ - if (maxnipq > 0) - uma_zone_set_max(ipq_zone, maxnipq); + if (V_maxnipq > 0) + uma_zone_set_max(V_ipq_zone, V_maxnipq); /* * Zero specifies no further fragment queue allocation -- set the * bound very low, but rely on implementation elsewhere to actually * prevent allocation and reclaim current queues. */ - if (maxnipq == 0) - uma_zone_set_max(ipq_zone, 1); + if (V_maxnipq == 0) + uma_zone_set_max(V_ipq_zone, 1); } static void ipq_zone_change(void *tag) { + INIT_VNET_INET(curvnet); - if (maxnipq > 0 && maxnipq < (nmbclusters / 32)) { - maxnipq = nmbclusters / 32; + if (V_maxnipq > 0 && V_maxnipq < (nmbclusters / 32)) { + V_maxnipq = nmbclusters / 32; maxnipq_update(); } } @@ -709,9 +771,10 @@ static int sysctl_maxnipq(SYSCTL_HANDLER_ARGS) { + INIT_VNET_INET(curvnet); int error, i; - i = maxnipq; + i = V_maxnipq; error = sysctl_handle_int(oidp, &i, 0, req); if (error || !req->newptr) return (error); @@ -722,7 +785,7 @@ */ if (i < -1) return (EINVAL); - maxnipq = i; + V_maxnipq = i; maxnipq_update(); return (0); } @@ -744,6 +807,7 @@ struct mbuf * ip_reass(struct mbuf *m) { + INIT_VNET_INET(curvnet); struct ip *ip; struct mbuf *p, *q, *nq, *t; struct ipq *fp = NULL; @@ -753,9 +817,9 @@ u_short hash; /* If maxnipq or maxfragsperpacket are 0, never accept fragments. 
*/ - if (maxnipq == 0 || maxfragsperpacket == 0) { - ipstat.ips_fragments++; - ipstat.ips_fragdropped++; + if (V_maxnipq == 0 || V_maxfragsperpacket == 0) { + V_ipstat.ips_fragments++; + V_ipstat.ips_fragdropped++; m_freem(m); return (NULL); } @@ -764,7 +828,7 @@ hlen = ip->ip_hl << 2; hash = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id); - head = &ipq[hash]; + head = &V_ipq[hash]; IPQ_LOCK(); /* @@ -787,7 +851,7 @@ * Attempt to trim the number of allocated fragment queues if it * exceeds the administrative limit. */ - if ((nipq > maxnipq) && (maxnipq > 0)) { + if ((V_nipq > V_maxnipq) && (V_maxnipq > 0)) { /* * drop something from the tail of the current queue * before proceeding further @@ -795,15 +859,16 @@ struct ipq *q = TAILQ_LAST(head, ipqhead); if (q == NULL) { /* gak */ for (i = 0; i < IPREASS_NHASH; i++) { - struct ipq *r = TAILQ_LAST(&ipq[i], ipqhead); + struct ipq *r = TAILQ_LAST(&V_ipq[i], ipqhead); if (r) { - ipstat.ips_fragtimeout += r->ipq_nfrags; - ip_freef(&ipq[i], r); + V_ipstat.ips_fragtimeout += + r->ipq_nfrags; + ip_freef(&V_ipq[i], r); break; } } } else { - ipstat.ips_fragtimeout += q->ipq_nfrags; + V_ipstat.ips_fragtimeout += q->ipq_nfrags; ip_freef(head, q); } } @@ -820,7 +885,7 @@ * that's a non-zero multiple of 8 bytes. */ if (ip->ip_len == 0 || (ip->ip_len & 0x7) != 0) { - ipstat.ips_toosmall++; /* XXX */ + V_ipstat.ips_toosmall++; /* XXX */ goto dropfrag; } m->m_flags |= M_FRAG; @@ -833,7 +898,7 @@ * Attempt reassembly; if it succeeds, proceed. * ip_reass() will return a different mbuf. */ - ipstat.ips_fragments++; + V_ipstat.ips_fragments++; m->m_pkthdr.header = ip; /* Previous ip_reass() started here. */ @@ -848,19 +913,19 @@ * If first fragment to arrive, create a reassembly queue. 
*/ if (fp == NULL) { - fp = uma_zalloc(ipq_zone, M_NOWAIT); + fp = uma_zalloc(V_ipq_zone, M_NOWAIT); if (fp == NULL) goto dropfrag; #ifdef MAC if (mac_ipq_init(fp, M_NOWAIT) != 0) { - uma_zfree(ipq_zone, fp); + uma_zfree(V_ipq_zone, fp); fp = NULL; goto dropfrag; } mac_ipq_create(m, fp); #endif TAILQ_INSERT_HEAD(head, fp, ipq_list); - nipq++; + V_nipq++; fp->ipq_nfrags = 1; fp->ipq_ttl = IPFRAGTTL; fp->ipq_p = ip->ip_p; @@ -944,7 +1009,7 @@ } nq = q->m_nextpkt; m->m_nextpkt = nq; - ipstat.ips_fragdropped++; + V_ipstat.ips_fragdropped++; fp->ipq_nfrags--; m_freem(q); } @@ -962,8 +1027,8 @@ next = 0; for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) { if (GETIP(q)->ip_off != next) { - if (fp->ipq_nfrags > maxfragsperpacket) { - ipstat.ips_fragdropped += fp->ipq_nfrags; + if (fp->ipq_nfrags > V_maxfragsperpacket) { + V_ipstat.ips_fragdropped += fp->ipq_nfrags; ip_freef(head, fp); } goto done; @@ -972,8 +1037,8 @@ } /* Make sure the last packet didn't have the IP_MF flag */ if (p->m_flags & M_FRAG) { - if (fp->ipq_nfrags > maxfragsperpacket) { - ipstat.ips_fragdropped += fp->ipq_nfrags; + if (fp->ipq_nfrags > V_maxfragsperpacket) { + V_ipstat.ips_fragdropped += fp->ipq_nfrags; ip_freef(head, fp); } goto done; @@ -985,8 +1050,8 @@ q = fp->ipq_frags; ip = GETIP(q); if (next + (ip->ip_hl << 2) > IP_MAXPACKET) { - ipstat.ips_toolong++; - ipstat.ips_fragdropped += fp->ipq_nfrags; + V_ipstat.ips_toolong++; + V_ipstat.ips_fragdropped += fp->ipq_nfrags; ip_freef(head, fp); goto done; } @@ -1028,19 +1093,19 @@ ip->ip_src = fp->ipq_src; ip->ip_dst = fp->ipq_dst; TAILQ_REMOVE(head, fp, ipq_list); - nipq--; - uma_zfree(ipq_zone, fp); + V_nipq--; + uma_zfree(V_ipq_zone, fp); m->m_len += (ip->ip_hl << 2); m->m_data -= (ip->ip_hl << 2); /* some debugging cruft by sklower, below, will go away soon */ if (m->m_flags & M_PKTHDR) /* XXX this should be done elsewhere */ m_fixhdr(m); - ipstat.ips_reassembled++; + V_ipstat.ips_reassembled++; IPQ_UNLOCK(); return (m); dropfrag: - 
ipstat.ips_fragdropped++; + V_ipstat.ips_fragdropped++; if (fp != NULL) fp->ipq_nfrags--; m_freem(m); @@ -1058,6 +1123,7 @@ static void ip_freef(struct ipqhead *fhp, struct ipq *fp) { + INIT_VNET_INET(curvnet); struct mbuf *q; IPQ_LOCK_ASSERT(); @@ -1068,8 +1134,8 @@ m_freem(q); } TAILQ_REMOVE(fhp, fp, ipq_list); - uma_zfree(ipq_zone, fp); - nipq--; + uma_zfree(V_ipq_zone, fp); + V_nipq--; } /* @@ -1084,15 +1150,17 @@ int i; IPQ_LOCK(); + VNET_ITERLOOP_BEGIN(); + INIT_VNET_INET(vnet_iter); for (i = 0; i < IPREASS_NHASH; i++) { - for(fp = TAILQ_FIRST(&ipq[i]); fp;) { + for(fp = TAILQ_FIRST(&V_ipq[i]); fp;) { struct ipq *fpp; fpp = fp; fp = TAILQ_NEXT(fp, ipq_list); if(--fpp->ipq_ttl == 0) { - ipstat.ips_fragtimeout += fpp->ipq_nfrags; - ip_freef(&ipq[i], fpp); + V_ipstat.ips_fragtimeout += fpp->ipq_nfrags; + ip_freef(&V_ipq[i], fpp); } } } @@ -1101,15 +1169,16 @@ * (due to the limit being lowered), drain off * enough to get down to the new limit. */ - if (maxnipq >= 0 && nipq > maxnipq) { + if (V_maxnipq >= 0 && V_nipq > V_maxnipq) { for (i = 0; i < IPREASS_NHASH; i++) { - while (nipq > maxnipq && !TAILQ_EMPTY(&ipq[i])) { - ipstat.ips_fragdropped += - TAILQ_FIRST(&ipq[i])->ipq_nfrags; - ip_freef(&ipq[i], TAILQ_FIRST(&ipq[i])); + while (V_nipq > V_maxnipq && !TAILQ_EMPTY(&V_ipq[i])) { + V_ipstat.ips_fragdropped += + TAILQ_FIRST(&V_ipq[i])->ipq_nfrags; + ip_freef(&V_ipq[i], TAILQ_FIRST(&V_ipq[i])); } } } + VNET_ITERLOOP_END(); IPQ_UNLOCK(); } @@ -1122,13 +1191,16 @@ int i; IPQ_LOCK(); + VNET_ITERLOOP_BEGIN(); + INIT_VNET_INET(vnet_iter); for (i = 0; i < IPREASS_NHASH; i++) { - while(!TAILQ_EMPTY(&ipq[i])) { - ipstat.ips_fragdropped += - TAILQ_FIRST(&ipq[i])->ipq_nfrags; - ip_freef(&ipq[i], TAILQ_FIRST(&ipq[i])); + while(!TAILQ_EMPTY(&V_ipq[i])) { + V_ipstat.ips_fragdropped += + TAILQ_FIRST(&V_ipq[i])->ipq_nfrags; + ip_freef(&V_ipq[i], TAILQ_FIRST(&V_ipq[i])); } } + VNET_ITERLOOP_END(); IPQ_UNLOCK(); in_rtqdrain(); } @@ -1245,6 +1317,7 @@ void ip_forward(struct mbuf 
*m, int srcrt) { + INIT_VNET_INET(curvnet); struct ip *ip = mtod(m, struct ip *); struct in_ifaddr *ia = NULL; struct mbuf *mcopy; @@ -1252,7 +1325,7 @@ int error, type = 0, code = 0, mtu = 0; if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) { - ipstat.ips_cantforward++; + V_ipstat.ips_cantforward++; m_freem(m); return; } @@ -1324,7 +1397,7 @@ * or a route modified by a redirect. */ dest.s_addr = 0; - if (!srcrt && ipsendredirects && ia->ia_ifp == m->m_pkthdr.rcvif) { + if (!srcrt && V_ipsendredirects && ia->ia_ifp == m->m_pkthdr.rcvif) { struct sockaddr_in *sin; struct route ro; struct rtentry *rt; @@ -1360,11 +1433,11 @@ error = ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL); if (error) - ipstat.ips_cantforward++; + V_ipstat.ips_cantforward++; else { - ipstat.ips_forward++; + V_ipstat.ips_forward++; if (type) - ipstat.ips_redirectsent++; + V_ipstat.ips_redirectsent++; else { if (mcopy) m_freem(mcopy); @@ -1407,7 +1480,7 @@ else mtu = ip_next_mtu(ip->ip_len, 0); } - ipstat.ips_cantfrag++; + V_ipstat.ips_cantfrag++; break; case ENOBUFS: @@ -1419,7 +1492,7 @@ * Those who need source quench packets may re-enable them * via the net.inet.ip.sendsourcequench sysctl. */ - if (ip_sendsourcequench == 0) { + if (V_ip_sendsourcequench == 0) { m_freem(mcopy); return; } else { @@ -1439,6 +1512,8 @@ ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip, struct mbuf *m) { + INIT_VNET_NET(inp->inp_vnet); + if (inp->inp_socket->so_options & (SO_BINTIME | SO_TIMESTAMP)) { struct bintime bt; @@ -1501,7 +1576,7 @@ struct sockaddr_dl *sdl2 = &sdlbuf.sdl; if (((ifp = m->m_pkthdr.rcvif)) - && ( ifp->if_index && (ifp->if_index <= if_index))) { + && ( ifp->if_index && (ifp->if_index <= V_if_index))) { sdp = (struct sockaddr_dl *)ifp->if_addr->ifa_addr; /* * Change our mind and don't try copy. @@ -1532,26 +1607,30 @@ * locking. This code remains in ip_input.c as ip_mroute.c is optionally * compiled. 
*/ +#ifndef VIMAGE static int ip_rsvp_on; struct socket *ip_rsvpd; +#endif int ip_rsvp_init(struct socket *so) { + INIT_VNET_INET(so->so_vnet); + if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP) return EOPNOTSUPP; - if (ip_rsvpd != NULL) + if (V_ip_rsvpd != NULL) return EADDRINUSE; - ip_rsvpd = so; + V_ip_rsvpd = so; /* * This may seem silly, but we need to be sure we don't over-increment * the RSVP counter, in case something slips up. */ - if (!ip_rsvp_on) { - ip_rsvp_on = 1; - rsvp_on++; + if (!V_ip_rsvp_on) { + V_ip_rsvp_on = 1; + V_rsvp_on++; } return 0; @@ -1560,14 +1639,16 @@ int ip_rsvp_done(void) { - ip_rsvpd = NULL; + INIT_VNET_INET(curvnet); + + V_ip_rsvpd = NULL; /* * This may seem silly, but we need to be sure we don't over-decrement * the RSVP counter, in case something slips up. */ - if (ip_rsvp_on) { - ip_rsvp_on = 0; - rsvp_on--; + if (V_ip_rsvp_on) { + V_ip_rsvp_on = 0; + V_rsvp_on--; } return 0; } @@ -1575,6 +1656,8 @@ void rsvp_input(struct mbuf *m, int off) /* XXX must fixup manually */ { + INIT_VNET_INET(curvnet); + if (rsvp_input_p) { /* call the real one if loaded */ rsvp_input_p(m, off); return; @@ -1585,12 +1668,12 @@ * case we want to throw the packet away. 
*/ - if (!rsvp_on) { + if (!V_rsvp_on) { m_freem(m); return; } - if (ip_rsvpd != NULL) { + if (V_ip_rsvpd != NULL) { rip_input(m, off); return; } --- /u/marko/p4/head/src/sys/netinet/ip_ipsec.c 2007-10-16 13:53:38.000000000 +0200 +++ src/sys/netinet/ip_ipsec.c 2007-10-22 18:06:42.000000000 +0200 @@ -31,6 +31,7 @@ __FBSDID("$FreeBSD: src/sys/netinet/ip_ipsec.c,v 1.8 2007/10/07 20:44:23 silby Exp $"); #include "opt_ipsec.h" +#include "opt_vimage.h" #include #include @@ -41,10 +42,12 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -60,6 +63,7 @@ #include #include #include +#include #endif /*IPSEC*/ extern struct protosw inetsw[]; @@ -92,6 +96,8 @@ ip_ipsec_fwd(struct mbuf *m) { #ifdef IPSEC + INIT_VNET_INET(curvnet); + INIT_VNET_IPSEC(curvnet); struct m_tag *mtag; struct tdb_ident *tdbi; struct secpolicy *sp; @@ -120,7 +126,7 @@ KEY_FREESP(&sp); splx(s); if (error) { - ipstat.ips_cantforward++; + V_ipstat.ips_cantforward++; return 1; } #endif /* IPSEC */ @@ -137,6 +143,7 @@ int ip_ipsec_input(struct mbuf *m) { + INIT_VNET_IPSEC(curvnet); struct ip *ip = mtod(m, struct ip *); #ifdef IPSEC struct m_tag *mtag; --- /u/marko/p4/head/src/sys/netinet/ip_mroute.c 2007-10-16 13:53:38.000000000 +0200 +++ src/sys/netinet/ip_mroute.c 2007-10-22 18:06:42.000000000 +0200 @@ -60,6 +60,7 @@ #include "opt_inet6.h" #include "opt_mac.h" #include "opt_mrouting.h" +#include "opt_vimage.h" #define _PIM_VT 1 @@ -80,9 +81,11 @@ #include #include #include +#include #include #include #include +#include #include #include #include @@ -421,6 +424,7 @@ static int X_ip_mrouter_set(struct socket *so, struct sockopt *sopt) { + INIT_VNET_INET(curvnet); int error, optval; vifi_t vifi; struct vifctl vifc; @@ -428,7 +432,7 @@ struct bw_upcall bw_upcall; uint32_t i; - if (so != ip_mrouter && sopt->sopt_name != MRT_INIT) + if (so != V_ip_mrouter && sopt->sopt_name != MRT_INIT) return EPERM; error = 0; @@ -645,6 +649,7 @@ static void if_detached_event(void 
*arg __unused, struct ifnet *ifp) { + INIT_VNET_INET(curvnet); vifi_t vifi; int i; struct mfc *mfc; @@ -654,7 +659,7 @@ struct rtdetq *npq; MROUTER_LOCK(); - if (ip_mrouter == NULL) { + if (V_ip_mrouter == NULL) { MROUTER_UNLOCK(); } @@ -708,6 +713,8 @@ static int ip_mrouter_init(struct socket *so, int version) { + INIT_VNET_INET(curvnet); + if (mrtdebug) log(LOG_DEBUG, "ip_mrouter_init: so_type = %d, pr_protocol = %d\n", so->so_type, so->so_proto->pr_protocol); @@ -720,7 +727,7 @@ MROUTER_LOCK(); - if (ip_mrouter != NULL) { + if (V_ip_mrouter != NULL) { MROUTER_UNLOCK(); return EADDRINUSE; } @@ -738,7 +745,7 @@ expire_bw_upcalls_send, NULL); callout_reset(&bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process, NULL); - ip_mrouter = so; + V_ip_mrouter = so; MROUTER_UNLOCK(); @@ -754,6 +761,7 @@ static int X_ip_mrouter_done(void) { + INIT_VNET_INET(curvnet); vifi_t vifi; int i; struct ifnet *ifp; @@ -763,7 +771,7 @@ MROUTER_LOCK(); - if (ip_mrouter == NULL) { + if (V_ip_mrouter == NULL) { MROUTER_UNLOCK(); return EINVAL; } @@ -771,7 +779,7 @@ /* * Detach/disable hooks to the reset of the system. 
*/ - ip_mrouter = NULL; + V_ip_mrouter = NULL; mrt_api_config = 0; VIF_LOCK(); @@ -1285,6 +1293,7 @@ X_ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m, struct ip_moptions *imo) { + INIT_VNET_INET(curvnet); struct mfc *rt; int error; vifi_t vifi; @@ -1449,7 +1458,7 @@ mrtstat.mrts_upcalls++; k_igmpsrc.sin_addr = ip->ip_src; - if (socket_send(ip_mrouter, mm, &k_igmpsrc) < 0) { + if (socket_send(V_ip_mrouter, mm, &k_igmpsrc) < 0) { log(LOG_WARNING, "ip_mforward: ip_mrouter socket queue full\n"); ++mrtstat.mrts_upq_sockfull; fail1: @@ -1589,6 +1598,7 @@ static int ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt, vifi_t xmt_vif) { + INIT_VNET_INET(curvnet); struct ip *ip = mtod(m, struct ip *); vifi_t vifi; int plen = ip->ip_len; @@ -1668,7 +1678,7 @@ mrtstat.mrts_upcalls++; k_igmpsrc.sin_addr = im->im_src; - if (socket_send(ip_mrouter, mm, &k_igmpsrc) < 0) { + if (socket_send(V_ip_mrouter, mm, &k_igmpsrc) < 0) { log(LOG_WARNING, "ip_mforward: ip_mrouter socket queue full\n"); ++mrtstat.mrts_upq_sockfull; @@ -1800,6 +1810,7 @@ static int X_ip_rsvp_vif(struct socket *so, struct sockopt *sopt) { + INIT_VNET_INET(curvnet); int error, vifi; if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP) @@ -1829,7 +1840,7 @@ */ if (!viftable[vifi].v_rsvp_on) { viftable[vifi].v_rsvp_on = 1; - rsvp_on++; + V_rsvp_on++; } } else { /* must be VIF_OFF */ /* @@ -1844,7 +1855,7 @@ */ if (viftable[vifi].v_rsvp_on) { viftable[vifi].v_rsvp_on = 0; - rsvp_on--; + V_rsvp_on--; } } VIF_UNLOCK(); @@ -1854,6 +1865,7 @@ static void X_ip_rsvp_force_done(struct socket *so) { + INIT_VNET_INET(curvnet); int vifi; /* Don't bother if it is not the right type of socket. 
*/ @@ -1873,7 +1885,7 @@ */ if (viftable[vifi].v_rsvp_on) { viftable[vifi].v_rsvp_on = 0; - rsvp_on--; + V_rsvp_on--; } } } @@ -1884,19 +1896,20 @@ static void X_rsvp_input(struct mbuf *m, int off) { + INIT_VNET_INET(curvnet); int vifi; struct ip *ip = mtod(m, struct ip *); struct sockaddr_in rsvp_src = { sizeof rsvp_src, AF_INET }; struct ifnet *ifp; if (rsvpdebug) - printf("rsvp_input: rsvp_on %d\n",rsvp_on); + printf("rsvp_input: rsvp_on %d\n", V_rsvp_on); /* Can still get packets with rsvp_on = 0 if there is a local member * of the group to which the RSVP packet is addressed. But in this * case we want to throw the packet away. */ - if (!rsvp_on) { + if (!V_rsvp_on) { m_freem(m); return; } @@ -1928,7 +1941,7 @@ * then use it. Otherwise, drop packet since there * is no specific socket for this vif. */ - if (ip_rsvpd != NULL) { + if (V_ip_rsvpd != NULL) { if (rsvpdebug) printf("rsvp_input: Sending packet up old-style socket\n"); rip_input(m, off); /* xxx */ @@ -2285,6 +2298,7 @@ static void bw_upcalls_send(void) { + INIT_VNET_INET(curvnet); struct mbuf *m; int len = bw_upcalls_n * sizeof(bw_upcalls[0]); struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; @@ -2323,7 +2337,7 @@ * XXX do we need to set the address in k_igmpsrc ? 
*/ mrtstat.mrts_upcalls++; - if (socket_send(ip_mrouter, m, &k_igmpsrc) < 0) { + if (socket_send(V_ip_mrouter, m, &k_igmpsrc) < 0) { log(LOG_WARNING, "bw_upcalls_send: ip_mrouter socket queue full\n"); ++mrtstat.mrts_upq_sockfull; } @@ -2645,6 +2659,7 @@ pim_register_send_upcall(struct ip *ip, struct vif *vifp, struct mbuf *mb_copy, struct mfc *rt) { + INIT_VNET_INET(curvnet); struct mbuf *mb_first; int len = ntohs(ip->ip_len); struct igmpmsg *im; @@ -2677,7 +2692,7 @@ mrtstat.mrts_upcalls++; - if (socket_send(ip_mrouter, mb_first, &k_igmpsrc) < 0) { + if (socket_send(V_ip_mrouter, mb_first, &k_igmpsrc) < 0) { if (mrtdebug & DEBUG_PIM) log(LOG_WARNING, "mcast: pim_register_send_upcall: ip_mrouter socket queue full"); @@ -2699,6 +2714,7 @@ pim_register_send_rp(struct ip *ip, struct vif *vifp, struct mbuf *mb_copy, struct mfc *rt) { + INIT_VNET_INET(curvnet); struct mbuf *mb_first; struct ip *ip_outer; struct pim_encap_pimhdr *pimhdr; @@ -3028,6 +3044,7 @@ static int ip_mroute_modevent(module_t mod, int type, void *unused) { + INIT_VNET_INET(curvnet); switch (type) { case MOD_LOAD: MROUTER_LOCK_INIT(); @@ -3094,7 +3111,7 @@ * just loaded and then unloaded w/o starting up a user * process we still need to cleanup. 
*/ - if (ip_mrouter + if (V_ip_mrouter #ifdef INET6 || ip6_mrouter #endif --- /u/marko/p4/head/src/sys/netinet/ip_options.c 2008-01-28 23:53:52.000000000 +0100 +++ src/sys/netinet/ip_options.c 2008-02-27 11:49:12.000000000 +0100 @@ -34,6 +34,7 @@ #include "opt_ipstealth.h" #include "opt_mac.h" +#include "opt_vimage.h" #include #include @@ -45,7 +46,9 @@ #include #include #include +#include +#include #include #include #include @@ -53,6 +56,7 @@ #include #include +#include #include #include #include @@ -97,6 +101,7 @@ int ip_dooptions(struct mbuf *m, int pass) { + INIT_VNET_INET(curvnet); struct ip *ip = mtod(m, struct ip *); u_char *cp; struct in_ifaddr *ia; @@ -193,7 +198,7 @@ goto dropit; #endif if (!ip_dosourceroute) { - if (ipforwarding) { + if (V_ipforwarding) { char buf[16]; /* aaa.bbb.ccc.ddd\0 */ /* * Acting as a router, so generate @@ -215,7 +220,7 @@ #ifdef IPSTEALTH dropit: #endif - ipstat.ips_cantforward++; + V_ipstat.ips_cantforward++; m_freem(m); return (1); } @@ -355,14 +360,14 @@ cp[IPOPT_OFFSET] += sizeof(n_time); } } - if (forward && ipforwarding) { + if (forward && V_ipforwarding) { ip_forward(m, 1); return (1); } return (0); bad: icmp_error(m, type, code, 0, 0); - ipstat.ips_badoptions++; + V_ipstat.ips_badoptions++; return (1); } --- /u/marko/p4/head/src/sys/netinet/ip_output.c 2008-02-03 08:16:01.000000000 +0100 +++ src/sys/netinet/ip_output.c 2008-02-27 18:00:07.000000000 +0100 @@ -36,6 +36,7 @@ #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_mbuf_stress_test.h" +#include "opt_vimage.h" #include #include @@ -49,12 +50,15 @@ #include #include #include +#include +#include #include #include #include #include +#include #include #include #include @@ -78,7 +82,9 @@ (ntohl(a.s_addr)>>8)&0xFF,\ (ntohl(a.s_addr))&0xFF, y); +#ifndef VIMAGE u_short ip_id; +#endif #ifdef MBUF_STRESS_TEST int mbuf_frag_size = 0; @@ -104,6 +110,8 @@ ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, struct ip_moptions *imo, struct inpcb *inp) 
{ + INIT_VNET_NET(curvnet); + INIT_VNET_INET(curvnet); struct ip *ip; struct ifnet *ifp = NULL; /* keep compiler happy */ struct mbuf *m0; @@ -151,7 +159,7 @@ ip->ip_v = IPVERSION; ip->ip_hl = hlen >> 2; ip->ip_id = ip_newid(); - ipstat.ips_localout++; + V_ipstat.ips_localout++; } else { hlen = ip->ip_hl << 2; } @@ -190,7 +198,7 @@ if (flags & IP_SENDONES) { if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst)))) == NULL && (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL) { - ipstat.ips_noroute++; + V_ipstat.ips_noroute++; error = ENETUNREACH; goto bad; } @@ -202,7 +210,7 @@ } else if (flags & IP_ROUTETOIF) { if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL && (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == NULL) { - ipstat.ips_noroute++; + V_ipstat.ips_noroute++; error = ENETUNREACH; goto bad; } @@ -227,7 +235,7 @@ if (ro->ro_rt == NULL) rtalloc_ign(ro, 0); if (ro->ro_rt == NULL) { - ipstat.ips_noroute++; + V_ipstat.ips_noroute++; error = EHOSTUNREACH; goto bad; } @@ -286,7 +294,7 @@ */ if ((imo == NULL) || (imo->imo_multicast_vif == -1)) { if ((ifp->if_flags & IFF_MULTICAST) == 0) { - ipstat.ips_noroute++; + V_ipstat.ips_noroute++; error = ENETUNREACH; goto bad; } @@ -327,14 +335,14 @@ * above, will be forwarded by the ip_input() routine, * if necessary. */ - if (ip_mrouter && (flags & IP_FORWARDING) == 0) { + if (V_ip_mrouter && (flags & IP_FORWARDING) == 0) { /* * If rsvp daemon is not running, do not * set ip_moptions. This ensures that the packet * is multicast and not just sent down one link * as prescribed by rsvpd. 
*/ - if (!rsvp_on) + if (!V_rsvp_on) imo = NULL; if (ip_mforward && ip_mforward(ip, ifp, m, imo) != 0) { @@ -386,7 +394,7 @@ #endif /* ALTQ */ { error = ENOBUFS; - ipstat.ips_odropped++; + V_ipstat.ips_odropped++; ifp->if_snd.ifq_drops += (ip->ip_len / ifp->if_mtu + 1); goto bad; } @@ -450,7 +458,7 @@ if (in_localip(ip->ip_dst)) { m->m_flags |= M_FASTFWD_OURS; if (m->m_pkthdr.rcvif == NULL) - m->m_pkthdr.rcvif = loif; + m->m_pkthdr.rcvif = V_loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; @@ -469,7 +477,7 @@ /* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */ if (m->m_flags & M_FASTFWD_OURS) { if (m->m_pkthdr.rcvif == NULL) - m->m_pkthdr.rcvif = loif; + m->m_pkthdr.rcvif = V_loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; @@ -497,7 +505,7 @@ if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { if ((ifp->if_flags & IFF_LOOPBACK) == 0) { - ipstat.ips_badaddr++; + V_ipstat.ips_badaddr++; error = EADDRNOTAVAIL; goto bad; } @@ -556,7 +564,7 @@ /* Balk when DF bit is set or the interface didn't support TSO. 
*/ if ((ip->ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) { error = EMSGSIZE; - ipstat.ips_cantfrag++; + V_ipstat.ips_cantfrag++; goto bad; } @@ -589,7 +597,7 @@ } if (error == 0) - ipstat.ips_fragmented++; + V_ipstat.ips_fragmented++; done: if (ro == &iproute && ro->ro_rt) { @@ -614,6 +622,7 @@ ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu, u_long if_hwassist_flags, int sw_csum) { + INIT_VNET_INET(curvnet); int error = 0; int hlen = ip->ip_hl << 2; int len = (mtu - hlen) & ~7; /* size of payload in each fragment */ @@ -624,7 +633,7 @@ int nfrags; if (ip->ip_off & IP_DF) { /* Fragmentation not allowed */ - ipstat.ips_cantfrag++; + V_ipstat.ips_cantfrag++; return EMSGSIZE; } @@ -699,7 +708,7 @@ MGETHDR(m, M_DONTWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; - ipstat.ips_odropped++; + V_ipstat.ips_odropped++; goto done; } m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG; @@ -729,7 +738,7 @@ if (m->m_next == NULL) { /* copy failed */ m_free(m); error = ENOBUFS; /* ??? 
*/ - ipstat.ips_odropped++; + V_ipstat.ips_odropped++; goto done; } m->m_pkthdr.len = mhlen + len; @@ -745,7 +754,7 @@ *mnext = m; mnext = &m->m_nextpkt; } - ipstat.ips_ofragments += nfrags; + V_ipstat.ips_ofragments += nfrags; /* set first marker for fragment chain */ m0->m_flags |= M_FIRSTFRAG | M_FRAG; --- /u/marko/p4/head/src/sys/netinet/ip_var.h 2007-08-31 03:48:02.000000000 +0200 +++ src/sys/netinet/ip_var.h 2007-10-19 01:49:35.000000000 +0200 @@ -172,19 +172,22 @@ struct route; struct sockopt; +#ifndef VIMAGE extern struct ipstat ipstat; extern u_short ip_id; /* ip packet ctr, for ids */ extern int ip_defttl; /* default IP ttl */ extern int ipforwarding; /* ip forwarding */ +extern int ip_do_randomid; #ifdef IPSTEALTH extern int ipstealth; /* stealth forwarding */ #endif -extern u_char ip_protox[]; +extern int rsvp_on; extern struct socket *ip_rsvpd; /* reservation protocol daemon */ extern struct socket *ip_mrouter; /* multicast routing daemon */ +#endif +extern u_char ip_protox[]; extern int (*legal_vif_num)(int); extern u_long (*ip_mcast_src)(int); -extern int rsvp_on; extern struct pr_usrreqs rip_usrreqs; void inp_freemoptions(struct ip_moptions *); @@ -217,6 +220,9 @@ int rip_ctloutput(struct socket *, struct sockopt *); void rip_ctlinput(int, struct sockaddr *, void *); void rip_init(void); +#ifdef VIMAGE +void rip_destroy(void); +#endif void rip_input(struct mbuf *, int); int rip_output(struct mbuf *, struct socket *, u_long); void ipip_input(struct mbuf *, int); @@ -231,9 +237,7 @@ void in_delayed_cksum(struct mbuf *m); -static __inline uint16_t ip_newid(void); -extern int ip_do_randomid; - +#if 0 static __inline uint16_t ip_newid(void) { @@ -242,6 +246,9 @@ return htons(ip_id++); } +#else +#define ip_newid() (V_ip_do_randomid ? 
ip_randomid() : htons(V_ip_id++)) /* htons() as in the inline version this replaces */ +#endif #endif /* _KERNEL */ --- /u/marko/p4/head/src/sys/netinet/ipprotosw.h 2007-08-31 03:48:02.000000000 +0200 +++ src/sys/netinet/ipprotosw.h 2007-10-05 12:27:05.000000000 +0200 @@ -87,6 +87,7 @@ void *pr_ousrreq; /* utility hooks */ pr_init_t *pr_init; + pr_destroy_t *pr_destroy; pr_fasttimo_t *pr_fasttimo; /* fast timeout (200ms) */ pr_slowtimo_t *pr_slowtimo; /* slow timeout (500ms) */ pr_drain_t *pr_drain; /* flush any excess space possible */ --- /u/marko/p4/head/src/sys/netinet/raw_ip.c 2007-10-29 17:17:43.000000000 +0100 +++ src/sys/netinet/raw_ip.c 2007-12-10 11:26:11.000000000 +0100 @@ -35,6 +35,7 @@ #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_mac.h" +#include "opt_vimage.h" #include #include @@ -51,12 +52,15 @@ #include #include #include +#include #include +#include #include #include +#include #include #include #include @@ -74,8 +78,11 @@ #include +#ifndef VIMAGE struct inpcbhead ripcb; struct inpcbinfo ripcbinfo; +#endif +static struct uma_zone *ripcb_zone; /* control hooks for ipfw and dummynet */ ip_fw_ctl_t *ip_fw_ctl_ptr = NULL; @@ -87,7 +94,9 @@ */ /* The socket used to communicate with the multicast routing daemon. 
*/ +#ifndef VIMAGE struct socket *ip_mrouter; +#endif /* The various mrouter and rsvp functions */ int (*ip_mrouter_set)(struct socket *, struct sockopt *); @@ -113,8 +122,9 @@ static void rip_zone_change(void *tag) { + INIT_VNET_INET(curvnet); - uma_zone_set_max(ripcbinfo.ipi_zone, maxsockets); + uma_zone_set_max(V_ripcbinfo.ipi_zone, maxsockets); } static int @@ -129,25 +139,49 @@ void rip_init(void) { + INIT_VNET_INET(curvnet); - INP_INFO_LOCK_INIT(&ripcbinfo, "rip"); - LIST_INIT(&ripcb); - ripcbinfo.ipi_listhead = &ripcb; +#ifdef VIMAGE + if (IS_DEFAULT_VNET(curvnet)) { +#endif + ripcb_zone = uma_zcreate("ripcb", sizeof(struct inpcb), + NULL, NULL, rip_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); +#ifdef VIMAGE + } + V_ripcbinfo.ipi_vnet = curvnet; +#endif + + INP_INFO_LOCK_INIT(&V_ripcbinfo, "rip"); + LIST_INIT(&V_ripcb); + V_ripcbinfo.ipi_listhead = &V_ripcb; /* * XXX We don't use the hash list for raw IP, but it's easier * to allocate a one entry hash list than it is to check all * over the place for hashbase == NULL. 
*/ - ripcbinfo.ipi_hashbase = hashinit(1, M_PCB, &ripcbinfo.ipi_hashmask); - ripcbinfo.ipi_porthashbase = hashinit(1, M_PCB, - &ripcbinfo.ipi_porthashmask); - ripcbinfo.ipi_zone = uma_zcreate("ripcb", sizeof(struct inpcb), - NULL, NULL, rip_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); - uma_zone_set_max(ripcbinfo.ipi_zone, maxsockets); + V_ripcbinfo.ipi_hashbase = + hashinit(1, M_PCB, &V_ripcbinfo.ipi_hashmask); + V_ripcbinfo.ipi_porthashbase = + hashinit(1, M_PCB, &V_ripcbinfo.ipi_porthashmask); + V_ripcbinfo.ipi_zone = ripcb_zone; + uma_zone_set_max(V_ripcbinfo.ipi_zone, maxsockets); EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL, EVENTHANDLER_PRI_ANY); } +#ifdef VIMAGE +void +rip_destroy(void) +{ + INIT_VNET_INET(curvnet); + + hashdestroy(V_ripcbinfo.ipi_hashbase, M_PCB, + V_ripcbinfo.ipi_hashmask); + hashdestroy(V_ripcbinfo.ipi_porthashbase, M_PCB, + V_ripcbinfo.ipi_porthashmask); +} +#endif + static struct sockaddr_in ripsrc = { sizeof(ripsrc), AF_INET }; static int @@ -201,14 +235,15 @@ void rip_input(struct mbuf *m, int off) { + INIT_VNET_INET(curvnet); struct ip *ip = mtod(m, struct ip *); int proto = ip->ip_p; struct inpcb *inp, *last; - INP_INFO_RLOCK(&ripcbinfo); + INP_INFO_RLOCK(&V_ripcbinfo); ripsrc.sin_addr = ip->ip_src; last = NULL; - LIST_FOREACH(inp, &ripcb, inp_list) { + LIST_FOREACH(inp, &V_ripcb, inp_list) { INP_LOCK(inp); if (inp->inp_ip_p && inp->inp_ip_p != proto) { docontinue: @@ -242,14 +277,14 @@ } if (last != NULL) { if (raw_append(last, ip, m) != 0) - ipstat.ips_delivered--; + V_ipstat.ips_delivered--; INP_UNLOCK(last); } else { m_freem(m); - ipstat.ips_noproto++; - ipstat.ips_delivered--; + V_ipstat.ips_noproto++; + V_ipstat.ips_delivered--; } - INP_INFO_RUNLOCK(&ripcbinfo); + INP_INFO_RUNLOCK(&V_ripcbinfo); } /* @@ -259,6 +294,7 @@ int rip_output(struct mbuf *m, struct socket *so, u_long dst) { + INIT_VNET_INET(so->so_vnet); struct ip *ip; int error; struct inpcb *inp = sotoinpcb(so); @@ -323,7 +359,7 @@ ip->ip_id 
= ip_newid(); /* XXX prevent ip_output from overwriting header fields */ flags |= IP_RAWOUTPUT; - ipstat.ips_rawout++; + V_ipstat.ips_rawout++; } if (inp->inp_flags & INP_ONESBCAST) @@ -538,6 +574,7 @@ void rip_ctlinput(int cmd, struct sockaddr *sa, void *vip) { + INIT_VNET_INET(curvnet); struct in_ifaddr *ia; struct ifnet *ifp; int err; @@ -545,7 +582,7 @@ switch (cmd) { case PRC_IFDOWN: - TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) { + TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (ia->ia_ifa.ifa_addr == sa && (ia->ia_flags & IFA_ROUTE)) { /* @@ -565,7 +602,7 @@ break; case PRC_IFUP: - TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) { + TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (ia->ia_ifa.ifa_addr == sa) break; } @@ -596,6 +633,7 @@ static int rip_attach(struct socket *so, int proto, struct thread *td) { + INIT_VNET_INET(so->so_vnet); struct inpcb *inp; int error; @@ -610,17 +648,17 @@ error = soreserve(so, rip_sendspace, rip_recvspace); if (error) return error; - INP_INFO_WLOCK(&ripcbinfo); - error = in_pcballoc(so, &ripcbinfo); + INP_INFO_WLOCK(&V_ripcbinfo); + error = in_pcballoc(so, &V_ripcbinfo); if (error) { - INP_INFO_WUNLOCK(&ripcbinfo); + INP_INFO_WUNLOCK(&V_ripcbinfo); return error; } inp = (struct inpcb *)so->so_pcb; - INP_INFO_WUNLOCK(&ripcbinfo); + INP_INFO_WUNLOCK(&V_ripcbinfo); inp->inp_vflag |= INP_IPV4; inp->inp_ip_p = proto; - inp->inp_ip_ttl = ip_defttl; + inp->inp_ip_ttl = V_ip_defttl; INP_UNLOCK(inp); return 0; } @@ -628,6 +666,7 @@ static void rip_detach(struct socket *so) { + INIT_VNET_INET(so->so_vnet); struct inpcb *inp; inp = sotoinpcb(so); @@ -635,17 +674,17 @@ KASSERT(inp->inp_faddr.s_addr == INADDR_ANY, ("rip_detach: not closed")); - INP_INFO_WLOCK(&ripcbinfo); + INP_INFO_WLOCK(&V_ripcbinfo); INP_LOCK(inp); - if (so == ip_mrouter && ip_mrouter_done) + if (so == V_ip_mrouter && ip_mrouter_done) ip_mrouter_done(); if (ip_rsvp_force_done) ip_rsvp_force_done(so); - if (so == ip_rsvpd) + if (so == V_ip_rsvpd) ip_rsvp_done(); 
in_pcbdetach(inp); in_pcbfree(inp); - INP_INFO_WUNLOCK(&ripcbinfo); + INP_INFO_WUNLOCK(&V_ripcbinfo); } static void @@ -663,36 +702,39 @@ static void rip_abort(struct socket *so) { + INIT_VNET_INET(so->so_vnet); struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_abort: inp == NULL")); - INP_INFO_WLOCK(&ripcbinfo); + INP_INFO_WLOCK(&V_ripcbinfo); INP_LOCK(inp); rip_dodisconnect(so, inp); INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&ripcbinfo); + INP_INFO_WUNLOCK(&V_ripcbinfo); } static void rip_close(struct socket *so) { + INIT_VNET_INET(so->so_vnet); struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_close: inp == NULL")); - INP_INFO_WLOCK(&ripcbinfo); + INP_INFO_WLOCK(&V_ripcbinfo); INP_LOCK(inp); rip_dodisconnect(so, inp); INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&ripcbinfo); + INP_INFO_WUNLOCK(&V_ripcbinfo); } static int rip_disconnect(struct socket *so) { + INIT_VNET_INET(so->so_vnet); struct inpcb *inp; if ((so->so_state & SS_ISCONNECTED) == 0) @@ -700,17 +742,19 @@ inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_disconnect: inp == NULL")); - INP_INFO_WLOCK(&ripcbinfo); + INP_INFO_WLOCK(&V_ripcbinfo); INP_LOCK(inp); rip_dodisconnect(so, inp); INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&ripcbinfo); + INP_INFO_WUNLOCK(&V_ripcbinfo); return (0); } static int rip_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { + INIT_VNET_NET(so->so_vnet); + INIT_VNET_INET(so->so_vnet); struct sockaddr_in *addr = (struct sockaddr_in *)nam; struct inpcb *inp; @@ -725,7 +769,7 @@ return (EADDRNOTAVAIL); } - if (TAILQ_EMPTY(&ifnet) || + if (TAILQ_EMPTY(&V_ifnet) || (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) || (addr->sin_addr.s_addr && ifa_ifwithaddr((struct sockaddr *)addr) == 0)) @@ -733,35 +777,37 @@ inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_bind: inp == NULL")); - INP_INFO_WLOCK(&ripcbinfo); + INP_INFO_WLOCK(&V_ripcbinfo); INP_LOCK(inp); inp->inp_laddr = addr->sin_addr; INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&ripcbinfo); + 
INP_INFO_WUNLOCK(&V_ripcbinfo); return 0; } static int rip_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { + INIT_VNET_NET(so->so_vnet); + INIT_VNET_INET(so->so_vnet); struct sockaddr_in *addr = (struct sockaddr_in *)nam; struct inpcb *inp; if (nam->sa_len != sizeof(*addr)) return EINVAL; - if (TAILQ_EMPTY(&ifnet)) + if (TAILQ_EMPTY(&V_ifnet)) return EADDRNOTAVAIL; if (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) return EAFNOSUPPORT; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_connect: inp == NULL")); - INP_INFO_WLOCK(&ripcbinfo); + INP_INFO_WLOCK(&V_ripcbinfo); INP_LOCK(inp); inp->inp_faddr = addr->sin_addr; soisconnected(so); INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&ripcbinfo); + INP_INFO_WUNLOCK(&V_ripcbinfo); return 0; } @@ -809,6 +855,7 @@ static int rip_pcblist(SYSCTL_HANDLER_ARGS) { + INIT_VNET_INET(curvnet); int error, i, n; struct inpcb *inp, **inp_list; inp_gen_t gencnt; @@ -819,7 +866,7 @@ * resource-intensive to repeat twice on every request. */ if (req->oldptr == 0) { - n = ripcbinfo.ipi_count; + n = V_ripcbinfo.ipi_count; req->oldidx = 2 * (sizeof xig) + (n + n/8) * sizeof(struct xinpcb); return 0; @@ -831,10 +878,10 @@ /* * OK, now we're committed to doing something. 
*/ - INP_INFO_RLOCK(&ripcbinfo); - gencnt = ripcbinfo.ipi_gencnt; - n = ripcbinfo.ipi_count; - INP_INFO_RUNLOCK(&ripcbinfo); + INP_INFO_RLOCK(&V_ripcbinfo); + gencnt = V_ripcbinfo.ipi_gencnt; + n = V_ripcbinfo.ipi_count; + INP_INFO_RUNLOCK(&V_ripcbinfo); xig.xig_len = sizeof xig; xig.xig_count = n; @@ -848,8 +895,8 @@ if (inp_list == 0) return ENOMEM; - INP_INFO_RLOCK(&ripcbinfo); - for (inp = LIST_FIRST(ripcbinfo.ipi_listhead), i = 0; inp && i < n; + INP_INFO_RLOCK(&V_ripcbinfo); + for (inp = LIST_FIRST(V_ripcbinfo.ipi_listhead), i = 0; inp && i < n; inp = LIST_NEXT(inp, inp_list)) { INP_LOCK(inp); if (inp->inp_gencnt <= gencnt && @@ -859,7 +906,7 @@ } INP_UNLOCK(inp); } - INP_INFO_RUNLOCK(&ripcbinfo); + INP_INFO_RUNLOCK(&V_ripcbinfo); n = i; error = 0; @@ -887,11 +934,11 @@ * while we were processing this request, and it * might be necessary to retry. */ - INP_INFO_RLOCK(&ripcbinfo); - xig.xig_gen = ripcbinfo.ipi_gencnt; + INP_INFO_RLOCK(&V_ripcbinfo); + xig.xig_gen = V_ripcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; - xig.xig_count = ripcbinfo.ipi_count; - INP_INFO_RUNLOCK(&ripcbinfo); + xig.xig_count = V_ripcbinfo.ipi_count; + INP_INFO_RUNLOCK(&V_ripcbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); --- /u/marko/p4/head/src/sys/netinet/sctp_output.c 2008-02-27 18:29:12.000000000 +0100 +++ src/sys/netinet/sctp_output.c 2008-02-27 11:49:22.000000000 +0100 @@ -33,8 +33,12 @@ #include __FBSDID("$FreeBSD: src/sys/netinet/sctp_output.c,v 1.67 2008/02/22 15:06:25 rrs Exp $"); +#include "opt_vimage.h" + #include #include +#include +#include #include #include #include --- /u/marko/p4/head/src/sys/netinet/tcp_hostcache.c 2007-10-16 13:53:39.000000000 +0200 +++ src/sys/netinet/tcp_hostcache.c 2007-10-22 18:06:43.000000000 +0200 @@ -57,15 +57,11 @@ * of bucket limit memory constrains. */ -/* - * Many thanks to jlemon for basic structure of tcp_syncache which is being - * followed here. 
- */ - #include __FBSDID("$FreeBSD: src/sys/netinet/tcp_hostcache.c,v 1.17 2007/10/07 20:44:23 silby Exp $"); #include "opt_inet6.h" +#include "opt_vimage.h" #include #include @@ -76,9 +72,12 @@ #include #include #include +#include +#include #include +#include #include #include #include @@ -94,88 +93,56 @@ #ifdef INET6 #include #endif +#include #include -TAILQ_HEAD(hc_qhead, hc_metrics); - -struct hc_head { - struct hc_qhead hch_bucket; - u_int hch_length; - struct mtx hch_mtx; -}; - -struct hc_metrics { - /* housekeeping */ - TAILQ_ENTRY(hc_metrics) rmx_q; - struct hc_head *rmx_head; /* head of bucket tail queue */ - struct in_addr ip4; /* IP address */ - struct in6_addr ip6; /* IP6 address */ - /* endpoint specific values for TCP */ - u_long rmx_mtu; /* MTU for this path */ - u_long rmx_ssthresh; /* outbound gateway buffer limit */ - u_long rmx_rtt; /* estimated round trip time */ - u_long rmx_rttvar; /* estimated rtt variance */ - u_long rmx_bandwidth; /* estimated bandwidth */ - u_long rmx_cwnd; /* congestion window */ - u_long rmx_sendpipe; /* outbound delay-bandwidth product */ - u_long rmx_recvpipe; /* inbound delay-bandwidth product */ - /* TCP hostcache internal data */ - int rmx_expire; /* lifetime for object */ - u_long rmx_hits; /* number of hits */ - u_long rmx_updates; /* number of updates */ -}; - /* Arbitrary values */ #define TCP_HOSTCACHE_HASHSIZE 512 #define TCP_HOSTCACHE_BUCKETLIMIT 30 #define TCP_HOSTCACHE_EXPIRE 60*60 /* one hour */ #define TCP_HOSTCACHE_PRUNE 5*60 /* every 5 minutes */ -struct tcp_hostcache { - struct hc_head *hashbase; - uma_zone_t zone; - u_int hashsize; - u_int hashmask; - u_int bucket_limit; - u_int cache_count; - u_int cache_limit; - int expire; - int prune; - int purgeall; -}; +#ifndef VIMAGE static struct tcp_hostcache tcp_hostcache; - static struct callout tcp_hc_callout; +#endif static struct hc_metrics *tcp_hc_lookup(struct in_conninfo *); static struct hc_metrics *tcp_hc_insert(struct in_conninfo *); static int 
sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS); static void tcp_hc_purge(void *); -SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hostcache, CTLFLAG_RW, 0, "TCP Host cache"); +SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hostcache, CTLFLAG_RW, 0, + "TCP Host cache"); -SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, cachelimit, CTLFLAG_RDTUN, - &tcp_hostcache.cache_limit, 0, "Overall entry limit for hostcache"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_hostcache, OID_AUTO, cachelimit, + CTLFLAG_RDTUN, tcp_hostcache.cache_limit, 0, + "Overall entry limit for hostcache"); -SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, hashsize, CTLFLAG_RDTUN, - &tcp_hostcache.hashsize, 0, "Size of TCP hostcache hashtable"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_hostcache, OID_AUTO, hashsize, + CTLFLAG_RDTUN, tcp_hostcache.hashsize, 0, + "Size of TCP hostcache hashtable"); -SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, bucketlimit, CTLFLAG_RDTUN, - &tcp_hostcache.bucket_limit, 0, "Per-bucket hash limit for hostcache"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_hostcache, OID_AUTO, bucketlimit, + CTLFLAG_RDTUN, tcp_hostcache.bucket_limit, 0, + "Per-bucket hash limit for hostcache"); -SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, count, CTLFLAG_RD, - &tcp_hostcache.cache_count, 0, "Current number of entries in hostcache"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_hostcache, OID_AUTO, count, + CTLFLAG_RD, tcp_hostcache.cache_count, 0, + "Current number of entries in hostcache"); -SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, expire, CTLFLAG_RW, - &tcp_hostcache.expire, 0, "Expire time of TCP hostcache entries"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_hostcache, OID_AUTO, expire, + CTLFLAG_RW, tcp_hostcache.expire, 0, + "Expire time of TCP hostcache entries"); -SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, prune, CTLFLAG_RW, - &tcp_hostcache.prune, 0, "Time between purge runs"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_hostcache, OID_AUTO, prune, + CTLFLAG_RW, tcp_hostcache.prune, 
0, "Time between purge runs"); -SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, purge, CTLFLAG_RW, - &tcp_hostcache.purgeall, 0, "Expire all entires on next purge run"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_hostcache, OID_AUTO, purge, + CTLFLAG_RW, tcp_hostcache.purgeall, 0, + "Expire all entires on next purge run"); SYSCTL_PROC(_net_inet_tcp_hostcache, OID_AUTO, list, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_SKIP, 0, 0, @@ -186,7 +153,7 @@ #define HOSTCACHE_HASH(ip) \ (((ip)->s_addr ^ ((ip)->s_addr >> 7) ^ ((ip)->s_addr >> 17)) & \ - tcp_hostcache.hashmask) + V_tcp_hostcache.hashmask) /* XXX: What is the recommended hash to get good entropy for IPv6 addresses? */ #define HOSTCACHE_HASH6(ip6) \ @@ -194,7 +161,7 @@ (ip6)->s6_addr32[1] ^ \ (ip6)->s6_addr32[2] ^ \ (ip6)->s6_addr32[3]) & \ - tcp_hostcache.hashmask) + V_tcp_hostcache.hashmask) #define THC_LOCK(lp) mtx_lock(lp) #define THC_UNLOCK(lp) mtx_unlock(lp) @@ -202,60 +169,75 @@ void tcp_hc_init(void) { + INIT_VNET_INET(curvnet); int i; /* * Initialize hostcache structures. 
*/ - tcp_hostcache.cache_count = 0; - tcp_hostcache.hashsize = TCP_HOSTCACHE_HASHSIZE; - tcp_hostcache.bucket_limit = TCP_HOSTCACHE_BUCKETLIMIT; - tcp_hostcache.cache_limit = - tcp_hostcache.hashsize * tcp_hostcache.bucket_limit; - tcp_hostcache.expire = TCP_HOSTCACHE_EXPIRE; - tcp_hostcache.prune = TCP_HOSTCACHE_PRUNE; + V_tcp_hostcache.cache_count = 0; + V_tcp_hostcache.hashsize = TCP_HOSTCACHE_HASHSIZE; + V_tcp_hostcache.bucket_limit = TCP_HOSTCACHE_BUCKETLIMIT; + V_tcp_hostcache.cache_limit = + V_tcp_hostcache.hashsize * V_tcp_hostcache.bucket_limit; + V_tcp_hostcache.expire = TCP_HOSTCACHE_EXPIRE; + V_tcp_hostcache.prune = TCP_HOSTCACHE_PRUNE; TUNABLE_INT_FETCH("net.inet.tcp.hostcache.hashsize", - &tcp_hostcache.hashsize); + &V_tcp_hostcache.hashsize); TUNABLE_INT_FETCH("net.inet.tcp.hostcache.cachelimit", - &tcp_hostcache.cache_limit); + &V_tcp_hostcache.cache_limit); TUNABLE_INT_FETCH("net.inet.tcp.hostcache.bucketlimit", - &tcp_hostcache.bucket_limit); - if (!powerof2(tcp_hostcache.hashsize)) { + &V_tcp_hostcache.bucket_limit); + if (!powerof2(V_tcp_hostcache.hashsize)) { printf("WARNING: hostcache hash size is not a power of 2.\n"); - tcp_hostcache.hashsize = TCP_HOSTCACHE_HASHSIZE; /* default */ + V_tcp_hostcache.hashsize = TCP_HOSTCACHE_HASHSIZE; /* default */ } - tcp_hostcache.hashmask = tcp_hostcache.hashsize - 1; + V_tcp_hostcache.hashmask = V_tcp_hostcache.hashsize - 1; /* * Allocate the hash table. */ - tcp_hostcache.hashbase = (struct hc_head *) - malloc(tcp_hostcache.hashsize * sizeof(struct hc_head), + V_tcp_hostcache.hashbase = (struct hc_head *) + malloc(V_tcp_hostcache.hashsize * sizeof(struct hc_head), M_HOSTCACHE, M_WAITOK | M_ZERO); /* * Initialize the hash buckets. 
*/ - for (i = 0; i < tcp_hostcache.hashsize; i++) { - TAILQ_INIT(&tcp_hostcache.hashbase[i].hch_bucket); - tcp_hostcache.hashbase[i].hch_length = 0; - mtx_init(&tcp_hostcache.hashbase[i].hch_mtx, "tcp_hc_entry", + for (i = 0; i < V_tcp_hostcache.hashsize; i++) { + TAILQ_INIT(&V_tcp_hostcache.hashbase[i].hch_bucket); + V_tcp_hostcache.hashbase[i].hch_length = 0; + mtx_init(&V_tcp_hostcache.hashbase[i].hch_mtx, "tcp_hc_entry", NULL, MTX_DEF); } /* * Allocate the hostcache entries. + * + * XXX don't need a separate zone for each hc instance - revisit!!! */ - tcp_hostcache.zone = uma_zcreate("hostcache", sizeof(struct hc_metrics), - NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); - uma_zone_set_max(tcp_hostcache.zone, tcp_hostcache.cache_limit); + V_tcp_hostcache.zone = + uma_zcreate("hostcache", sizeof(struct hc_metrics), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); + uma_zone_set_max(V_tcp_hostcache.zone, V_tcp_hostcache.cache_limit); /* * Set up periodic cache cleanup. */ - callout_init(&tcp_hc_callout, CALLOUT_MPSAFE); - callout_reset(&tcp_hc_callout, tcp_hostcache.prune * hz, tcp_hc_purge, 0); + callout_init(&V_tcp_hc_callout, CALLOUT_MPSAFE); + callout_reset(&V_tcp_hc_callout, V_tcp_hostcache.prune * hz, + tcp_hc_purge, curvnet); +} + +void +tcp_hc_destroy(void) +{ + INIT_VNET_INET(curvnet); + + /* XXX TODO walk the hashtable and free all entries */ + + callout_drain(&V_tcp_hc_callout); } /* @@ -267,6 +249,7 @@ static struct hc_metrics * tcp_hc_lookup(struct in_conninfo *inc) { + INIT_VNET_INET(curvnet); int hash; struct hc_head *hc_head; struct hc_metrics *hc_entry; @@ -281,7 +264,7 @@ else hash = HOSTCACHE_HASH(&inc->inc_faddr); - hc_head = &tcp_hostcache.hashbase[hash]; + hc_head = &V_tcp_hostcache.hashbase[hash]; /* * Acquire lock for this bucket row; we release the lock if we don't @@ -322,6 +305,7 @@ static struct hc_metrics * tcp_hc_insert(struct in_conninfo *inc) { + INIT_VNET_INET(curvnet); int hash; struct hc_head *hc_head; struct hc_metrics *hc_entry; @@ 
-336,7 +320,7 @@ else hash = HOSTCACHE_HASH(&inc->inc_faddr); - hc_head = &tcp_hostcache.hashbase[hash]; + hc_head = &V_tcp_hostcache.hashbase[hash]; /* * Acquire lock for this bucket row; we release the lock if we don't @@ -348,8 +332,8 @@ /* * If the bucket limit is reached, reuse the least-used element. */ - if (hc_head->hch_length >= tcp_hostcache.bucket_limit || - tcp_hostcache.cache_count >= tcp_hostcache.cache_limit) { + if (hc_head->hch_length >= V_tcp_hostcache.bucket_limit || + V_tcp_hostcache.cache_count >= V_tcp_hostcache.cache_limit) { hc_entry = TAILQ_LAST(&hc_head->hch_bucket, hc_qhead); /* * At first we were dropping the last element, just to @@ -365,17 +349,17 @@ return NULL; } TAILQ_REMOVE(&hc_head->hch_bucket, hc_entry, rmx_q); - tcp_hostcache.hashbase[hash].hch_length--; - tcp_hostcache.cache_count--; - tcpstat.tcps_hc_bucketoverflow++; + V_tcp_hostcache.hashbase[hash].hch_length--; + V_tcp_hostcache.cache_count--; + V_tcpstat.tcps_hc_bucketoverflow++; #if 0 - uma_zfree(tcp_hostcache.zone, hc_entry); + uma_zfree(V_tcp_hostcache.zone, hc_entry); #endif } else { /* * Allocate a new entry, or balk if not possible. */ - hc_entry = uma_zalloc(tcp_hostcache.zone, M_NOWAIT); + hc_entry = uma_zalloc(V_tcp_hostcache.zone, M_NOWAIT); if (hc_entry == NULL) { THC_UNLOCK(&hc_head->hch_mtx); return NULL; @@ -391,15 +375,15 @@ else hc_entry->ip4 = inc->inc_faddr; hc_entry->rmx_head = hc_head; - hc_entry->rmx_expire = tcp_hostcache.expire; + hc_entry->rmx_expire = V_tcp_hostcache.expire; /* * Put it upfront. 
*/ TAILQ_INSERT_HEAD(&hc_head->hch_bucket, hc_entry, rmx_q); - tcp_hostcache.hashbase[hash].hch_length++; - tcp_hostcache.cache_count++; - tcpstat.tcps_hc_added++; + V_tcp_hostcache.hashbase[hash].hch_length++; + V_tcp_hostcache.cache_count++; + V_tcpstat.tcps_hc_added++; return hc_entry; } @@ -412,6 +396,7 @@ void tcp_hc_get(struct in_conninfo *inc, struct hc_metrics_lite *hc_metrics_lite) { + INIT_VNET_INET(curvnet); struct hc_metrics *hc_entry; /* @@ -427,7 +412,7 @@ return; } hc_entry->rmx_hits++; - hc_entry->rmx_expire = tcp_hostcache.expire; /* start over again */ + hc_entry->rmx_expire = V_tcp_hostcache.expire; /* start over again */ hc_metrics_lite->rmx_mtu = hc_entry->rmx_mtu; hc_metrics_lite->rmx_ssthresh = hc_entry->rmx_ssthresh; @@ -452,6 +437,7 @@ u_long tcp_hc_getmtu(struct in_conninfo *inc) { + INIT_VNET_INET(curvnet); struct hc_metrics *hc_entry; u_long mtu; @@ -460,7 +446,7 @@ return 0; } hc_entry->rmx_hits++; - hc_entry->rmx_expire = tcp_hostcache.expire; /* start over again */ + hc_entry->rmx_expire = V_tcp_hostcache.expire; /* start over again */ mtu = hc_entry->rmx_mtu; THC_UNLOCK(&hc_entry->rmx_head->hch_mtx); @@ -474,6 +460,7 @@ void tcp_hc_updatemtu(struct in_conninfo *inc, u_long mtu) { + INIT_VNET_INET(curvnet); struct hc_metrics *hc_entry; /* @@ -490,7 +477,7 @@ return; } hc_entry->rmx_updates++; - hc_entry->rmx_expire = tcp_hostcache.expire; /* start over again */ + hc_entry->rmx_expire = V_tcp_hostcache.expire; /* start over again */ hc_entry->rmx_mtu = mtu; @@ -513,6 +500,7 @@ void tcp_hc_update(struct in_conninfo *inc, struct hc_metrics_lite *hcml) { + INIT_VNET_INET(curvnet); struct hc_metrics *hc_entry; hc_entry = tcp_hc_lookup(inc); @@ -522,7 +510,7 @@ return; } hc_entry->rmx_updates++; - hc_entry->rmx_expire = tcp_hostcache.expire; /* start over again */ + hc_entry->rmx_expire = V_tcp_hostcache.expire; /* start over again */ if (hcml->rmx_rtt != 0) { if (hc_entry->rmx_rtt == 0) @@ -530,7 +518,7 @@ else hc_entry->rmx_rtt = 
(hc_entry->rmx_rtt + hcml->rmx_rtt) / 2; - tcpstat.tcps_cachedrtt++; + V_tcpstat.tcps_cachedrtt++; } if (hcml->rmx_rttvar != 0) { if (hc_entry->rmx_rttvar == 0) @@ -538,7 +526,7 @@ else hc_entry->rmx_rttvar = (hc_entry->rmx_rttvar + hcml->rmx_rttvar) / 2; - tcpstat.tcps_cachedrttvar++; + V_tcpstat.tcps_cachedrttvar++; } if (hcml->rmx_ssthresh != 0) { if (hc_entry->rmx_ssthresh == 0) @@ -546,7 +534,7 @@ else hc_entry->rmx_ssthresh = (hc_entry->rmx_ssthresh + hcml->rmx_ssthresh) / 2; - tcpstat.tcps_cachedssthresh++; + V_tcpstat.tcps_cachedssthresh++; } if (hcml->rmx_bandwidth != 0) { if (hc_entry->rmx_bandwidth == 0) @@ -554,7 +542,7 @@ else hc_entry->rmx_bandwidth = (hc_entry->rmx_bandwidth + hcml->rmx_bandwidth) / 2; - /* tcpstat.tcps_cachedbandwidth++; */ + /* V_tcpstat.tcps_cachedbandwidth++; */ } if (hcml->rmx_cwnd != 0) { if (hc_entry->rmx_cwnd == 0) @@ -562,7 +550,7 @@ else hc_entry->rmx_cwnd = (hc_entry->rmx_cwnd + hcml->rmx_cwnd) / 2; - /* tcpstat.tcps_cachedcwnd++; */ + /* V_tcpstat.tcps_cachedcwnd++; */ } if (hcml->rmx_sendpipe != 0) { if (hc_entry->rmx_sendpipe == 0) @@ -570,7 +558,7 @@ else hc_entry->rmx_sendpipe = (hc_entry->rmx_sendpipe + hcml->rmx_sendpipe) /2; - /* tcpstat.tcps_cachedsendpipe++; */ + /* V_tcpstat.tcps_cachedsendpipe++; */ } if (hcml->rmx_recvpipe != 0) { if (hc_entry->rmx_recvpipe == 0) @@ -578,7 +566,7 @@ else hc_entry->rmx_recvpipe = (hc_entry->rmx_recvpipe + hcml->rmx_recvpipe) /2; - /* tcpstat.tcps_cachedrecvpipe++; */ + /* V_tcpstat.tcps_cachedrecvpipe++; */ } TAILQ_REMOVE(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q); @@ -593,6 +581,7 @@ static int sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS) { + INIT_VNET_INET(curvnet); int bufsize; int linesize = 128; char *p, *buf; @@ -602,7 +591,7 @@ char ip6buf[INET6_ADDRSTRLEN]; #endif - bufsize = linesize * (tcp_hostcache.cache_count + 1); + bufsize = linesize * (V_tcp_hostcache.cache_count + 1); p = buf = (char *)malloc(bufsize, M_TEMP, M_WAITOK|M_ZERO); @@ -612,9 +601,9 @@ p += len; 
#define msec(u) (((u) + 500) / 1000) - for (i = 0; i < tcp_hostcache.hashsize; i++) { - THC_LOCK(&tcp_hostcache.hashbase[i].hch_mtx); - TAILQ_FOREACH(hc_entry, &tcp_hostcache.hashbase[i].hch_bucket, + for (i = 0; i < V_tcp_hostcache.hashsize; i++) { + THC_LOCK(&V_tcp_hostcache.hashbase[i].hch_mtx); + TAILQ_FOREACH(hc_entry, &V_tcp_hostcache.hashbase[i].hch_bucket, rmx_q) { len = snprintf(p, linesize, "%-15s %5lu %8lu %6lums %6lums %9lu %8lu %8lu %8lu " @@ -640,7 +629,7 @@ hc_entry->rmx_expire); p += len; } - THC_UNLOCK(&tcp_hostcache.hashbase[i].hch_mtx); + THC_UNLOCK(&V_tcp_hostcache.hashbase[i].hch_mtx); } #undef msec error = SYSCTL_OUT(req, buf, p - buf); @@ -655,29 +644,36 @@ static void tcp_hc_purge(void *arg) { + CURVNET_SET((struct vnet *) arg); + INIT_VNET_INET(curvnet); struct hc_metrics *hc_entry, *hc_next; - int all = (intptr_t)arg; + int all = 0; int i; - if (tcp_hostcache.purgeall) { + if (V_tcp_hostcache.purgeall) { all = 1; - tcp_hostcache.purgeall = 0; + V_tcp_hostcache.purgeall = 0; } - for (i = 0; i < tcp_hostcache.hashsize; i++) { - THC_LOCK(&tcp_hostcache.hashbase[i].hch_mtx); - TAILQ_FOREACH_SAFE(hc_entry, &tcp_hostcache.hashbase[i].hch_bucket, - rmx_q, hc_next) { + for (i = 0; i < V_tcp_hostcache.hashsize; i++) { + THC_LOCK(&V_tcp_hostcache.hashbase[i].hch_mtx); + TAILQ_FOREACH_SAFE(hc_entry, + &V_tcp_hostcache.hashbase[i].hch_bucket, + rmx_q, hc_next) { if (all || hc_entry->rmx_expire <= 0) { - TAILQ_REMOVE(&tcp_hostcache.hashbase[i].hch_bucket, + TAILQ_REMOVE(&V_tcp_hostcache.hashbase[i].hch_bucket, hc_entry, rmx_q); - uma_zfree(tcp_hostcache.zone, hc_entry); - tcp_hostcache.hashbase[i].hch_length--; - tcp_hostcache.cache_count--; + uma_zfree(V_tcp_hostcache.zone, hc_entry); + V_tcp_hostcache.hashbase[i].hch_length--; + V_tcp_hostcache.cache_count--; } else - hc_entry->rmx_expire -= tcp_hostcache.prune; + hc_entry->rmx_expire -= V_tcp_hostcache.prune; } - THC_UNLOCK(&tcp_hostcache.hashbase[i].hch_mtx); + 
THC_UNLOCK(&V_tcp_hostcache.hashbase[i].hch_mtx); } - callout_reset(&tcp_hc_callout, tcp_hostcache.prune * hz, tcp_hc_purge, 0); + + callout_reset(&V_tcp_hc_callout, V_tcp_hostcache.prune * hz, + tcp_hc_purge, arg); + + CURVNET_RESTORE(); } --- /dev/null 2008-02-27 21:11:00.000000000 +0100 +++ src/sys/netinet/tcp_hostcache.h 2007-10-05 12:27:16.000000000 +0200 @@ -0,0 +1,82 @@ +/*- + * Copyright (c) 2002 Andre Oppermann, Internet Business Solutions AG + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * XXX RCS ID + */ + +/* + * Many thanks to jlemon for basic structure of tcp_syncache which is being + * followed here. + */ + +#ifndef _NETINET_TCP_HOSTCACHE_H_ +#define _NETINET_TCP_HOSTCACHE_H_ + +TAILQ_HEAD(hc_qhead, hc_metrics); + +struct hc_head { + struct hc_qhead hch_bucket; + u_int hch_length; + struct mtx hch_mtx; +}; + +struct hc_metrics { + /* housekeeping */ + TAILQ_ENTRY(hc_metrics) rmx_q; + struct hc_head *rmx_head; /* head of bucket tail queue */ + struct in_addr ip4; /* IP address */ + struct in6_addr ip6; /* IP6 address */ + /* endpoint specific values for tcp */ + u_long rmx_mtu; /* MTU for this path */ + u_long rmx_ssthresh; /* outbound gateway buffer limit */ + u_long rmx_rtt; /* estimated round trip time */ + u_long rmx_rttvar; /* estimated rtt variance */ + u_long rmx_bandwidth; /* estimated bandwidth */ + u_long rmx_cwnd; /* congestion window */ + u_long rmx_sendpipe; /* outbound delay-bandwidth product */ + u_long rmx_recvpipe; /* inbound delay-bandwidth product */ + /* TCP hostcache internal data */ + int rmx_expire; /* lifetime for object */ + u_long rmx_hits; /* number of hits */ + u_long rmx_updates; /* number of updates */ +}; + +struct tcp_hostcache { + struct hc_head *hashbase; + uma_zone_t zone; + u_int hashsize; + u_int hashmask; + u_int bucket_limit; + u_int cache_count; + u_int cache_limit; + int expire; + int prune; + int purgeall; +}; + +#endif /* !_NETINET_TCP_HOSTCACHE_H_*/ --- /u/marko/p4/head/src/sys/netinet/tcp_input.c 2007-10-29 17:17:43.000000000 +0100 +++ src/sys/netinet/tcp_input.c 2007-12-10 11:26:13.000000000 +0100 @@ -38,6 +38,7 @@ #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_tcpdebug.h" +#include "opt_vimage.h" #include #include @@ -51,16 +52,19 @@ #include #include #include +#include #include /* before tcp_seq.h, for tcp_random18() */ #include +#include #include #include #define TCPSTATES /* for logging */ +#include #include #include #include @@ -72,6 +76,7 @@ #include #include #include 
+#include #include #include #include @@ -90,6 +95,7 @@ #ifdef IPSEC #include #include +#include #endif /*IPSEC*/ #include @@ -98,57 +104,62 @@ static const int tcprexmtthresh = 3; -struct tcpstat tcpstat; -SYSCTL_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RW, - &tcpstat , tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)"); +#ifndef VIMAGE +struct inpcbhead tcb; +struct inpcbinfo tcbinfo; +struct tcpstat tcpstat; +int blackhole; +int tcp_delack_enabled; +int drop_synfin; +int tcp_do_rfc3042; +int tcp_do_rfc3390; +int tcp_insecure_rst; +int tcp_do_autorcvbuf; +int tcp_autorcvbuf_inc; +int tcp_autorcvbuf_max; +#endif + +SYSCTL_V_STRUCT(V_NET, vnet_inet, _net_inet_tcp, TCPCTL_STATS, stats, + CTLFLAG_RW, tcpstat , tcpstat, + "TCP statistics (struct tcpstat, netinet/tcp_var.h)"); int tcp_log_in_vain = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW, &tcp_log_in_vain, 0, "Log all incoming TCP segments to closed ports"); -static int blackhole = 0; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW, - &blackhole, 0, "Do not send RST on segments to closed ports"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW, + blackhole, 0, "Do not send RST on segments to closed ports"); -int tcp_delack_enabled = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW, - &tcp_delack_enabled, 0, +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, delayed_ack, + CTLFLAG_RW, tcp_delack_enabled, 0, "Delay ACK to try and piggyback it onto a data packet"); -static int drop_synfin = 0; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW, - &drop_synfin, 0, "Drop TCP packets with SYN+FIN set"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, drop_synfin, + CTLFLAG_RW, drop_synfin, 0, "Drop TCP packets with SYN+FIN set"); -static int tcp_do_rfc3042 = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_RW, - &tcp_do_rfc3042, 0, "Enable RFC 3042 (Limited Transmit)"); +SYSCTL_V_INT(V_NET, 
vnet_inet, _net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_RW, + tcp_do_rfc3042, 0, "Enable RFC 3042 (Limited Transmit)"); -static int tcp_do_rfc3390 = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW, - &tcp_do_rfc3390, 0, +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW, + tcp_do_rfc3390, 0, "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)"); -static int tcp_insecure_rst = 0; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_RW, - &tcp_insecure_rst, 0, +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, insecure_rst, + CTLFLAG_RW, tcp_insecure_rst, 0, "Follow the old (insecure) criteria for accepting RST packets"); -int tcp_do_autorcvbuf = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_RW, - &tcp_do_autorcvbuf, 0, "Enable automatic receive buffer sizing"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, recvbuf_auto, + CTLFLAG_RW, tcp_do_autorcvbuf, 0, + "Enable automatic receive buffer sizing"); -int tcp_autorcvbuf_inc = 16*1024; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_RW, - &tcp_autorcvbuf_inc, 0, +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, recvbuf_inc, + CTLFLAG_RW, tcp_autorcvbuf_inc, 0, "Incrementor step size of automatic receive buffer"); -int tcp_autorcvbuf_max = 256*1024; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW, - &tcp_autorcvbuf_max, 0, "Max size of automatic receive buffer"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, recvbuf_max, + CTLFLAG_RW, tcp_autorcvbuf_max, 0, + "Max size of automatic receive buffer"); -struct inpcbhead tcb; -#define tcb6 tcb /* for KAME src sync over BSD*'s */ -struct inpcbinfo tcbinfo; static void tcp_dooptions(struct tcpopt *, u_char *, int, int); static void tcp_do_segment(struct mbuf *, struct tcphdr *, @@ -183,8 +194,7 @@ #define DELAY_ACK(tp) \ ((!tcp_timer_active(tp, TT_DELACK) && \ (tp->t_flags & TF_RXWIN0SENT) == 0) && \ - (tcp_delack_enabled || (tp->t_flags & 
TF_NEEDSYN))) - + (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN))) /* * TCP input handling is split into multiple parts: @@ -199,6 +209,7 @@ int tcp6_input(struct mbuf **mp, int *offp, int proto) { + INIT_VNET_INET6(curvnet); struct mbuf *m = *mp; struct in6_ifaddr *ia6; @@ -226,6 +237,13 @@ void tcp_input(struct mbuf *m, int off0) { + INIT_VNET_INET(curvnet); +#ifdef INET6 + INIT_VNET_INET6(curvnet); +#endif +#ifdef IPSEC + INIT_VNET_IPSEC(curvnet); +#endif struct tcphdr *th; struct ip *ip = NULL; struct ipovly *ipov; @@ -266,7 +284,7 @@ #endif to.to_flags = 0; - tcpstat.tcps_rcvtotal++; + V_tcpstat.tcps_rcvtotal++; if (isipv6) { #ifdef INET6 @@ -274,7 +292,7 @@ ip6 = mtod(m, struct ip6_hdr *); tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0; if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) { - tcpstat.tcps_rcvbadsum++; + V_tcpstat.tcps_rcvbadsum++; goto drop; } th = (struct tcphdr *)((caddr_t)ip6 + off0); @@ -306,7 +324,7 @@ if (m->m_len < sizeof (struct tcpiphdr)) { if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == NULL) { - tcpstat.tcps_rcvshort++; + V_tcpstat.tcps_rcvshort++; return; } } @@ -340,7 +358,7 @@ th->th_sum = in_cksum(m, len); } if (th->th_sum) { - tcpstat.tcps_rcvbadsum++; + V_tcpstat.tcps_rcvbadsum++; goto drop; } /* Re-initialization for later version check */ @@ -353,7 +371,7 @@ */ off = th->th_off << 2; if (off < sizeof (struct tcphdr) || off > tlen) { - tcpstat.tcps_rcvbadoff++; + V_tcpstat.tcps_rcvbadoff++; goto drop; } tlen -= off; /* tlen is used instead of ti->ti_len */ @@ -368,7 +386,7 @@ if (m->m_len < sizeof(struct ip) + off) { if ((m = m_pullup(m, sizeof (struct ip) + off)) == NULL) { - tcpstat.tcps_rcvshort++; + V_tcpstat.tcps_rcvshort++; return; } ip = mtod(m, struct ip *); @@ -397,9 +415,9 @@ /* * Locate pcb for segment. 
*/ - INP_INFO_WLOCK(&tcbinfo); + INP_INFO_WLOCK(&V_tcbinfo); findpcb: - INP_INFO_WLOCK_ASSERT(&tcbinfo); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); #ifdef IPFIREWALL_FORWARD /* * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. @@ -414,13 +432,13 @@ * Transparently forwarded. Pretend to be the destination. * already got one like this? */ - inp = in_pcblookup_hash(&tcbinfo, + inp = in_pcblookup_hash(&V_tcbinfo, ip->ip_src, th->th_sport, ip->ip_dst, th->th_dport, 0, m->m_pkthdr.rcvif); if (!inp) { /* It's new. Try to find the ambushing socket. */ - inp = in_pcblookup_hash(&tcbinfo, + inp = in_pcblookup_hash(&V_tcbinfo, ip->ip_src, th->th_sport, next_hop->sin_addr, next_hop->sin_port ? @@ -436,14 +454,14 @@ { if (isipv6) { #ifdef INET6 - inp = in6_pcblookup_hash(&tcbinfo, + inp = in6_pcblookup_hash(&V_tcbinfo, &ip6->ip6_src, th->th_sport, &ip6->ip6_dst, th->th_dport, INPLOOKUP_WILDCARD, m->m_pkthdr.rcvif); #endif } else - inp = in_pcblookup_hash(&tcbinfo, + inp = in_pcblookup_hash(&V_tcbinfo, ip->ip_src, th->th_sport, ip->ip_dst, th->th_dport, INPLOOKUP_WILDCARD, @@ -469,8 +487,8 @@ * When blackholing do not respond with a RST but * completely ignore the segment and drop it. 
*/ - if ((blackhole == 1 && (thflags & TH_SYN)) || - blackhole == 2) + if ((V_blackhole == 1 && (thflags & TH_SYN)) || + V_blackhole == 2) goto dropunlock; rstreason = BANDLIM_RST_CLOSEDPORT; @@ -481,12 +499,12 @@ #ifdef IPSEC #ifdef INET6 if (isipv6 && ipsec6_in_reject(m, inp)) { - ipsec6stat.in_polvio++; + V_ipsec6stat.in_polvio++; goto dropunlock; } else #endif /* INET6 */ if (ipsec4_in_reject(m, inp) != 0) { - ipsec4stat.in_polvio++; + V_ipsec4stat.in_polvio++; goto dropunlock; } #endif /* IPSEC */ @@ -518,7 +536,7 @@ */ if (tcp_twcheck(inp, &to, th, m, tlen)) goto findpcb; - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); return; } /* @@ -618,9 +636,10 @@ log(LOG_DEBUG, "%s; %s: Listen socket: " "Socket allocation failed due to " "limits or memory shortage, %s\n", - s, __func__, (tcp_sc_rst_sock_fail ? - "sending RST" : "try again")); - if (tcp_sc_rst_sock_fail) { + s, __func__, + V_tcp_sc_rst_sock_fail ? + "sending RST" : "try again"); + if (V_tcp_sc_rst_sock_fail) { rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } else @@ -643,7 +662,7 @@ * the mbuf chain and unlocks the inpcb. */ tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen); - INP_INFO_UNLOCK_ASSERT(&tcbinfo); + INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); return; } /* @@ -668,7 +687,7 @@ log(LOG_DEBUG, "%s; %s: Listen socket: " "SYN is missing, segment ignored\n", s, __func__); - tcpstat.tcps_badsyn++; + V_tcpstat.tcps_badsyn++; goto dropunlock; } /* @@ -680,7 +699,7 @@ "SYN|ACK invalid, segment rejected\n", s, __func__); syncache_badack(&inc); /* XXX: Not needed! */ - tcpstat.tcps_badsyn++; + V_tcpstat.tcps_badsyn++; rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } @@ -695,12 +714,12 @@ * XXX: This is a violation of the TCP specification * and was used by RFC1644. 
*/ - if ((thflags & TH_FIN) && drop_synfin) { + if ((thflags & TH_FIN) && V_drop_synfin) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "SYN|FIN segment ignored (based on " "sysctl setting)\n", s, __func__); - tcpstat.tcps_badsyn++; + V_tcpstat.tcps_badsyn++; goto dropunlock; } /* @@ -745,7 +764,7 @@ * handling - worse, they are not exactly the same. * I believe 5.5.4 is the best one, so we follow 5.5.4. */ - if (isipv6 && !ip6_use_deprecated) { + if (isipv6 && !V_ip6_use_deprecated) { struct in6_ifaddr *ia6; if ((ia6 = ip6_getdstifaddr(m)) && @@ -833,7 +852,7 @@ * Entry added to syncache and mbuf consumed. * Everything already unlocked by syncache_add(). */ - INP_INFO_UNLOCK_ASSERT(&tcbinfo); + INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); return; } @@ -843,20 +862,20 @@ * the inpcb, and unlocks pcbinfo. */ tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen); - INP_INFO_UNLOCK_ASSERT(&tcbinfo); + INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); return; dropwithreset: - INP_INFO_WLOCK_ASSERT(&tcbinfo); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); tcp_dropwithreset(m, th, tp, tlen, rstreason); m = NULL; /* mbuf chain got consumed. 
*/ dropunlock: - INP_INFO_WLOCK_ASSERT(&tcbinfo); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); if (inp != NULL) INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); drop: - INP_INFO_UNLOCK_ASSERT(&tcbinfo); + INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); if (s != NULL) free(s, M_TCPLOG); if (m != NULL) @@ -868,6 +887,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int drop_hdrlen, int tlen) { + INIT_VNET_INET(tp->t_vnet); int thflags, acked, ourfinisacked, needoutput = 0; int headlocked = 1; int rstreason, todrop, win; @@ -885,7 +905,7 @@ #endif thflags = th->th_flags; - INP_INFO_WLOCK_ASSERT(&tcbinfo); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_LOCK_ASSERT(tp->t_inpcb); KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", __func__)); @@ -1000,28 +1020,28 @@ if (SEQ_GT(th->th_ack, tp->snd_una) && SEQ_LEQ(th->th_ack, tp->snd_max) && tp->snd_cwnd >= tp->snd_wnd && - ((!tcp_do_newreno && + ((!V_tcp_do_newreno && !(tp->t_flags & TF_SACK_PERMIT) && tp->t_dupacks < tcprexmtthresh) || - ((tcp_do_newreno || + ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) && !IN_FASTRECOVERY(tp) && (to.to_flags & TOF_SACK) == 0 && TAILQ_EMPTY(&tp->snd_holes)))) { KASSERT(headlocked, ("%s: headlocked", __func__)); - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); headlocked = 0; /* * This is a pure ack for outstanding data. */ - ++tcpstat.tcps_predack; + ++V_tcpstat.tcps_predack; /* * "bad retransmit" recovery. 
*/ if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) { - ++tcpstat.tcps_sndrexmitbad; + ++V_tcpstat.tcps_sndrexmitbad; tp->snd_cwnd = tp->snd_cwnd_prev; tp->snd_ssthresh = tp->snd_ssthresh_prev; @@ -1057,8 +1077,8 @@ } tcp_xmit_bandwidth_limit(tp, th->th_ack); acked = th->th_ack - tp->snd_una; - tcpstat.tcps_rcvackpack++; - tcpstat.tcps_rcvackbyte += acked; + V_tcpstat.tcps_rcvackpack++; + V_tcpstat.tcps_rcvackbyte += acked; sbdrop(&so->so_snd, acked); if (SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) @@ -1103,7 +1123,7 @@ int newsize = 0; /* automatic sockbuf scaling */ KASSERT(headlocked, ("%s: headlocked", __func__)); - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); headlocked = 0; /* * This is a pure, in-sequence data packet @@ -1113,7 +1133,7 @@ /* Clean receiver SACK report if present */ if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks) tcp_clean_sackreport(tp); - ++tcpstat.tcps_preddat; + ++V_tcpstat.tcps_preddat; tp->rcv_nxt += tlen; /* * Pull snd_wl1 up to prevent seq wrap relative to @@ -1125,8 +1145,8 @@ * rcv_nxt. */ tp->rcv_up = tp->rcv_nxt; - tcpstat.tcps_rcvpack++; - tcpstat.tcps_rcvbyte += tlen; + V_tcpstat.tcps_rcvpack++; + V_tcpstat.tcps_rcvbyte += tlen; ND6_HINT(tp); /* Some progress has been made */ #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) @@ -1166,7 +1186,7 @@ * TODO: Only step up if the application is actually serving * the buffer to better manage the socket buffer resources. */ - if (tcp_do_autorcvbuf && + if (V_tcp_do_autorcvbuf && to.to_tsecr && (so->so_rcv.sb_flags & SB_AUTOSIZE)) { if (to.to_tsecr > tp->rfbuf_ts && @@ -1174,11 +1194,11 @@ if (tp->rfbuf_cnt > (so->so_rcv.sb_hiwat / 8 * 7) && so->so_rcv.sb_hiwat < - tcp_autorcvbuf_max) { + V_tcp_autorcvbuf_max) { newsize = min(so->so_rcv.sb_hiwat + - tcp_autorcvbuf_inc, - tcp_autorcvbuf_max); + V_tcp_autorcvbuf_inc, + V_tcp_autorcvbuf_max); } /* Start over with next RTT. 
*/ tp->rfbuf_ts = 0; @@ -1274,7 +1294,7 @@ tp->irs = th->th_seq; tcp_rcvseqinit(tp); if (thflags & TH_ACK) { - tcpstat.tcps_connects++; + V_tcpstat.tcps_connects++; soisconnected(so); #ifdef MAC SOCK_LOCK(so); @@ -1343,8 +1363,8 @@ m_adj(m, -todrop); tlen = tp->rcv_wnd; thflags &= ~TH_FIN; - tcpstat.tcps_rcvpackafterwin++; - tcpstat.tcps_rcvbyteafterwin += todrop; + V_tcpstat.tcps_rcvpackafterwin++; + V_tcpstat.tcps_rcvbyteafterwin += todrop; } tp->snd_wl1 = th->th_seq - 1; tp->rcv_up = th->th_seq; @@ -1443,12 +1463,12 @@ goto close; case TCPS_ESTABLISHED: - if (tcp_insecure_rst == 0 && + if (V_tcp_insecure_rst == 0 && !(SEQ_GEQ(th->th_seq, tp->rcv_nxt - 1) && SEQ_LEQ(th->th_seq, tp->rcv_nxt + 1)) && !(SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) && SEQ_LEQ(th->th_seq, tp->last_ack_sent + 1))) { - tcpstat.tcps_badrst++; + V_tcpstat.tcps_badrst++; goto drop; } /* FALLTHROUGH */ @@ -1458,7 +1478,7 @@ so->so_error = ECONNRESET; close: tp->t_state = TCPS_CLOSED; - tcpstat.tcps_drops++; + V_tcpstat.tcps_drops++; KASSERT(headlocked, ("%s: trimthenstep6: " "tcp_close: head not locked", __func__)); tp = tcp_close(tp); @@ -1497,9 +1517,9 @@ */ tp->ts_recent = 0; } else { - tcpstat.tcps_rcvduppack++; - tcpstat.tcps_rcvdupbyte += tlen; - tcpstat.tcps_pawsdrop++; + V_tcpstat.tcps_rcvduppack++; + V_tcpstat.tcps_rcvdupbyte += tlen; + V_tcpstat.tcps_pawsdrop++; if (tlen) goto dropafterack; goto drop; @@ -1547,11 +1567,11 @@ */ tp->t_flags |= TF_ACKNOW; todrop = tlen; - tcpstat.tcps_rcvduppack++; - tcpstat.tcps_rcvdupbyte += todrop; + V_tcpstat.tcps_rcvduppack++; + V_tcpstat.tcps_rcvdupbyte += todrop; } else { - tcpstat.tcps_rcvpartduppack++; - tcpstat.tcps_rcvpartdupbyte += todrop; + V_tcpstat.tcps_rcvpartduppack++; + V_tcpstat.tcps_rcvpartdupbyte += todrop; } drop_hdrlen += todrop; /* drop from the top afterwards */ th->th_seq += todrop; @@ -1581,7 +1601,7 @@ free(s, M_TCPLOG); } tp = tcp_close(tp); - tcpstat.tcps_rcvafterclose++; + V_tcpstat.tcps_rcvafterclose++; rstreason = 
BANDLIM_UNLIMITED; goto dropwithreset; } @@ -1592,9 +1612,9 @@ */ todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); if (todrop > 0) { - tcpstat.tcps_rcvpackafterwin++; + V_tcpstat.tcps_rcvpackafterwin++; if (todrop >= tlen) { - tcpstat.tcps_rcvbyteafterwin += tlen; + V_tcpstat.tcps_rcvbyteafterwin += tlen; /* * If window is closed can only take segments at * window edge, and have to drop data and PUSH from @@ -1604,11 +1624,11 @@ */ if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { tp->t_flags |= TF_ACKNOW; - tcpstat.tcps_rcvwinprobe++; + V_tcpstat.tcps_rcvwinprobe++; } else goto dropafterack; } else - tcpstat.tcps_rcvbyteafterwin += todrop; + V_tcpstat.tcps_rcvbyteafterwin += todrop; m_adj(m, -todrop); tlen -= todrop; thflags &= ~(TH_PUSH|TH_FIN); @@ -1679,7 +1699,7 @@ */ case TCPS_SYN_RECEIVED: - tcpstat.tcps_connects++; + V_tcpstat.tcps_connects++; soisconnected(so); /* Do window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == @@ -1725,7 +1745,7 @@ case TCPS_CLOSING: case TCPS_LAST_ACK: if (SEQ_GT(th->th_ack, tp->snd_max)) { - tcpstat.tcps_rcvacktoomuch++; + V_tcpstat.tcps_rcvacktoomuch++; goto dropafterack; } if ((tp->t_flags & TF_SACK_PERMIT) && @@ -1734,7 +1754,7 @@ tcp_sack_doack(tp, &to, th->th_ack); if (SEQ_LEQ(th->th_ack, tp->snd_una)) { if (tlen == 0 && tiwin == tp->snd_wnd) { - tcpstat.tcps_rcvdupack++; + V_tcpstat.tcps_rcvdupack++; /* * If we have outstanding data (other than * a window probe), this is a completely @@ -1763,7 +1783,7 @@ th->th_ack != tp->snd_una) tp->t_dupacks = 0; else if (++tp->t_dupacks > tcprexmtthresh || - ((tcp_do_newreno || + ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) && IN_FASTRECOVERY(tp))) { if ((tp->t_flags & TF_SACK_PERMIT) && @@ -1803,7 +1823,7 @@ tp->t_dupacks = 0; break; } - } else if (tcp_do_newreno) { + } else if (V_tcp_do_newreno) { if (SEQ_LEQ(th->th_ack, tp->snd_recover)) { tp->t_dupacks = 0; @@ -1820,7 +1840,7 @@ tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; if 
(tp->t_flags & TF_SACK_PERMIT) { - tcpstat.tcps_sack_recovery_episode++; + V_tcpstat.tcps_sack_recovery_episode++; tp->sack_newdata = tp->snd_nxt; tp->snd_cwnd = tp->t_maxseg; (void) tcp_output(tp); @@ -1838,7 +1858,7 @@ if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; goto drop; - } else if (tcp_do_rfc3042) { + } else if (V_tcp_do_rfc3042) { u_long oldcwnd = tp->snd_cwnd; tcp_seq oldsndmax = tp->snd_max; u_int sent; @@ -1880,7 +1900,7 @@ * If the congestion window was inflated to account * for the other side's cached packets, retract it. */ - if (tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) { + if (V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) { if (IN_FASTRECOVERY(tp)) { if (SEQ_LT(th->th_ack, tp->snd_recover)) { if (tp->t_flags & TF_SACK_PERMIT) @@ -1941,8 +1961,8 @@ INP_LOCK_ASSERT(tp->t_inpcb); acked = th->th_ack - tp->snd_una; - tcpstat.tcps_rcvackpack++; - tcpstat.tcps_rcvackbyte += acked; + V_tcpstat.tcps_rcvackpack++; + V_tcpstat.tcps_rcvackbyte += acked; /* * If we just performed our first retransmit, and the ACK @@ -1952,7 +1972,7 @@ * we left off. */ if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) { - ++tcpstat.tcps_sndrexmitbad; + ++V_tcpstat.tcps_sndrexmitbad; tp->snd_cwnd = tp->snd_cwnd_prev; tp->snd_ssthresh = tp->snd_ssthresh_prev; tp->snd_recover = tp->snd_recover_prev; @@ -2014,7 +2034,7 @@ * Otherwise open linearly: maxseg per window * (maxseg^2 / cwnd per packet). */ - if ((!tcp_do_newreno && !(tp->t_flags & TF_SACK_PERMIT)) || + if ((!V_tcp_do_newreno && !(tp->t_flags & TF_SACK_PERMIT)) || !IN_FASTRECOVERY(tp)) { u_int cw = tp->snd_cwnd; u_int incr = tp->t_maxseg; @@ -2035,12 +2055,12 @@ /* NB: sowwakeup_locked() does an implicit unlock. */ sowwakeup_locked(so); /* Detect una wraparound. 
*/ - if ((tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) && + if ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) && !IN_FASTRECOVERY(tp) && SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; - if ((tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) && + if ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) && IN_FASTRECOVERY(tp) && SEQ_GEQ(th->th_ack, tp->snd_recover)) EXIT_FASTRECOVERY(tp); @@ -2095,7 +2115,7 @@ KASSERT(headlocked, ("%s: process_ACK: " "head not locked", __func__)); tcp_twstart(tp); - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); headlocked = 0; m_freem(m); return; @@ -2134,7 +2154,7 @@ /* keep track of pure window updates */ if (tlen == 0 && tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) - tcpstat.tcps_rcvwinupd++; + V_tcpstat.tcps_rcvwinupd++; tp->snd_wnd = tiwin; tp->snd_wl1 = th->th_seq; tp->snd_wl2 = th->th_ack; @@ -2242,8 +2262,8 @@ tp->t_flags |= TF_ACKNOW; tp->rcv_nxt += tlen; thflags = th->th_flags & TH_FIN; - tcpstat.tcps_rcvpack++; - tcpstat.tcps_rcvbyte += tlen; + V_tcpstat.tcps_rcvpack++; + V_tcpstat.tcps_rcvbyte += tlen; ND6_HINT(tp); SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) @@ -2328,11 +2348,11 @@ KASSERT(headlocked == 1, ("%s: dodata: " "TCP_FIN_WAIT_2: head not locked", __func__)); tcp_twstart(tp); - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); return; } } - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); headlocked = 0; #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) @@ -2349,7 +2369,7 @@ check_delack: KASSERT(headlocked == 0, ("%s: check_delack: head locked", __func__)); - INP_INFO_UNLOCK_ASSERT(&tcbinfo); + INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); INP_LOCK_ASSERT(tp->t_inpcb); if (tp->t_flags & TF_DELACK) { tp->t_flags &= ~TF_DELACK; @@ -2387,7 +2407,7 @@ &tcp_savetcp, 0); #endif KASSERT(headlocked, ("%s: headlocked should be 1", __func__)); - INP_INFO_WUNLOCK(&tcbinfo); + 
INP_INFO_WUNLOCK(&V_tcbinfo); tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); INP_UNLOCK(tp->t_inpcb); @@ -2402,7 +2422,7 @@ if (tp != NULL) INP_UNLOCK(tp->t_inpcb); if (headlocked) - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); return; drop: @@ -2417,7 +2437,7 @@ if (tp != NULL) INP_UNLOCK(tp->t_inpcb); if (headlocked) - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); m_freem(m); return; } @@ -2482,6 +2502,7 @@ static void tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags) { + INIT_VNET_INET(curvnet); int opt, optlen; to->to_flags = 0; @@ -2547,7 +2568,7 @@ continue; if (!(flags & TO_SYN)) continue; - if (!tcp_do_sack) + if (!V_tcp_do_sack) continue; to->to_flags |= TOF_SACKPERM; break; @@ -2559,7 +2580,7 @@ to->to_flags |= TOF_SACK; to->to_nsacks = (optlen - 2) / TCPOLEN_SACK; to->to_sacks = cp + 2; - tcpstat.tcps_sack_rcv_blocks++; + V_tcpstat.tcps_sack_rcv_blocks++; break; default: continue; @@ -2607,11 +2628,12 @@ static void tcp_xmit_timer(struct tcpcb *tp, int rtt) { + INIT_VNET_INET(tp->t_inpcb->inp_vnet); int delta; INP_LOCK_ASSERT(tp->t_inpcb); - tcpstat.tcps_rttupdated++; + V_tcpstat.tcps_rttupdated++; tp->t_rttupdated++; if (tp->t_srtt != 0) { /* @@ -2712,6 +2734,7 @@ void tcp_mss(struct tcpcb *tp, int offer) { + INIT_VNET_INET(tp->t_inpcb->inp_vnet); int rtt, mss; u_long bufsize; u_long maxmtu; @@ -2733,12 +2756,12 @@ #ifdef INET6 if (isipv6) { maxmtu = tcp_maxmtu6(&inp->inp_inc, &mtuflags); - tp->t_maxopd = tp->t_maxseg = tcp_v6mssdflt; + tp->t_maxopd = tp->t_maxseg = V_tcp_v6mssdflt; } else #endif { maxmtu = tcp_maxmtu(&inp->inp_inc, &mtuflags); - tp->t_maxopd = tp->t_maxseg = tcp_mssdflt; + tp->t_maxopd = tp->t_maxseg = V_tcp_mssdflt; } so = inp->inp_socket; @@ -2757,9 +2780,9 @@ */ offer = #ifdef INET6 - isipv6 ? tcp_v6mssdflt : + isipv6 ? V_tcp_v6mssdflt : #endif - tcp_mssdflt; + V_tcp_mssdflt; break; case -1: @@ -2773,7 +2796,7 @@ * Prevent DoS attack with too small MSS. Round up * to at least minmss. 
*/ - offer = max(offer, tcp_minmss); + offer = max(offer, V_tcp_minmss); /* * Sanity check: make sure that maxopd will be large * enough to allow some data on segments even if the @@ -2798,16 +2821,16 @@ #ifdef INET6 if (isipv6) { mss = maxmtu - min_protoh; - if (!path_mtu_discovery && + if (!V_path_mtu_discovery && !in6_localaddr(&inp->in6p_faddr)) - mss = min(mss, tcp_v6mssdflt); + mss = min(mss, V_tcp_v6mssdflt); } else #endif { mss = maxmtu - min_protoh; - if (!path_mtu_discovery && + if (!V_path_mtu_discovery && !in_localaddr(inp->inp_faddr)) - mss = min(mss, tcp_mssdflt); + mss = min(mss, V_tcp_mssdflt); } } mss = min(mss, offer); @@ -2883,10 +2906,10 @@ if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) { tp->t_srtt = rtt; tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE; - tcpstat.tcps_usedrtt++; + V_tcpstat.tcps_usedrtt++; if (metrics.rmx_rttvar) { tp->t_rttvar = metrics.rmx_rttvar; - tcpstat.tcps_usedrttvar++; + V_tcpstat.tcps_usedrttvar++; } else { /* default variation is +- 1 rtt */ tp->t_rttvar = @@ -2904,7 +2927,7 @@ * threshold to no less than 2*mss. */ tp->snd_ssthresh = max(2 * mss, metrics.rmx_ssthresh); - tcpstat.tcps_usedssthresh++; + V_tcpstat.tcps_usedssthresh++; } if (metrics.rmx_bandwidth) tp->snd_bandwidth = metrics.rmx_bandwidth; @@ -2933,7 +2956,7 @@ min(tp->snd_wnd, so->so_snd.sb_hiwat))); else #endif - if (tcp_do_rfc3390) + if (V_tcp_do_rfc3390) tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380)); #ifdef INET6 else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) || @@ -2941,9 +2964,9 @@ #else else if (in_localaddr(inp->inp_faddr)) #endif - tp->snd_cwnd = mss * ss_fltsz_local; + tp->snd_cwnd = mss * V_ss_fltsz_local; else - tp->snd_cwnd = mss * ss_fltsz; + tp->snd_cwnd = mss * V_ss_fltsz; /* Check the interface for TSO capabilities. 
*/ if (mtuflags & CSUM_TSO) @@ -2956,6 +2979,7 @@ int tcp_mssopt(struct in_conninfo *inc) { + INIT_VNET_INET(curvnet); int mss = 0; u_long maxmtu = 0; u_long thcmtu = 0; @@ -2968,14 +2992,14 @@ #ifdef INET6 if (isipv6) { - mss = tcp_v6mssdflt; + mss = V_tcp_v6mssdflt; maxmtu = tcp_maxmtu6(inc, NULL); thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); } else #endif { - mss = tcp_mssdflt; + mss = V_tcp_mssdflt; maxmtu = tcp_maxmtu(inc, NULL); thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ min_protoh = sizeof(struct tcpiphdr); --- /u/marko/p4/head/src/sys/netinet/tcp_output.c 2007-12-03 11:00:10.000000000 +0100 +++ src/sys/netinet/tcp_output.c 2007-12-10 11:26:13.000000000 +0100 @@ -37,6 +37,7 @@ #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_tcpdebug.h" +#include "opt_vimage.h" #include #include @@ -49,9 +50,12 @@ #include #include #include +#include +#include #include +#include #include #include #include @@ -86,37 +90,45 @@ extern struct mbuf *m_copypack(); #endif -int path_mtu_discovery = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW, - &path_mtu_discovery, 1, "Enable Path MTU Discovery"); +#ifndef VIMAGE +int path_mtu_discovery; +int ss_fltsz; +int ss_fltsz_local; +int tcp_do_newreno; +int tcp_do_tso; +int tcp_do_autosndbuf; +int tcp_autosndbuf_inc; +int tcp_autosndbuf_max; +#endif -int ss_fltsz = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowstart_flightsize, CTLFLAG_RW, - &ss_fltsz, 1, "Slow start flight size"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, path_mtu_discovery, + CTLFLAG_RW, path_mtu_discovery, 1, "Enable Path MTU Discovery"); -int ss_fltsz_local = 4; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, local_slowstart_flightsize, CTLFLAG_RW, - &ss_fltsz_local, 1, "Slow start flight size for local networks"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, + slowstart_flightsize, CTLFLAG_RW, + ss_fltsz, 1, "Slow start flight size"); -int 
tcp_do_newreno = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW, - &tcp_do_newreno, 0, "Enable NewReno Algorithms"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, + local_slowstart_flightsize, CTLFLAG_RW, + ss_fltsz_local, 1, "Slow start flight size for local networks"); -int tcp_do_tso = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW, - &tcp_do_tso, 0, "Enable TCP Segmentation Offload"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW, + tcp_do_newreno, 0, "Enable NewReno Algorithms"); -int tcp_do_autosndbuf = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_RW, - &tcp_do_autosndbuf, 0, "Enable automatic send buffer sizing"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW, + tcp_do_tso, 0, "Enable TCP Segmentation Offload"); -int tcp_autosndbuf_inc = 8*1024; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_RW, - &tcp_autosndbuf_inc, 0, "Incrementor step size of automatic send buffer"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, sendbuf_auto, + CTLFLAG_RW, + tcp_do_autosndbuf, 0, "Enable automatic send buffer sizing"); -int tcp_autosndbuf_max = 256*1024; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_RW, - &tcp_autosndbuf_max, 0, "Max size of automatic send buffer"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, sendbuf_inc, + CTLFLAG_RW, tcp_autosndbuf_inc, 0, + "Incrementor step size of automatic send buffer"); + +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, sendbuf_max, + CTLFLAG_RW, tcp_autosndbuf_max, 0, + "Max size of automatic send buffer"); /* @@ -125,6 +137,7 @@ int tcp_output(struct tcpcb *tp) { + INIT_VNET_INET(tp->t_inpcb->inp_vnet); struct socket *so = tp->t_inpcb->inp_socket; long len, recwin, sendwin; int off, flags, error; @@ -170,15 +183,15 @@ * Set the slow-start flight size depending on whether * this is a local network or not. 
*/ - int ss = ss_fltsz; + int ss = V_ss_fltsz; #ifdef INET6 if (isipv6) { if (in6_localaddr(&tp->t_inpcb->in6p_faddr)) - ss = ss_fltsz_local; + ss = V_ss_fltsz_local; } else #endif /* INET6 */ if (in_localaddr(tp->t_inpcb->inp_faddr)) - ss = ss_fltsz_local; + ss = V_ss_fltsz_local; tp->snd_cwnd = tp->t_maxseg * ss; } tp->t_flags &= ~TF_LASTIDLE; @@ -252,8 +265,8 @@ if (len > 0) { sack_rxmit = 1; sendalot = 1; - tcpstat.tcps_sack_rexmits++; - tcpstat.tcps_sack_rexmit_bytes += + V_tcpstat.tcps_sack_rexmits++; + V_tcpstat.tcps_sack_rexmit_bytes += min(len, tp->t_maxseg); } } @@ -428,14 +441,14 @@ * with congestion window. Requires another timer. Has to * wait for upcoming tcp timer rewrite. */ - if (tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) { + if (V_tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) { if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat && so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / 8 * 7) && - so->so_snd.sb_cc < tcp_autosndbuf_max && + so->so_snd.sb_cc < V_tcp_autosndbuf_max && sendwin >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) { if (!sbreserve_locked(&so->so_snd, - min(so->so_snd.sb_hiwat + tcp_autosndbuf_inc, - tcp_autosndbuf_max), so, curthread)) + min(so->so_snd.sb_hiwat + V_tcp_autosndbuf_inc, + V_tcp_autosndbuf_max), so, curthread)) so->so_snd.sb_flags &= ~SB_AUTOSIZE; } } @@ -464,7 +477,7 @@ ipsec_optlen = ipsec_hdrsiz_tcp(tp); #endif if (len > tp->t_maxseg) { - if ((tp->t_flags & TF_TSO) && tcp_do_tso && + if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && ((tp->t_flags & TF_SIGNATURE) == 0) && tp->rcv_numsacks == 0 && sack_rxmit == 0 && tp->t_inpcb->inp_options == NULL && @@ -754,13 +767,13 @@ u_int moff; if ((tp->t_flags & TF_FORCEDATA) && len == 1) - tcpstat.tcps_sndprobe++; + V_tcpstat.tcps_sndprobe++; else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { - tcpstat.tcps_sndrexmitpack++; - tcpstat.tcps_sndrexmitbyte += len; + V_tcpstat.tcps_sndrexmitpack++; + V_tcpstat.tcps_sndrexmitbyte += len; } else { - 
tcpstat.tcps_sndpack++; - tcpstat.tcps_sndbyte += len; + V_tcpstat.tcps_sndpack++; + V_tcpstat.tcps_sndbyte += len; } #ifdef notyet if ((m = m_copypack(so->so_snd.sb_mb, off, @@ -827,13 +840,13 @@ } else { SOCKBUF_UNLOCK(&so->so_snd); if (tp->t_flags & TF_ACKNOW) - tcpstat.tcps_sndacks++; + V_tcpstat.tcps_sndacks++; else if (flags & (TH_SYN|TH_FIN|TH_RST)) - tcpstat.tcps_sndctrl++; + V_tcpstat.tcps_sndctrl++; else if (SEQ_GT(tp->snd_up, tp->snd_una)) - tcpstat.tcps_sndurg++; + V_tcpstat.tcps_sndurg++; else - tcpstat.tcps_sndwinup++; + V_tcpstat.tcps_sndwinup++; MGETHDR(m, M_DONTWAIT, MT_DATA); if (m == NULL) { @@ -1031,7 +1044,7 @@ if (tp->t_rtttime == 0) { tp->t_rtttime = ticks; tp->t_rtseq = startseq; - tcpstat.tcps_segstimed++; + V_tcpstat.tcps_segstimed++; } } @@ -1129,7 +1142,7 @@ * Section 2. However the tcp hostcache migitates the problem * so it affects only the first tcp connection with a host. */ - if (path_mtu_discovery) + if (V_path_mtu_discovery) ip->ip_off |= IP_DF; error = ip_output(m, tp->t_inpcb->inp_options, NULL, @@ -1208,7 +1221,7 @@ return (error); } } - tcpstat.tcps_sndtotal++; + V_tcpstat.tcps_sndtotal++; /* * Data sent (as far as we can tell). 
@@ -1275,6 +1288,7 @@ int tcp_addoptions(struct tcpopt *to, u_char *optp) { + INIT_VNET_INET(curvnet); u_int mask, optlen = 0; for (mask = 1; mask < TOF_MAXOPT; mask <<= 1) { @@ -1372,7 +1386,7 @@ optlen += TCPOLEN_SACK; sack++; } - tcpstat.tcps_sack_send_blocks++; + V_tcpstat.tcps_sack_send_blocks++; break; } default: --- /u/marko/p4/head/src/sys/netinet/tcp_reass.c 2007-10-16 13:53:39.000000000 +0200 +++ src/sys/netinet/tcp_reass.c 2007-10-22 18:06:43.000000000 +0200 @@ -35,6 +35,7 @@ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_tcpdebug.h" +#include "opt_vimage.h" #include #include @@ -45,12 +46,14 @@ #include #include #include +#include #include #include #include +#include #include #include #include @@ -76,33 +79,37 @@ SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0, "TCP Segment Reassembly Queue"); +#ifndef VIMAGE static int tcp_reass_maxseg = 0; -SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxsegments, CTLFLAG_RDTUN, - &tcp_reass_maxseg, 0, +int tcp_reass_qsize = 0; +static int tcp_reass_maxqlen = 48; +static int tcp_reass_overflows = 0; +#endif + +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_reass, OID_AUTO, maxsegments, + CTLFLAG_RDTUN, tcp_reass_maxseg, 0, "Global maximum number of TCP Segments in Reassembly Queue"); -int tcp_reass_qsize = 0; -SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, cursegments, CTLFLAG_RD, - &tcp_reass_qsize, 0, +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_reass, OID_AUTO, cursegments, + CTLFLAG_RD, tcp_reass_qsize, 0, "Global number of TCP Segments currently in Reassembly Queue"); -static int tcp_reass_maxqlen = 48; -SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxqlen, CTLFLAG_RW, - &tcp_reass_maxqlen, 0, +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_reass, OID_AUTO, maxqlen, + CTLFLAG_RW, tcp_reass_maxqlen, 0, "Maximum number of TCP Segments per individual Reassembly Queue"); -static int tcp_reass_overflows = 0; -SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows, CTLFLAG_RD, - &tcp_reass_overflows, 0, 
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_reass, OID_AUTO, overflows, + CTLFLAG_RD, tcp_reass_overflows, 0, "Global number of TCP Segment Reassembly Queue Overflows"); /* Initialize TCP reassembly queue */ static void tcp_reass_zone_change(void *tag) { + INIT_VNET_INET(curvnet); - tcp_reass_maxseg = nmbclusters / 16; - uma_zone_set_max(tcp_reass_zone, tcp_reass_maxseg); + V_tcp_reass_maxseg = nmbclusters / 16; + uma_zone_set_max(tcp_reass_zone, V_tcp_reass_maxseg); } uma_zone_t tcp_reass_zone; @@ -110,13 +117,14 @@ void tcp_reass_init(void) { + INIT_VNET_INET(curvnet); - tcp_reass_maxseg = nmbclusters / 16; + V_tcp_reass_maxseg = nmbclusters / 16; TUNABLE_INT_FETCH("net.inet.tcp.reass.maxsegments", - &tcp_reass_maxseg); + &V_tcp_reass_maxseg); tcp_reass_zone = uma_zcreate("tcpreass", sizeof (struct tseg_qent), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); - uma_zone_set_max(tcp_reass_zone, tcp_reass_maxseg); + uma_zone_set_max(tcp_reass_zone, V_tcp_reass_maxseg); EVENTHANDLER_REGISTER(nmbclusters_change, tcp_reass_zone_change, NULL, EVENTHANDLER_PRI_ANY); } @@ -124,6 +132,7 @@ int tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m) { + INIT_VNET_INET(curvnet); struct tseg_qent *q; struct tseg_qent *p = NULL; struct tseg_qent *nq; @@ -154,10 +163,10 @@ * process the missing segment. 
*/ if (th->th_seq != tp->rcv_nxt && - (tcp_reass_qsize + 1 >= tcp_reass_maxseg || - tp->t_segqlen >= tcp_reass_maxqlen)) { - tcp_reass_overflows++; - tcpstat.tcps_rcvmemdrop++; + (V_tcp_reass_qsize + 1 >= V_tcp_reass_maxseg || + tp->t_segqlen >= V_tcp_reass_maxqlen)) { + V_tcp_reass_overflows++; + V_tcpstat.tcps_rcvmemdrop++; m_freem(m); *tlenp = 0; return (0); @@ -169,13 +178,13 @@ */ te = uma_zalloc(tcp_reass_zone, M_NOWAIT); if (te == NULL) { - tcpstat.tcps_rcvmemdrop++; + V_tcpstat.tcps_rcvmemdrop++; m_freem(m); *tlenp = 0; return (0); } tp->t_segqlen++; - tcp_reass_qsize++; + V_tcp_reass_qsize++; /* * Find a segment which begins after this one does. @@ -197,12 +206,12 @@ i = p->tqe_th->th_seq + p->tqe_len - th->th_seq; if (i > 0) { if (i >= *tlenp) { - tcpstat.tcps_rcvduppack++; - tcpstat.tcps_rcvdupbyte += *tlenp; + V_tcpstat.tcps_rcvduppack++; + V_tcpstat.tcps_rcvdupbyte += *tlenp; m_freem(m); uma_zfree(tcp_reass_zone, te); tp->t_segqlen--; - tcp_reass_qsize--; + V_tcp_reass_qsize--; /* * Try to present any queued data * at the left window edge to the user. 
@@ -216,8 +225,8 @@ th->th_seq += i; } } - tcpstat.tcps_rcvoopack++; - tcpstat.tcps_rcvoobyte += *tlenp; + V_tcpstat.tcps_rcvoopack++; + V_tcpstat.tcps_rcvoobyte += *tlenp; /* * While we overlap succeeding segments trim them or, @@ -239,7 +248,7 @@ m_freem(q->tqe_m); uma_zfree(tcp_reass_zone, q); tp->t_segqlen--; - tcp_reass_qsize--; + V_tcp_reass_qsize--; q = nq; } @@ -276,7 +285,7 @@ sbappendstream_locked(&so->so_rcv, q->tqe_m); uma_zfree(tcp_reass_zone, q); tp->t_segqlen--; - tcp_reass_qsize--; + V_tcp_reass_qsize--; q = nq; } while (q && q->tqe_th->th_seq == tp->rcv_nxt); ND6_HINT(tp); --- /u/marko/p4/head/src/sys/netinet/tcp_sack.c 2007-08-31 03:48:07.000000000 +0200 +++ src/sys/netinet/tcp_sack.c 2007-10-22 18:06:43.000000000 +0200 @@ -76,6 +76,7 @@ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_tcpdebug.h" +#include "opt_vimage.h" #include #include @@ -89,14 +90,17 @@ #include #include #include +#include #include /* before tcp_seq.h, for tcp_random18() */ #include +#include #include #include +#include #include #include #include @@ -123,25 +127,27 @@ extern struct uma_zone *sack_hole_zone; +#ifndef VIMAGE +int tcp_do_sack; +int tcp_sack_maxholes; +int tcp_sack_globalmaxholes; +int tcp_sack_globalholes; +#endif + SYSCTL_NODE(_net_inet_tcp, OID_AUTO, sack, CTLFLAG_RW, 0, "TCP SACK"); -int tcp_do_sack = 1; -SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, enable, CTLFLAG_RW, - &tcp_do_sack, 0, "Enable/Disable TCP SACK support"); -TUNABLE_INT("net.inet.tcp.sack.enable", &tcp_do_sack); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_sack, OID_AUTO, enable, + CTLFLAG_RW, tcp_do_sack, 0, "Enable/Disable TCP SACK support"); -static int tcp_sack_maxholes = 128; -SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, maxholes, CTLFLAG_RW, - &tcp_sack_maxholes, 0, +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_sack, OID_AUTO, maxholes, + CTLFLAG_RW, tcp_sack_maxholes, 0, "Maximum number of TCP SACK holes allowed per connection"); -static int tcp_sack_globalmaxholes = 65536; 
-SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, globalmaxholes, CTLFLAG_RW, - &tcp_sack_globalmaxholes, 0, +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_sack, OID_AUTO, globalmaxholes, + CTLFLAG_RW, tcp_sack_globalmaxholes, 0, "Global maximum number of TCP SACK holes"); -static int tcp_sack_globalholes = 0; -SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, globalholes, CTLFLAG_RD, - &tcp_sack_globalholes, 0, +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_sack, OID_AUTO, globalholes, + CTLFLAG_RD, tcp_sack_globalholes, 0, "Global number of TCP SACK holes currently allocated"); /* @@ -252,11 +258,12 @@ static struct sackhole * tcp_sackhole_alloc(struct tcpcb *tp, tcp_seq start, tcp_seq end) { + INIT_VNET_INET(tp->t_inpcb->inp_vnet); struct sackhole *hole; - if (tp->snd_numholes >= tcp_sack_maxholes || - tcp_sack_globalholes >= tcp_sack_globalmaxholes) { - tcpstat.tcps_sack_sboverflow++; + if (tp->snd_numholes >= V_tcp_sack_maxholes || + V_tcp_sack_globalholes >= V_tcp_sack_globalmaxholes) { + V_tcpstat.tcps_sack_sboverflow++; return NULL; } @@ -269,7 +276,7 @@ hole->rxmit = start; tp->snd_numholes++; - tcp_sack_globalholes++; + V_tcp_sack_globalholes++; return hole; } @@ -280,14 +287,15 @@ static void tcp_sackhole_free(struct tcpcb *tp, struct sackhole *hole) { + INIT_VNET_INET(tp->t_vnet); uma_zfree(sack_hole_zone, hole); tp->snd_numholes--; - tcp_sack_globalholes--; + V_tcp_sack_globalholes--; KASSERT(tp->snd_numholes >= 0, ("tp->snd_numholes >= 0")); - KASSERT(tcp_sack_globalholes >= 0, ("tcp_sack_globalholes >= 0")); + KASSERT(V_tcp_sack_globalholes >= 0, ("tcp_sack_globalholes >= 0")); } /* --- /u/marko/p4/head/src/sys/netinet/tcp_subr.c 2007-12-27 19:32:56.000000000 +0100 +++ src/sys/netinet/tcp_subr.c 2008-01-14 19:23:54.000000000 +0100 @@ -38,6 +38,7 @@ #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_tcpdebug.h" +#include "opt_vimage.h" #include #include @@ -55,12 +56,15 @@ #include #include #include +#include #include +#include #include #include +#include 
#include #include #include @@ -94,6 +98,7 @@ #include #endif #include +#include #ifdef IPSEC #include @@ -109,14 +114,33 @@ #include -int tcp_mssdflt = TCP_MSS; -SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW, - &tcp_mssdflt, 0, "Default TCP Maximum Segment Size"); +static int tcp_tcbhashsize = 0; +static int do_tcpdrain = 1; +static int tcp_inflight_debug = 0; + +#ifndef VIMAGE +int tcp_mssdflt; +int tcp_minmss; +int tcp_do_rfc1323; +static int icmp_may_rst; +static int tcp_isn_reseed_interval; +static int tcp_inflight_enable; +static int tcp_inflight_rttthresh; +static int tcp_inflight_min; +static int tcp_inflight_max; +static int tcp_inflight_stab; +static int nolocaltimewait; +#endif + +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, + CTLFLAG_RW, tcp_mssdflt, 0, "Default TCP Maximum Segment Size"); #ifdef INET6 -int tcp_v6mssdflt = TCP6_MSS; -SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, - CTLFLAG_RW, &tcp_v6mssdflt , 0, +#ifndef VIMAGE +int tcp_v6mssdflt; +#endif +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, + CTLFLAG_RW, tcp_v6mssdflt, 0, "Default TCP Maximum Segment Size for IPv6"); #endif @@ -128,38 +152,33 @@ * with packet generation and sending. Set to zero to disable MINMSS * checking. This setting prevents us from sending too small packets. 
*/ -int tcp_minmss = TCP_MINMSS; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_RW, - &tcp_minmss , 0, "Minmum TCP Maximum Segment Size"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, minmss, + CTLFLAG_RW, tcp_minmss , 0, "Minmum TCP Maximum Segment Size"); -int tcp_do_rfc1323 = 1; -SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW, - &tcp_do_rfc1323, 0, "Enable rfc1323 (high performance TCP) extensions"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, + CTLFLAG_RW, tcp_do_rfc1323, 0, + "Enable rfc1323 (high performance TCP) extensions"); static int tcp_log_debug = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW, &tcp_log_debug, 0, "Log errors caused by incoming TCP segments"); -static int tcp_tcbhashsize = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN, &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); -static int do_tcpdrain = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, - &do_tcpdrain, 0, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0, "Enable tcp_drain routine for extra help when low on mbufs"); -SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD, - &tcbinfo.ipi_count, 0, "Number of active PCBs"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, pcbcount, + CTLFLAG_RD, tcbinfo.ipi_count, 0, "Number of active PCBs"); -static int icmp_may_rst = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW, - &icmp_may_rst, 0, +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, icmp_may_rst, + CTLFLAG_RW, icmp_may_rst, 0, "Certain ICMP unreachable messages may abort connections in SYN_SENT"); -static int tcp_isn_reseed_interval = 0; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW, - &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, isn_reseed_interval, + CTLFLAG_RW, tcp_isn_reseed_interval, 
0, + "Seconds between reseeding of ISN secret"); /* * TCP bandwidth limiting sysctls. Note that the default lower bound of @@ -169,30 +188,31 @@ SYSCTL_NODE(_net_inet_tcp, OID_AUTO, inflight, CTLFLAG_RW, 0, "TCP inflight data limiting"); -static int tcp_inflight_enable = 1; -SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, enable, CTLFLAG_RW, - &tcp_inflight_enable, 0, "Enable automatic TCP inflight data limiting"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_inflight, OID_AUTO, enable, + CTLFLAG_RW, tcp_inflight_enable, 0, + "Enable automatic TCP inflight data limiting"); -static int tcp_inflight_debug = 0; SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, debug, CTLFLAG_RW, &tcp_inflight_debug, 0, "Debug TCP inflight calculations"); -static int tcp_inflight_rttthresh; -SYSCTL_PROC(_net_inet_tcp_inflight, OID_AUTO, rttthresh, CTLTYPE_INT|CTLFLAG_RW, - &tcp_inflight_rttthresh, 0, sysctl_msec_to_ticks, "I", +static int sysctl_tcp_inflight_rttthresh(SYSCTL_HANDLER_ARGS); +int sysctl_tcp_inflight_rttthresh(SYSCTL_HANDLER_ARGS) +{ + return (0); /* XXX MARKO REVISIT */ +} +SYSCTL_PROC(_net_inet_tcp_inflight, OID_AUTO, rttthresh, + CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_tcp_inflight_rttthresh, "I", "RTT threshold below which inflight will deactivate itself"); -static int tcp_inflight_min = 6144; -SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, min, CTLFLAG_RW, - &tcp_inflight_min, 0, "Lower-bound for TCP inflight window"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_inflight, OID_AUTO, min, + CTLFLAG_RW, tcp_inflight_min, 0, "Lower-bound for TCP inflight window"); -static int tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT; -SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, max, CTLFLAG_RW, - &tcp_inflight_max, 0, "Upper-bound for TCP inflight window"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_inflight, OID_AUTO, max, + CTLFLAG_RW, tcp_inflight_max, 0, "Upper-bound for TCP inflight window"); -static int tcp_inflight_stab = 20; -SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, stab, 
CTLFLAG_RW, - &tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_inflight, OID_AUTO, stab, + CTLFLAG_RW, tcp_inflight_stab, 0, + "Inflight Algorithm Stabilization 20 = 2 packets"); uma_zone_t sack_hole_zone; @@ -229,14 +249,15 @@ #define ISN_LOCK() mtx_lock(&isn_mtx) #define ISN_UNLOCK() mtx_unlock(&isn_mtx) +static struct uma_zone *tcp_ipi_zone; + /* * TCP initialization. */ static void tcp_zone_change(void *tag) { - - uma_zone_set_max(tcbinfo.ipi_zone, maxsockets); + uma_zone_set_max(tcp_ipi_zone, maxsockets); uma_zone_set_max(tcpcb_zone, maxsockets); tcp_tw_zone_change(); } @@ -253,6 +274,27 @@ void tcp_init(void) { + INIT_VNET_INET(curvnet); + +#ifdef VIMAGE + if (IS_DEFAULT_VNET(curvnet)) { +#endif + tcp_ipi_zone = uma_zcreate("inpcb", sizeof(struct inpcb), + NULL, NULL, tcp_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + uma_zone_set_max(tcp_ipi_zone, maxsockets); + /* + * These have to be type stable for the benefit of the timers. 
+ */ + tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + uma_zone_set_max(tcpcb_zone, maxsockets); + sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); +#ifdef VIMAGE + } +#endif + + tcp_tw_init(); int hashsize = TCBHASHSIZE; tcp_delacktime = TCPTV_DELACK; @@ -265,25 +307,59 @@ if (tcp_rexmit_min < 1) tcp_rexmit_min = 1; tcp_rexmit_slop = TCPTV_CPU_VAR; - tcp_inflight_rttthresh = TCPTV_INFLIGHT_RTTTHRESH; tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT; + V_path_mtu_discovery = 1; + V_ss_fltsz = 1; + V_ss_fltsz_local = 4; + V_tcp_do_newreno = 1; + V_tcp_do_tso = 1; + V_tcp_do_autosndbuf = 1; + V_tcp_autosndbuf_inc = 8*1024; + V_tcp_autosndbuf_max = 256*1024; + V_blackhole = 0; + V_tcp_delack_enabled = 1; + V_drop_synfin = 0; + V_tcp_do_rfc3042 = 1; + V_tcp_do_rfc3390 = 1; + V_tcp_insecure_rst = 0; + V_tcp_do_autorcvbuf = 1; + V_tcp_autorcvbuf_inc = 16*1024; + V_tcp_autorcvbuf_max = 256*1024; + V_tcp_inflight_rttthresh = TCPTV_INFLIGHT_RTTTHRESH; + V_tcp_mssdflt = TCP_MSS; +#ifdef INET6 + V_tcp_v6mssdflt = TCP6_MSS; +#endif + V_tcp_minmss = TCP_MINMSS; + V_tcp_do_rfc1323 = 1; + V_icmp_may_rst = 1; + V_tcp_isn_reseed_interval = 0; + V_tcp_inflight_enable = 1; + V_tcp_inflight_min = 6144; + V_tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT; + V_tcp_inflight_stab = 20; + V_nolocaltimewait = 0; + V_tcp_do_sack = 1; + V_tcp_sack_maxholes = 128; + V_tcp_sack_globalmaxholes = 65536; + V_tcp_sack_globalholes = 0; + TUNABLE_INT_FETCH("net.inet.tcp.sack.enable", &V_tcp_do_sack); - INP_INFO_LOCK_INIT(&tcbinfo, "tcp"); - LIST_INIT(&tcb); - tcbinfo.ipi_listhead = &tcb; + INP_INFO_LOCK_INIT(&V_tcbinfo, "tcp"); + LIST_INIT(&V_tcb); + V_tcbinfo.ipi_listhead = &V_tcb; TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize); if (!powerof2(hashsize)) { printf("WARNING: TCB hash size not a power of 2\n"); hashsize = 512; /* safe default */ } 
tcp_tcbhashsize = hashsize; - tcbinfo.ipi_hashbase = hashinit(hashsize, M_PCB, - &tcbinfo.ipi_hashmask); - tcbinfo.ipi_porthashbase = hashinit(hashsize, M_PCB, - &tcbinfo.ipi_porthashmask); - tcbinfo.ipi_zone = uma_zcreate("inpcb", sizeof(struct inpcb), - NULL, NULL, tcp_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); - uma_zone_set_max(tcbinfo.ipi_zone, maxsockets); + V_tcbinfo.ipi_hashbase = hashinit(hashsize, M_PCB, + &V_tcbinfo.ipi_hashmask); + V_tcbinfo.ipi_porthashbase = hashinit(hashsize, M_PCB, + &V_tcbinfo.ipi_porthashmask); + V_tcbinfo.ipi_zone = tcp_ipi_zone; + V_tcbinfo.ipi_vnet = curvnet; #ifdef INET6 #define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)) #else /* INET6 */ @@ -294,27 +370,44 @@ if (max_linkhdr + TCP_MINPROTOHDR > MHLEN) panic("tcp_init"); #undef TCP_MINPROTOHDR - /* - * These have to be type stable for the benefit of the timers. - */ - tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem), - NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); - uma_zone_set_max(tcpcb_zone, maxsockets); - tcp_tw_init(); + syncache_init(); tcp_hc_init(); + +#ifdef VIMAGE + if (!IS_DEFAULT_VNET(curvnet)) + return; +#endif + tcp_reass_init(); ISN_LOCK_INIT(); callout_init(&isn_callout, CALLOUT_MPSAFE); - tcp_isn_tick(NULL); + callout_reset(&isn_callout, 1, tcp_isn_tick, NULL); EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL, SHUTDOWN_PRI_DEFAULT); - sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole), - NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); EVENTHANDLER_REGISTER(maxsockets_change, tcp_zone_change, NULL, EVENTHANDLER_PRI_ANY); } +#ifdef VIMAGE +void +tcp_destroy(void) +{ + INIT_VNET_INET(curvnet); + + tcp_tw_destroy(); + tcp_hc_destroy(); + syncache_destroy(); + + /* XXX check that hashes are empty! 
*/ + hashdestroy(V_tcbinfo.ipi_hashbase, M_PCB, + V_tcbinfo.ipi_hashmask); + hashdestroy(V_tcbinfo.ipi_porthashbase, M_PCB, + V_tcbinfo.ipi_porthashmask); + INP_INFO_LOCK_DESTROY(&V_tcbinfo); +} +#endif + void tcp_fini(void *xtp) { @@ -416,6 +509,7 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, tcp_seq ack, tcp_seq seq, int flags) { + INIT_VNET_INET(curvnet); int tlen; int win = 0; struct ip *ip; @@ -512,8 +606,8 @@ { tlen += sizeof (struct tcpiphdr); ip->ip_len = tlen; - ip->ip_ttl = ip_defttl; - if (path_mtu_discovery) + ip->ip_ttl = V_ip_defttl; + if (V_path_mtu_discovery) ip->ip_off |= IP_DF; } m->m_len = tlen; @@ -582,6 +676,8 @@ struct tcpcb * tcp_newtcpcb(struct inpcb *inp) { + INIT_VNET_INET(inp->inp_vnet); + struct tcpcb_mem *tm; struct tcpcb *tp; #ifdef INET6 @@ -593,12 +689,15 @@ return (NULL); tp = &tm->tcb; tp->t_timers = &tm->tt; +#ifdef VIMAGE + tp->t_vnet = inp->inp_vnet; +#endif /* LIST_INIT(&tp->t_segq); */ /* XXX covered by M_ZERO */ tp->t_maxseg = tp->t_maxopd = #ifdef INET6 - isipv6 ? tcp_v6mssdflt : + isipv6 ? V_tcp_v6mssdflt : #endif /* INET6 */ - tcp_mssdflt; + V_tcp_mssdflt; /* Set up our timeouts. */ callout_init(&tp->t_timers->tt_rexmt, CALLOUT_MPSAFE); @@ -607,9 +706,9 @@ callout_init(&tp->t_timers->tt_2msl, CALLOUT_MPSAFE); callout_init(&tp->t_timers->tt_delack, CALLOUT_MPSAFE); - if (tcp_do_rfc1323) + if (V_tcp_do_rfc1323) tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); - if (tcp_do_sack) + if (V_tcp_do_sack) tp->t_flags |= TF_SACK_PERMIT; TAILQ_INIT(&tp->snd_holes); tp->t_inpcb = inp; /* XXX */ @@ -632,7 +731,7 @@ * because the socket may be bound to an IPv6 wildcard address, * which may match an IPv4-mapped IPv6 address. 
*/ - inp->inp_ip_ttl = ip_defttl; + inp->inp_ip_ttl = V_ip_defttl; inp->inp_ppcb = tp; return (tp); /* XXX */ } @@ -645,17 +744,18 @@ struct tcpcb * tcp_drop(struct tcpcb *tp, int errno) { + INIT_VNET_INET(tp->t_inpcb->inp_vnet); struct socket *so = tp->t_inpcb->inp_socket; - INP_INFO_WLOCK_ASSERT(&tcbinfo); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_LOCK_ASSERT(tp->t_inpcb); if (TCPS_HAVERCVDSYN(tp->t_state)) { tp->t_state = TCPS_CLOSED; (void) tcp_output_reset(tp); - tcpstat.tcps_drops++; + V_tcpstat.tcps_drops++; } else - tcpstat.tcps_conndrops++; + V_tcpstat.tcps_conndrops++; if (errno == ETIMEDOUT && tp->t_softerror) errno = tp->t_softerror; so->so_error = errno; @@ -665,6 +765,7 @@ void tcp_discardcb(struct tcpcb *tp) { + INIT_VNET_INET(tp->t_vnet); struct tseg_qent *q; struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; @@ -748,7 +849,7 @@ m_freem(q->tqe_m); uma_zfree(tcp_reass_zone, q); tp->t_segqlen--; - tcp_reass_qsize--; + V_tcp_reass_qsize--; } /* Disconnect offload device, if any. */ tcp_offload_detach(tp); @@ -766,17 +867,18 @@ struct tcpcb * tcp_close(struct tcpcb *tp) { + INIT_VNET_INET(tp->t_inpcb->inp_vnet); struct inpcb *inp = tp->t_inpcb; struct socket *so; - INP_INFO_WLOCK_ASSERT(&tcbinfo); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_LOCK_ASSERT(inp); /* Notify any offload devices of listener close */ if (tp->t_state == TCPS_LISTEN) tcp_offload_listen_close(tp); in_pcbdrop(inp); - tcpstat.tcps_closed++; + V_tcpstat.tcps_closed++; KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL")); so = inp->inp_socket; soisdisconnected(so); @@ -797,8 +899,9 @@ void tcp_drain(void) { - if (do_tcpdrain) { + VNET_ITERLOOP_BEGIN(); + INIT_VNET_INET(vnet_iter); struct inpcb *inpb; struct tcpcb *tcpb; struct tseg_qent *te; @@ -811,8 +914,8 @@ * where we're really low on mbufs, this is potentially * usefull. 
*/ - INP_INFO_RLOCK(&tcbinfo); - LIST_FOREACH(inpb, tcbinfo.ipi_listhead, inp_list) { + INP_INFO_RLOCK(&V_tcbinfo); + LIST_FOREACH(inpb, V_tcbinfo.ipi_listhead, inp_list) { if (inpb->inp_vflag & INP_TIMEWAIT) continue; INP_LOCK(inpb); @@ -823,13 +926,14 @@ m_freem(te->tqe_m); uma_zfree(tcp_reass_zone, te); tcpb->t_segqlen--; - tcp_reass_qsize--; + V_tcp_reass_qsize--; } tcp_clean_sackreport(tcpb); } INP_UNLOCK(inpb); } - INP_INFO_RUNLOCK(&tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); + VNET_ITERLOOP_END(); } } @@ -845,8 +949,11 @@ tcp_notify(struct inpcb *inp, int error) { struct tcpcb *tp; +#ifdef INVARIANTS + INIT_VNET_INET(inp->inp_vnet); +#endif - INP_INFO_WLOCK_ASSERT(&tcbinfo); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_LOCK_ASSERT(inp); if ((inp->inp_vflag & INP_TIMEWAIT) || @@ -888,6 +995,7 @@ static int tcp_pcblist(SYSCTL_HANDLER_ARGS) { + INIT_VNET_INET(curvnet); int error, i, m, n, pcb_count; struct inpcb *inp, **inp_list; inp_gen_t gencnt; @@ -899,7 +1007,7 @@ */ if (req->oldptr == NULL) { m = syncache_pcbcount(); - n = tcbinfo.ipi_count; + n = V_tcbinfo.ipi_count; req->oldidx = 2 * (sizeof xig) + ((m + n) + n/8) * sizeof(struct xtcpcb); return (0); @@ -911,10 +1019,10 @@ /* * OK, now we're committed to doing something. 
*/ - INP_INFO_RLOCK(&tcbinfo); - gencnt = tcbinfo.ipi_gencnt; - n = tcbinfo.ipi_count; - INP_INFO_RUNLOCK(&tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); + gencnt = V_tcbinfo.ipi_gencnt; + n = V_tcbinfo.ipi_count; + INP_INFO_RUNLOCK(&V_tcbinfo); m = syncache_pcbcount(); @@ -939,9 +1047,9 @@ if (inp_list == NULL) return (ENOMEM); - INP_INFO_RLOCK(&tcbinfo); - for (inp = LIST_FIRST(tcbinfo.ipi_listhead), i = 0; inp != NULL && i - < n; inp = LIST_NEXT(inp, inp_list)) { + INP_INFO_RLOCK(&V_tcbinfo); + for (inp = LIST_FIRST(V_tcbinfo.ipi_listhead), i = 0; + inp != NULL && i < n; inp = LIST_NEXT(inp, inp_list)) { INP_LOCK(inp); if (inp->inp_gencnt <= gencnt) { /* @@ -963,7 +1071,7 @@ } INP_UNLOCK(inp); } - INP_INFO_RUNLOCK(&tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); n = i; error = 0; @@ -1007,11 +1115,11 @@ * while we were processing this request, and it * might be necessary to retry. */ - INP_INFO_RLOCK(&tcbinfo); - xig.xig_gen = tcbinfo.ipi_gencnt; + INP_INFO_RLOCK(&V_tcbinfo); + xig.xig_gen = V_tcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; - xig.xig_count = tcbinfo.ipi_count + pcb_count; - INP_INFO_RUNLOCK(&tcbinfo); + xig.xig_count = V_tcbinfo.ipi_count + pcb_count; + INP_INFO_RUNLOCK(&V_tcbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); @@ -1024,6 +1132,7 @@ static int tcp_getcred(SYSCTL_HANDLER_ARGS) { + INIT_VNET_INET(curvnet); struct xucred xuc; struct sockaddr_in addrs[2]; struct inpcb *inp; @@ -1035,9 +1144,9 @@ error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); - INP_INFO_RLOCK(&tcbinfo); - inp = in_pcblookup_hash(&tcbinfo, addrs[1].sin_addr, addrs[1].sin_port, - addrs[0].sin_addr, addrs[0].sin_port, 0, NULL); + INP_INFO_RLOCK(&V_tcbinfo); + inp = in_pcblookup_hash(&V_tcbinfo, addrs[1].sin_addr, + addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, 0, NULL); if (inp == NULL) { error = ENOENT; goto outunlocked; @@ -1054,7 +1163,7 @@ out: INP_UNLOCK(inp); outunlocked: - INP_INFO_RUNLOCK(&tcbinfo); + 
INP_INFO_RUNLOCK(&V_tcbinfo); if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); @@ -1068,6 +1177,8 @@ static int tcp6_getcred(SYSCTL_HANDLER_ARGS) { + INIT_VNET_INET(curvnet); + INIT_VNET_INET6(curvnet); struct xucred xuc; struct sockaddr_in6 addrs[2]; struct inpcb *inp; @@ -1079,8 +1190,8 @@ error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); - if ((error = sa6_embedscope(&addrs[0], ip6_use_defzone)) != 0 || - (error = sa6_embedscope(&addrs[1], ip6_use_defzone)) != 0) { + if ((error = sa6_embedscope(&addrs[0], V_ip6_use_defzone)) != 0 || + (error = sa6_embedscope(&addrs[1], V_ip6_use_defzone)) != 0) { return (error); } if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) { @@ -1090,16 +1201,16 @@ return (EINVAL); } - INP_INFO_RLOCK(&tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); if (mapped == 1) - inp = in_pcblookup_hash(&tcbinfo, + inp = in_pcblookup_hash(&V_tcbinfo, *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12], addrs[1].sin6_port, *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12], addrs[0].sin6_port, 0, NULL); else - inp = in6_pcblookup_hash(&tcbinfo, + inp = in6_pcblookup_hash(&V_tcbinfo, &addrs[1].sin6_addr, addrs[1].sin6_port, &addrs[0].sin6_addr, addrs[0].sin6_port, 0, NULL); if (inp == NULL) { @@ -1118,7 +1229,7 @@ out: INP_UNLOCK(inp); outunlocked: - INP_INFO_RUNLOCK(&tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); @@ -1133,6 +1244,7 @@ void tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) { + INIT_VNET_INET(curvnet); struct ip *ip = vip; struct tcphdr *th; struct in_addr faddr; @@ -1150,7 +1262,7 @@ if (cmd == PRC_MSGSIZE) notify = tcp_mtudisc; - else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB || + else if (V_icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB || cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip) notify = tcp_drop_syn_sent; /* @@ -1177,8 +1289,8 @@ - offsetof(struct icmp, icmp_ip)); th 
= (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); - INP_INFO_WLOCK(&tcbinfo); - inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport, + INP_INFO_WLOCK(&V_tcbinfo); + inp = in_pcblookup_hash(&V_tcbinfo, faddr, th->th_dport, ip->ip_src, th->th_sport, 0, NULL); if (inp != NULL) { INP_LOCK(inp); @@ -1210,11 +1322,11 @@ if (!mtu) mtu = ip_next_mtu(ip->ip_len, 1); - if (mtu < max(296, (tcp_minmss) + if (mtu < max(296, V_tcp_minmss + sizeof(struct tcpiphdr))) mtu = 0; if (!mtu) - mtu = tcp_mssdflt + mtu = V_tcp_mssdflt + sizeof(struct tcpiphdr); /* * Only cache the the MTU if it @@ -1241,15 +1353,16 @@ #endif syncache_unreach(&inc, th); } - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); } else - in_pcbnotifyall(&tcbinfo, faddr, inetctlerrmap[cmd], notify); + in_pcbnotifyall(&V_tcbinfo, faddr, inetctlerrmap[cmd], notify); } #ifdef INET6 void tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d) { + INIT_VNET_INET(curvnet); struct tcphdr th; struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; struct ip6_hdr *ip6; @@ -1303,7 +1416,7 @@ bzero(&th, sizeof(th)); m_copydata(m, off, sizeof(*thp), (caddr_t)&th); - in6_pcbnotify(&tcbinfo, sa, th.th_dport, + in6_pcbnotify(&V_tcbinfo, sa, th.th_dport, (struct sockaddr *)ip6cp->ip6c_src, th.th_sport, cmd, NULL, notify); @@ -1312,11 +1425,11 @@ inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr; inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr; inc.inc_isipv6 = 1; - INP_INFO_WLOCK(&tcbinfo); + INP_INFO_WLOCK(&V_tcbinfo); syncache_unreach(&inc, &th); - INP_INFO_WUNLOCK(&tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); } else - in6_pcbnotify(&tcbinfo, sa, 0, (const struct sockaddr *)sa6_src, + in6_pcbnotify(&V_tcbinfo, sa, 0, (const struct sockaddr *)sa6_src, 0, cmd, NULL, notify); } #endif /* INET6 */ @@ -1370,14 +1483,17 @@ #define ISN_STATIC_INCREMENT 4096 #define ISN_RANDOM_INCREMENT (4096 - 1) +#ifndef VIMAGE static u_char isn_secret[32]; static int isn_last_reseed; static u_int32_t isn_offset, isn_offset_old; static 
MD5_CTX isn_ctx; +#endif tcp_seq tcp_new_isn(struct tcpcb *tp) { + INIT_VNET_INET(tp->t_vnet); u_int32_t md5_buffer[4]; tcp_seq new_isn; @@ -1385,37 +1501,37 @@ ISN_LOCK(); /* Seed if this is the first use, reseed if requested. */ - if ((isn_last_reseed == 0) || ((tcp_isn_reseed_interval > 0) && - (((u_int)isn_last_reseed + (u_int)tcp_isn_reseed_interval*hz) + if ((V_isn_last_reseed == 0) || ((V_tcp_isn_reseed_interval > 0) && + (((u_int)V_isn_last_reseed + (u_int)V_tcp_isn_reseed_interval*hz) < (u_int)ticks))) { - read_random(&isn_secret, sizeof(isn_secret)); - isn_last_reseed = ticks; + read_random(&V_isn_secret, sizeof(V_isn_secret)); + V_isn_last_reseed = ticks; } /* Compute the md5 hash and return the ISN. */ - MD5Init(&isn_ctx); - MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short)); - MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short)); + MD5Init(&V_isn_ctx); + MD5Update(&V_isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short)); + MD5Update(&V_isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short)); #ifdef INET6 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) { - MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr, + MD5Update(&V_isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr,