/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Version:	@(#)route.c	1.0.14	05/31/93
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with
 *					BSD; our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based
 *					routing, routing caches and better
 *					behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/config.h>
#include <asm/segment.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/icmp.h>
#include <net/netlink.h>
#ifdef CONFIG_KERNELD
#include <linux/kerneld.h>
#endif

/*
 * Forwarding Information Base definitions.
 */

struct fib_node
{
	struct fib_node		*fib_next;
	__u32			fib_dst;
	unsigned long		fib_use;
	struct fib_info		*fib_info;
	short			fib_metric;
	unsigned char		fib_tos;
};

/*
 * This structure contains data shared by many routes.
 */

struct fib_info
{
	struct fib_info		*fib_next;
	struct fib_info		*fib_prev;
	__u32			fib_gateway;
	struct device		*fib_dev;
	int			fib_refcnt;
	unsigned long		fib_window;
	unsigned short		fib_flags;
	unsigned short		fib_mtu;
	unsigned short		fib_irtt;
};

struct fib_zone
{
	struct fib_zone	*fz_next;
	struct fib_node	**fz_hash_table;
	struct fib_node	*fz_list;
	int		fz_nent;
	int		fz_logmask;
	__u32		fz_mask;
};

static struct fib_zone	*fib_zones[33];
static struct fib_zone	*fib_zone_list;
static struct fib_node	*fib_loopback = NULL;
static struct fib_info	*fib_info_list;
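/*
 * Example of how these pieces fit together: a command such as
 *	route add -net 193.233.7.0 netmask 255.255.255.0 gw 193.233.7.65
 * creates one fib_node (fib_dst = 193.233.7.0) chained into
 * fib_zones[8], the zone for masks with 8 host bits.  The next hop
 * data (gateway, device, flags, MTU, window, irtt) lives in a
 * refcounted fib_info shared by every route using the same next hop,
 * so it is only freed when the last route referencing it goes away.
 * fib_zone_list links the zones from the most specific mask to the
 * least specific, which is what makes the zone walk in fib_lookup()
 * below a longest-prefix match.
 */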
/*
 * Backlogging.
 */

#define RT_BH_REDIRECT		0
#define RT_BH_GARBAGE_COLLECT	1
#define RT_BH_FREE		2

struct rt_req
{
	struct rt_req	*rtr_next;
	struct device	*dev;
	__u32		dst;
	__u32		gw;
	unsigned char	tos;
};

int		ip_rt_lock;
unsigned	ip_rt_bh_mask;
static struct rt_req	*rt_backlog;

/*
 * Route cache.
 */

struct rtable		*ip_rt_hash_table[RT_HASH_DIVISOR];
static int		rt_cache_size;
static struct rtable	*rt_free_queue;
struct wait_queue	*rt_wait;

static void rt_kick_backlog(void);
static void rt_cache_add(unsigned hash, struct rtable * rth);
static void rt_cache_flush(void);
static void rt_garbage_collect_1(void);

/*
 * Evaluate mask length.
 */

static __inline__ int rt_logmask(__u32 mask)
{
	if (!(mask = ntohl(mask)))
		return 32;
	return ffz(~mask);
}

/*
 * Create mask from length.
 */

static __inline__ __u32 rt_mask(int logmask)
{
	if (logmask >= 32)
		return 0;
	return htonl(~((1<<logmask)-1));
}

static __inline__ unsigned fz_hash_code(__u32 dst, int logmask)
{
	return ip_rt_hash_code(ntohl(dst)>>logmask);
}

/*
 * Free FIB node.
 */

static void fib_free_node(struct fib_node * f)
{
	struct fib_info * fi = f->fib_info;

	if (!--fi->fib_refcnt)
	{
#if RT_CACHE_DEBUG >= 2
		printk("fib_free_node: fi %08x/%s is free\n", fi->fib_gateway, fi->fib_dev->name);
#endif
		if (fi->fib_next)
			fi->fib_next->fib_prev = fi->fib_prev;
		if (fi->fib_prev)
			fi->fib_prev->fib_next = fi->fib_next;
		if (fi == fib_info_list)
			fib_info_list = fi->fib_next;
		kfree_s(fi, sizeof(struct fib_info));
	}
	kfree_s(f, sizeof(struct fib_node));
}

/*
 * Find gateway route by address.
 */

static struct fib_node * fib_lookup_gateway(__u32 dst)
{
	struct fib_zone * fz;
	struct fib_node * f;

	for (fz = fib_zone_list; fz; fz = fz->fz_next)
	{
		if (fz->fz_hash_table)
			f = fz->fz_hash_table[fz_hash_code(dst, fz->fz_logmask)];
		else
			f = fz->fz_list;

		for ( ; f; f = f->fib_next)
		{
			if ((dst ^ f->fib_dst) & fz->fz_mask)
				continue;
			if (f->fib_info->fib_flags & RTF_GATEWAY)
				return NULL;
			return f;
		}
	}
	return NULL;
}

/*
 * Find local route by address.
 * FIXME: I use the "longest match" principle. If the destination
 *	  has some non-local route, I do not search for shorter matches.
 *	  It's possible I'm wrong, but I wanted to prevent the following
 *	  situation:
 *	route add 193.233.7.128 netmask 255.255.255.192 gw xxxxxx
 *	route add 193.233.7.0	netmask 255.255.255.0	eth1
 *	(Two ethernets connected by a serial line; one is small and the
 *	other is large.)
 *	Host 193.233.7.129 is locally unreachable,
 *	but the old (<=1.3.37) code would send packets destined for it to eth1.
 */

static struct fib_node * fib_lookup_local(__u32 dst)
{
	struct fib_zone * fz;
	struct fib_node * f;

	for (fz = fib_zone_list; fz; fz = fz->fz_next)
	{
		int longest_match_found = 0;

		if (fz->fz_hash_table)
			f = fz->fz_hash_table[fz_hash_code(dst, fz->fz_logmask)];
		else
			f = fz->fz_list;

		for ( ; f; f = f->fib_next)
		{
			if ((dst ^ f->fib_dst) & fz->fz_mask)
				continue;
			if (!(f->fib_info->fib_flags & RTF_GATEWAY))
				return f;
			longest_match_found = 1;
		}
		if (longest_match_found)
			return NULL;
	}
	return NULL;
}
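/*
 * Worked example of the mask arithmetic above, for netmask 255.255.255.0:
 *	rt_logmask(htonl(0xFFFFFF00)) == ffz(~0xFFFFFF00)
 *				      == ffz(0x000000FF) == 8
 * and rt_mask(8) == htonl(~((1<<8)-1)) == htonl(0xFFFFFF00) round-trips
 * it.  fz_hash_code() shifts the 8 host bits away before hashing, so
 * every address inside 193.233.7.0/24 lands in the same bucket of that
 * zone's hash table, and the match test
 *	(dst ^ f->fib_dst) & fz->fz_mask
 * is a single XOR/AND comparing only the network part.
 */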
/*
 * Main lookup routine.
 *	IMPORTANT NOTE: this algorithm has a small user-visible difference
 *	from the <=1.3.37 code: it does not route non-CIDR broadcasts by
 *	default.
 *
 *	E.g.
 *	ifconfig eth0 193.233.7.65 netmask 255.255.255.192 broadcast 193.233.7.255
 *	is valid, but if you really are not able (not allowed, do not want) to
 *	use the CIDR compliant broadcast 193.233.7.127, you should add the
 *	host route:
 *	route add -host 193.233.7.255 eth0
 */

static struct fib_node * fib_lookup(__u32 dst)
{
	struct fib_zone * fz;
	struct fib_node * f;

	for (fz = fib_zone_list; fz; fz = fz->fz_next)
	{
		if (fz->fz_hash_table)
			f = fz->fz_hash_table[fz_hash_code(dst, fz->fz_logmask)];
		else
			f = fz->fz_list;

		for ( ; f; f = f->fib_next)
		{
			if ((dst ^ f->fib_dst) & fz->fz_mask)
				continue;
			return f;
		}
	}
	return NULL;
}

static __inline__ struct device * get_gw_dev(__u32 gw)
{
	struct fib_node * f;

	f = fib_lookup_gateway(gw);
	if (f)
		return f->fib_info->fib_dev;
	return NULL;
}

/*
 * Check if a mask is acceptable.
 */

static inline int bad_mask(__u32 mask, __u32 addr)
{
	if (addr & (mask = ~mask))
		return 1;
	mask = ntohl(mask);
	if (mask & (mask+1))
		return 1;
	return 0;
}

static int fib_del_list(struct fib_node **fp, __u32 dst,
		struct device * dev, __u32 gtw, short flags, short metric, __u32 mask)
{
	struct fib_node *f;
	int found=0;

	while((f = *fp) != NULL)
	{
		struct fib_info * fi = f->fib_info;

		/*
		 * Make sure the destination and netmask match.
		 * metric, gateway and device are also checked
		 * if they were specified.
		 */
		if (f->fib_dst != dst ||
		    (gtw && fi->fib_gateway != gtw) ||
		    (metric >= 0 && f->fib_metric != metric) ||
		    (dev && fi->fib_dev != dev) )
		{
			fp = &f->fib_next;
			continue;
		}
		cli();
		*fp = f->fib_next;
		if (fib_loopback == f)
			fib_loopback = NULL;
		sti();
		ip_netlink_msg(RTMSG_DELROUTE, dst, gtw, mask, flags, metric, fi->fib_dev->name);
		fib_free_node(f);
		found++;
	}
	return found;
}

static __inline__ int fib_del_1(__u32 dst, __u32 mask,
		struct device * dev, __u32 gtw, short flags, short metric)
{
	struct fib_node **fp;
	struct fib_zone *fz;
	int found=0;

	if (!mask)
	{
		for (fz=fib_zone_list; fz; fz = fz->fz_next)
		{
			int tmp;

			if (fz->fz_hash_table)
				fp = &fz->fz_hash_table[fz_hash_code(dst, fz->fz_logmask)];
			else
				fp = &fz->fz_list;

			tmp = fib_del_list(fp, dst, dev, gtw, flags, metric, mask);
			fz->fz_nent -= tmp;
			found += tmp;
		}
	}
	else
	{
		if ((fz = fib_zones[rt_logmask(mask)]) != NULL)
		{
			if (fz->fz_hash_table)
				fp = &fz->fz_hash_table[fz_hash_code(dst, fz->fz_logmask)];
			else
				fp = &fz->fz_list;

			found = fib_del_list(fp, dst, dev, gtw, flags, metric, mask);
			fz->fz_nent -= found;
		}
	}

	if (found)
	{
		rt_cache_flush();
		return 0;
	}
	return -ESRCH;
}
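/*
 * bad_mask() rejects two kinds of nonsense.  With mask 255.255.255.0:
 *	destination 193.233.7.1  -> rejected (host bits set in the
 *				    destination; 193.233.7.0 was meant)
 *	destination 193.233.7.0  -> accepted
 * and the non-contiguous mask 255.0.255.0 is rejected for any
 * destination: its inverse in host order is 0x00FF00FF, which is not
 * of the form 2^n - 1, and that is exactly the hole the
 * (mask & (mask+1)) test detects.
 */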
static struct fib_info * fib_create_info(__u32 gw, struct device * dev,
					 unsigned short flags, unsigned short mss,
					 unsigned long window, unsigned short irtt)
{
	struct fib_info * fi;

	if (!(flags & RTF_MSS))
	{
		mss = dev->mtu;
#ifdef CONFIG_NO_PATH_MTU_DISCOVERY
		/*
		 * If MTU was not specified, use default.
		 * If you want to increase MTU for some net (local subnet)
		 * use "route add .... mss xxx".
		 *
		 * The MTU isn't currently always used and computed as it
		 * should be as far as I can tell.
		 * [Still verifying this is right]
		 */
		if ((flags & RTF_GATEWAY) && mss > 576)
			mss = 576;
#endif
	}
	if (!(flags & RTF_WINDOW))
		window = 0;
	if (!(flags & RTF_IRTT))
		irtt = 0;

	for (fi=fib_info_list; fi; fi = fi->fib_next)
	{
		if (fi->fib_gateway != gw ||
		    fi->fib_dev != dev ||
		    fi->fib_flags != flags ||
		    fi->fib_mtu != mss ||
		    fi->fib_window != window ||
		    fi->fib_irtt != irtt)
			continue;
		fi->fib_refcnt++;
#if RT_CACHE_DEBUG >= 2
		printk("fib_create_info: fi %08x/%s is duplicate\n", fi->fib_gateway, fi->fib_dev->name);
#endif
		return fi;
	}
	fi = (struct fib_info*)kmalloc(sizeof(struct fib_info), GFP_KERNEL);
	if (!fi)
		return NULL;
	memset(fi, 0, sizeof(struct fib_info));
	fi->fib_flags = flags;
	fi->fib_dev = dev;
	fi->fib_gateway = gw;
	fi->fib_mtu = mss;
	fi->fib_window = window;
	fi->fib_refcnt++;
	fi->fib_next = fib_info_list;
	fi->fib_prev = NULL;
	fi->fib_irtt = irtt;
	if (fib_info_list)
		fib_info_list->fib_prev = fi;
	fib_info_list = fi;
#if RT_CACHE_DEBUG >= 2
	printk("fib_create_info: fi %08x/%s is created\n", fi->fib_gateway, fi->fib_dev->name);
#endif
	return fi;
}
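/*
 * Within a chain, entries for one destination stay sorted by ascending
 * metric in fib_add_1() below.  For example, after
 *	route add -net 10.1.0.0 netmask 255.255.0.0 gw A metric 1
 *	route add -net 10.1.0.0 netmask 255.255.0.0 gw B metric 2
 * a lookup finds the gateway A entry first.  Re-adding the A route
 * with metric 3 inserts the new node behind B, and the cleanup pass
 * (starting from dup_fp) then removes the old A entry, so at most one
 * route per (destination, gateway) pair survives.
 */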
static __inline__ void fib_add_1(short flags, __u32 dst, __u32 mask,
	__u32 gw, struct device *dev, unsigned short mss,
	unsigned long window, unsigned short irtt, short metric)
{
	struct fib_node *f, *f1;
	struct fib_node **fp;
	struct fib_node **dup_fp = NULL;
	struct fib_zone * fz;
	struct fib_info * fi;
	int logmask;

	/*
	 * Allocate an entry and fill it in.
	 */

	f = (struct fib_node *) kmalloc(sizeof(struct fib_node), GFP_KERNEL);
	if (f == NULL)
		return;

	memset(f, 0, sizeof(struct fib_node));
	f->fib_dst = dst;
	f->fib_metric = metric;
	f->fib_tos = 0;

	if ((fi = fib_create_info(gw, dev, flags, mss, window, irtt)) == NULL)
	{
		kfree_s(f, sizeof(struct fib_node));
		return;
	}
	f->fib_info = fi;

	logmask = rt_logmask(mask);
	fz = fib_zones[logmask];

	if (!fz)
	{
		int i;

		fz = kmalloc(sizeof(struct fib_zone), GFP_KERNEL);
		if (!fz)
		{
			fib_free_node(f);
			return;
		}
		memset(fz, 0, sizeof(struct fib_zone));
		fz->fz_logmask = logmask;
		fz->fz_mask = mask;
		for (i=logmask-1; i>=0; i--)
			if (fib_zones[i])
				break;
		cli();
		if (i<0)
		{
			fz->fz_next = fib_zone_list;
			fib_zone_list = fz;
		}
		else
		{
			fz->fz_next = fib_zones[i]->fz_next;
			fib_zones[i]->fz_next = fz;
		}
		fib_zones[logmask] = fz;
		sti();
	}

	/*
	 * If zone overgrows RTZ_HASHING_LIMIT, create hash table.
	 */

	if (fz->fz_nent >= RTZ_HASHING_LIMIT && !fz->fz_hash_table && logmask<32)
	{
		struct fib_node ** ht;
#if RT_CACHE_DEBUG >= 2
		printk("fib_add_1: hashing for zone %d started\n", logmask);
#endif
		ht = kmalloc(RTZ_HASH_DIVISOR*sizeof(struct fib_node*), GFP_KERNEL);

		if (ht)
		{
			memset(ht, 0, RTZ_HASH_DIVISOR*sizeof(struct fib_node*));
			cli();
			f1 = fz->fz_list;
			while (f1)
			{
				struct fib_node * next;
				unsigned hash = fz_hash_code(f1->fib_dst, logmask);

				next = f1->fib_next;
				f1->fib_next = ht[hash];
				ht[hash] = f1;
				f1 = next;
			}
			fz->fz_list = NULL;
			fz->fz_hash_table = ht;
			sti();
		}
	}

	if (fz->fz_hash_table)
		fp = &fz->fz_hash_table[fz_hash_code(dst, logmask)];
	else
		fp = &fz->fz_list;

	/*
	 * Scan list to find the first route with the same destination.
	 */

	while ((f1 = *fp) != NULL)
	{
		if (f1->fib_dst == dst)
			break;
		fp = &f1->fib_next;
	}

	/*
	 * Find route with the same destination and less (or equal) metric.
	 */

	while ((f1 = *fp) != NULL && f1->fib_dst == dst)
	{
		if (f1->fib_metric >= metric)
			break;
		/*
		 * Record route with the same destination and gateway,
		 * but less metric. We'll delete it
		 * after instantiation of the new route.
		 */
		if (f1->fib_info->fib_gateway == gw &&
		    (gw || f1->fib_info->fib_dev == dev))
			dup_fp = fp;
		fp = &f1->fib_next;
	}

	/*
	 * Is it already present?
	 */

	if (f1 && f1->fib_metric == metric && f1->fib_info == fi)
	{
		fib_free_node(f);
		return;
	}

	/*
	 * Insert new entry to the list.
	 */

	cli();
	f->fib_next = f1;
	*fp = f;
	if (!fib_loopback && (fi->fib_dev->flags & IFF_LOOPBACK))
		fib_loopback = f;
	sti();
	fz->fz_nent++;
	ip_netlink_msg(RTMSG_NEWROUTE, dst, gw, mask, flags, metric, fi->fib_dev->name);

	/*
	 * Delete route with the same destination and gateway.
	 * Note that we should have at most one such route.
	 */

	if (dup_fp)
		fp = dup_fp;
	else
		fp = &f->fib_next;

	while ((f1 = *fp) != NULL && f1->fib_dst == dst)
	{
		if (f1->fib_info->fib_gateway == gw &&
		    (gw || f1->fib_info->fib_dev == dev))
		{
			cli();
			*fp = f1->fib_next;
			if (fib_loopback == f1)
				fib_loopback = NULL;
			sti();
			ip_netlink_msg(RTMSG_DELROUTE, dst, gw, mask, flags, metric, f1->fib_info->fib_dev->name);
			fib_free_node(f1);
			fz->fz_nent--;
			break;
		}
		fp = &f1->fib_next;
	}
	rt_cache_flush();
	return;
}

static int rt_flush_list(struct fib_node ** fp, struct device *dev)
{
	int found = 0;
	struct fib_node *f;

	while ((f = *fp) != NULL)
	{
		/*
		 * "Magic" device route is allowed to point to loopback,
		 * discard it too.
		 */
		if (f->fib_info->fib_dev != dev &&
		    (f->fib_info->fib_dev != &loopback_dev || f->fib_dst != dev->pa_addr))
		{
			fp = &f->fib_next;
			continue;
		}
		cli();
		*fp = f->fib_next;
		if (fib_loopback == f)
			fib_loopback = NULL;
		sti();
		fib_free_node(f);
		found++;
	}
	return found;
}

static __inline__ void fib_flush_1(struct device *dev)
{
	struct fib_zone *fz;
	int found = 0;

	for (fz = fib_zone_list; fz; fz = fz->fz_next)
	{
		if (fz->fz_hash_table)
		{
			int i;
			int tmp = 0;

			for (i=0; i<RTZ_HASH_DIVISOR; i++)
				tmp += rt_flush_list(&fz->fz_hash_table[i], dev);
			fz->fz_nent -= tmp;
			found += tmp;
		}
		else
		{
			int tmp;

			tmp = rt_flush_list(&fz->fz_list, dev);
			fz->fz_nent -= tmp;
			found += tmp;
		}
	}

	if (found)
		rt_cache_flush();
}
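/*
 * Note that every FIB change above (add, delete, flush) ends in
 * rt_cache_flush(): the route cache never holds an entry derived from
 * a stale FIB.  E.g. deleting one host route empties all
 * RT_HASH_DIVISOR cache buckets, and the cache then refills lazily
 * through ip_rt_slow_route() as traffic flows.
 */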
/*
 * Called from the PROCfs module. This outputs /proc/net/route.
 *
 * We preserve the old format but pad the buffers out. This means that
 * we can spin over the other entries as we read them. Remember the
 * gated BGP4 code could need to read 60,000+ routes on occasion (that's
 * about 7MB of data). To do that ok we will need to also cache the
 * last route we got to (reads will generally be following on from
 * one another without gaps).
 */

int rt_get_info(char *buffer, char **start, off_t offset, int length, int dummy)
{
	struct fib_zone *fz;
	struct fib_node *f;
	int len=0;
	off_t pos=0;
	char temp[129];
	int i;

	pos = 128;

	if (offset<128)
	{
		sprintf(buffer,"%-127s\n","Iface\tDestination\tGateway \tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU\tWindow\tIRTT");
		len = 128;
	}

	while (ip_rt_lock)
		sleep_on(&rt_wait);
	ip_rt_fast_lock();

	for (fz=fib_zone_list; fz; fz = fz->fz_next)
	{
		int maxslot;
		struct fib_node ** fp;

		if (fz->fz_nent == 0)
			continue;

		if (pos + 128*fz->fz_nent <= offset)
		{
			pos += 128*fz->fz_nent;
			len = 0;
			continue;
		}

		if (fz->fz_hash_table)
		{
			maxslot = RTZ_HASH_DIVISOR;
			fp	= fz->fz_hash_table;
		}
		else
		{
			maxslot	= 1;
			fp	= &fz->fz_list;
		}

		for (i=0; i < maxslot; i++, fp++)
		{
			for (f = *fp; f; f = f->fib_next)
			{
				struct fib_info * fi;

				/*
				 * Spin through entries until we are ready.
				 */
				pos += 128;

				if (pos <= offset)
				{
					len=0;
					continue;
				}

				fi = f->fib_info;
				sprintf(temp, "%s\t%08lX\t%08lX\t%02X\t%d\t%lu\t%d\t%08lX\t%d\t%lu\t%u",
					fi->fib_dev->name, (unsigned long)f->fib_dst, (unsigned long)fi->fib_gateway,
					fi->fib_flags, 0, f->fib_use, f->fib_metric,
					(unsigned long)fz->fz_mask, (int)fi->fib_mtu, fi->fib_window, (int)fi->fib_irtt);
				sprintf(buffer+len,"%-127s\n",temp);

				len += 128;
				if (pos >= offset+length)
					goto done;
			}
		}
	}

done:
	ip_rt_unlock();
	wake_up(&rt_wait);

	*start = buffer+len-(pos-offset);
	len = pos - offset;
	if (len>length)
		len = length;
	return len;
}

int rt_cache_get_info(char *buffer, char **start, off_t offset, int length, int dummy)
{
	int len=0;
	off_t pos=0;
	char temp[129];
	struct rtable *r;
	int i;

	pos = 128;

	if (offset<128)
	{
		sprintf(buffer,"%-127s\n","Iface\tDestination\tGateway \tFlags\tRefCnt\tUse\tMetric\tSource\t\tMTU\tWindow\tIRTT\tHH\tARP");
		len = 128;
	}

	while (ip_rt_lock)
		sleep_on(&rt_wait);
	ip_rt_fast_lock();

	for (i = 0; i<RT_HASH_DIVISOR; i++)
	{
		for (r = ip_rt_hash_table[i]; r; r = r->rt_next)
		{
			/*
			 * Spin through entries until we are ready.
			 */
			pos += 128;

			if (pos <= offset)
			{
				len = 0;
				continue;
			}

			sprintf(temp, "%s\t%08lX\t%08lX\t%02X\t%d\t%u\t%d\t%08lX\t%d\t%lu\t%u\t%d\t%1d",
				r->rt_dev->name, (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
				r->rt_flags, r->rt_refcnt, r->rt_use, 0,
				(unsigned long)r->rt_src, (int)r->rt_mtu, r->rt_window, (int)r->rt_irtt,
				r->rt_hh ? r->rt_hh->hh_refcnt : -1, r->rt_hh ? r->rt_hh->hh_uptodate : 0);
			sprintf(buffer+len,"%-127s\n",temp);
			len += 128;
			if (pos >= offset+length)
				goto done;
		}
	}

done:
	ip_rt_unlock();
	wake_up(&rt_wait);

	*start = buffer+len-(pos-offset);
	len = pos-offset;
	if (len>length)
		len = length;
	return len;
}

static void rt_free(struct rtable * rt)
{
	unsigned long flags;

	save_flags(flags);
	cli();
	if (!rt->rt_refcnt)
	{
		struct hh_cache * hh = rt->rt_hh;

		rt->rt_hh = NULL;
		restore_flags(flags);
		if (hh && atomic_dec_and_test(&hh->hh_refcnt))
			kfree_s(hh, sizeof(struct hh_cache));
		kfree_s(rt, sizeof(struct rtable));
		return;
	}
	rt->rt_next = rt_free_queue;
	rt->rt_flags &= ~RTF_UP;
	rt_free_queue = rt;
	ip_rt_bh_mask |= RT_BH_FREE;
#if RT_CACHE_DEBUG >= 2
	printk("rt_free: %08x\n", rt->rt_dst);
#endif
	restore_flags(flags);
}
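/*
 * The fixed 128 byte record size is what makes the /proc offset
 * arithmetic above work: a reader at offset 8320 has consumed the
 * 128 byte header plus 64 full records, so the loop can skip whole
 * zones with one "pos += 128*fz->fz_nent" (or one "pos += 128" per
 * cache entry) instead of formatting every route it passes over.
 */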
/*
 * RT "bottom half" handlers. Called with masked interrupts.
 */

static __inline__ void rt_kick_free_queue(void)
{
	struct rtable *rt, **rtp;

	rtp = &rt_free_queue;

	while ((rt = *rtp) != NULL)
	{
		if (!rt->rt_refcnt)
		{
			struct hh_cache * hh = rt->rt_hh;
#if RT_CACHE_DEBUG >= 2
			__u32 daddr = rt->rt_dst;
#endif
			*rtp = rt->rt_next;
			rt->rt_hh = NULL;
			sti();
			if (hh && atomic_dec_and_test(&hh->hh_refcnt))
				kfree_s(hh, sizeof(struct hh_cache));
			kfree_s(rt, sizeof(struct rtable));
#if RT_CACHE_DEBUG >= 2
			printk("rt_kick_free_queue: %08x is free\n", daddr);
#endif
			cli();
			continue;
		}
		rtp = &rt->rt_next;
	}
}

void ip_rt_run_bh()
{
	unsigned long flags;

	save_flags(flags);
	cli();
	if (ip_rt_bh_mask && !ip_rt_lock)
	{
		if (ip_rt_bh_mask & RT_BH_REDIRECT)
			rt_kick_backlog();

		if (ip_rt_bh_mask & RT_BH_GARBAGE_COLLECT)
		{
			ip_rt_fast_lock();
			ip_rt_bh_mask &= ~RT_BH_GARBAGE_COLLECT;
			sti();
			rt_garbage_collect_1();
			cli();
			ip_rt_fast_unlock();
		}

		if (ip_rt_bh_mask & RT_BH_FREE)
			rt_kick_free_queue();
	}
	restore_flags(flags);
}

void ip_rt_check_expire()
{
	ip_rt_fast_lock();
	if (ip_rt_lock == 1)
	{
		int i;
		struct rtable *rth, **rthp;
		unsigned long flags;
		unsigned long now = jiffies;

		save_flags(flags);
		for (i=0; i<RT_HASH_DIVISOR; i++)
		{
			rthp = &ip_rt_hash_table[i];

			while ((rth = *rthp) != NULL)
			{
				struct rtable * rth_next = rth->rt_next;

				/*
				 * Cleanup aged off entries.
				 */

				cli();
				if (!rth->rt_refcnt && rth->rt_lastuse + RT_CACHE_TIMEOUT < now)
				{
					*rthp = rth_next;
					sti();
					rt_cache_size--;
#if RT_CACHE_DEBUG >= 2
					printk("rt_check_expire clean %02x@%08x\n", i, rth->rt_dst);
#endif
					rt_free(rth);
					continue;
				}
				sti();

				if (!rth_next)
					break;

				/*
				 * LRU ordering.
				 */

				if (rth->rt_lastuse + RT_CACHE_BUBBLE_THRESHOLD < rth_next->rt_lastuse ||
				    (rth->rt_lastuse < rth_next->rt_lastuse &&
				     rth->rt_use < rth_next->rt_use))
				{
#if RT_CACHE_DEBUG >= 2
					printk("rt_check_expire bubbled %02x@%08x<->%08x\n", i, rth->rt_dst, rth_next->rt_dst);
#endif
					cli();
					*rthp = rth_next;
					rth->rt_next = rth_next->rt_next;
					rth_next->rt_next = rth;
					sti();
					rthp = &rth_next->rt_next;
					continue;
				}
				rthp = &rth->rt_next;
			}
		}
		restore_flags(flags);
		rt_kick_free_queue();
	}
	ip_rt_unlock();
}

static void rt_redirect_1(__u32 dst, __u32 gw, struct device *dev)
{
	struct rtable *rt;
	unsigned long hash = ip_rt_hash_code(dst);

	if (gw == dev->pa_addr)
		return;
	if (dev != get_gw_dev(gw))
		return;
	rt = (struct rtable *) kmalloc(sizeof(struct rtable), GFP_ATOMIC);
	if (rt == NULL)
		return;
	memset(rt, 0, sizeof(struct rtable));
	rt->rt_flags = RTF_DYNAMIC | RTF_MODIFIED | RTF_HOST | RTF_GATEWAY | RTF_UP;
	rt->rt_dst = dst;
	rt->rt_dev = dev;
	rt->rt_gateway = gw;
	rt->rt_src = dev->pa_addr;
	rt->rt_mtu = dev->mtu;
#ifdef CONFIG_NO_PATH_MTU_DISCOVERY
	if (dev->mtu > 576)
		rt->rt_mtu = 576;
#endif
	rt->rt_lastuse = jiffies;
	rt->rt_refcnt = 1;
	rt_cache_add(hash, rt);
	ip_rt_put(rt);
	return;
}

static void rt_cache_flush(void)
{
	int i;
	struct rtable * rth, * next;

	for (i=0; i<RT_HASH_DIVISOR; i++)
	{
		int nr=0;

		cli();
		if (!(rth = ip_rt_hash_table[i]))
		{
			sti();
			continue;
		}

		ip_rt_hash_table[i] = NULL;
		sti();

		for (; rth; rth=next)
		{
			next = rth->rt_next;
			rt_cache_size--;
			nr++;
			rth->rt_next = NULL;
			rt_free(rth);
		}
#if RT_CACHE_DEBUG >= 2
		if (nr > 0)
			printk("rt_cache_flush: %d@%02x\n", nr, i);
#endif
	}
#if RT_CACHE_DEBUG >= 1
	if (rt_cache_size)
	{
		printk("rt_cache_flush: bug rt_cache_size=%d\n", rt_cache_size);
		rt_cache_size = 0;
	}
#endif
}
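/*
 * The collector below starts gently: an unreferenced entry must have
 * been idle for RT_CACHE_TIMEOUT/2 ticks (scaled up by rt_refcnt+1
 * for entries someone still holds) before it is reaped.  If a full
 * sweep still leaves rt_cache_size at or above RT_CACHE_SIZE_MAX,
 * `expire' is halved and the sweep repeats, each pass twice as
 * aggressive, so the cache is eventually forced back under its limit.
 */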
static void rt_garbage_collect_1(void)
{
	int i;
	unsigned expire = RT_CACHE_TIMEOUT>>1;
	struct rtable * rth, **rthp;
	unsigned long now = jiffies;

	for (;;)
	{
		for (i=0; i<RT_HASH_DIVISOR; i++)
		{
			if (!ip_rt_hash_table[i])
				continue;
			for (rthp=&ip_rt_hash_table[i]; (rth=*rthp); rthp=&rth->rt_next)
			{
				if (rth->rt_lastuse + expire*(rth->rt_refcnt+1) > now)
					continue;
				rt_cache_size--;
				cli();
				*rthp=rth->rt_next;
				rth->rt_next = NULL;
				sti();
				rt_free(rth);
				break;
			}
		}
		if (rt_cache_size < RT_CACHE_SIZE_MAX)
			return;
		expire >>= 1;
	}
}

static __inline__ void rt_req_enqueue(struct rt_req **q, struct rt_req *rtr)
{
	unsigned long flags;
	struct rt_req * tail;

	save_flags(flags);
	cli();
	tail = *q;
	if (!tail)
		rtr->rtr_next = rtr;
	else
	{
		rtr->rtr_next = tail->rtr_next;
		tail->rtr_next = rtr;
	}
	*q = rtr;
	restore_flags(flags);
	return;
}

/*
 * Caller should mask interrupts.
 */

static __inline__ struct rt_req * rt_req_dequeue(struct rt_req **q)
{
	struct rt_req * rtr;

	if (*q)
	{
		rtr = (*q)->rtr_next;
		(*q)->rtr_next = rtr->rtr_next;
		if (rtr->rtr_next == rtr)
			*q = NULL;
		rtr->rtr_next = NULL;
		return rtr;
	}
	return NULL;
}

/*
 * Called with masked interrupts.
 */

static void rt_kick_backlog()
{
	if (!ip_rt_lock)
	{
		struct rt_req * rtr;

		ip_rt_fast_lock();

		while ((rtr = rt_req_dequeue(&rt_backlog)) != NULL)
		{
			sti();
			rt_redirect_1(rtr->dst, rtr->gw, rtr->dev);
			kfree_s(rtr, sizeof(struct rt_req));
			cli();
		}

		ip_rt_bh_mask &= ~RT_BH_REDIRECT;

		ip_rt_fast_unlock();
	}
}

/*
 * rt_{del|add|flush} called only from USER process. Waiting is OK.
 */

static int rt_del(__u32 dst, __u32 mask,
		struct device * dev, __u32 gtw, short rt_flags, short metric)
{
	int retval;

	while (ip_rt_lock)
		sleep_on(&rt_wait);
	ip_rt_fast_lock();
	retval = fib_del_1(dst, mask, dev, gtw, rt_flags, metric);
	ip_rt_unlock();
	wake_up(&rt_wait);
	return retval;
}

static void rt_add(short flags, __u32 dst, __u32 mask,
	__u32 gw, struct device *dev, unsigned short mss,
	unsigned long window, unsigned short irtt, short metric)
{
	while (ip_rt_lock)
		sleep_on(&rt_wait);
	ip_rt_fast_lock();
	fib_add_1(flags, dst, mask, gw, dev, mss, window, irtt, metric);
	ip_rt_unlock();
	wake_up(&rt_wait);
}

void ip_rt_flush(struct device *dev)
{
	while (ip_rt_lock)
		sleep_on(&rt_wait);
	ip_rt_fast_lock();
	fib_flush_1(dev);
	ip_rt_unlock();
	wake_up(&rt_wait);
}
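/*
 * ip_rt_redirect() below queues work on rt_backlog when the table is
 * locked.  rt_backlog is a circular list whose queue pointer addresses
 * the *tail*, which makes both ends O(1).  After enqueueing rtr1,
 * rtr2, rtr3 in that order:
 *
 *	*q = rtr3,  rtr3->rtr_next = rtr1 (the head),
 *	rtr1->rtr_next = rtr2,  rtr2->rtr_next = rtr3
 *
 * rt_req_enqueue() links a new request after the tail and makes it the
 * new tail; rt_req_dequeue() unlinks tail->rtr_next, the head.
 */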
/*
 * Called by ICMP module.
 */

void ip_rt_redirect(__u32 src, __u32 dst, __u32 gw, struct device *dev)
{
	struct rt_req * rtr;
	struct rtable * rt;

	rt = ip_rt_route(dst, 0);
	if (!rt)
		return;

	if (rt->rt_gateway != src ||
	    rt->rt_dev != dev ||
	    ((gw^dev->pa_addr)&dev->pa_mask) ||
	    ip_chk_addr(gw))
	{
		ip_rt_put(rt);
		return;
	}
	ip_rt_put(rt);

	ip_rt_fast_lock();
	if (ip_rt_lock == 1)
	{
		rt_redirect_1(dst, gw, dev);
		ip_rt_unlock();
		return;
	}

	rtr = kmalloc(sizeof(struct rt_req), GFP_ATOMIC);
	if (rtr)
	{
		rtr->dst = dst;
		rtr->gw = gw;
		rtr->dev = dev;
		rt_req_enqueue(&rt_backlog, rtr);
		ip_rt_bh_mask |= RT_BH_REDIRECT;
	}
	ip_rt_unlock();
}

static __inline__ void rt_garbage_collect(void)
{
	if (ip_rt_lock == 1)
	{
		rt_garbage_collect_1();
		return;
	}
	ip_rt_bh_mask |= RT_BH_GARBAGE_COLLECT;
}

static void rt_cache_add(unsigned hash, struct rtable * rth)
{
	unsigned long	flags;
	struct rtable	**rthp;
	__u32		daddr = rth->rt_dst;
	unsigned long	now = jiffies;

#if RT_CACHE_DEBUG >= 2
	if (ip_rt_lock != 1)
	{
		printk("rt_cache_add: ip_rt_lock==%d\n", ip_rt_lock);
		return;
	}
#endif

	save_flags(flags);

	if (rth->rt_dev->header_cache_bind)
	{
		struct rtable * rtg = rth;

		if (rth->rt_gateway != daddr)
		{
			ip_rt_fast_unlock();
			rtg = ip_rt_route(rth->rt_gateway, 0);
			ip_rt_fast_lock();
		}

		if (rtg)
		{
			if (rtg == rth)
				rtg->rt_dev->header_cache_bind(&rtg->rt_hh, rtg->rt_dev, ETH_P_IP, rtg->rt_dst);
			else
			{
				if (rtg->rt_hh)
					atomic_inc(&rtg->rt_hh->hh_refcnt);
				rth->rt_hh = rtg->rt_hh;
				ip_rt_put(rtg);
			}
		}
	}

	if (rt_cache_size >= RT_CACHE_SIZE_MAX)
		rt_garbage_collect();

	cli();
	rth->rt_next = ip_rt_hash_table[hash];
#if RT_CACHE_DEBUG >= 2
	if (rth->rt_next)
	{
		struct rtable * trth;

		printk("rt_cache @%02x: %08x", hash, daddr);
		for (trth=rth->rt_next; trth; trth=trth->rt_next)
			printk(" . %08x", trth->rt_dst);
		printk("\n");
	}
#endif
	ip_rt_hash_table[hash] = rth;
	rthp = &rth->rt_next;
	sti();
	rt_cache_size++;

	/*
	 * Cleanup duplicate (and aged off) entries.
	 */

	while ((rth = *rthp) != NULL)
	{
		cli();
		if ((!rth->rt_refcnt && rth->rt_lastuse + RT_CACHE_TIMEOUT < now) ||
		    rth->rt_dst == daddr)
		{
			*rthp = rth->rt_next;
			rt_cache_size--;
			sti();
#if RT_CACHE_DEBUG >= 2
			printk("rt_cache clean %02x@%08x\n", hash, rth->rt_dst);
#endif
			rt_free(rth);
			continue;
		}
		sti();
		rthp = &rth->rt_next;
	}
	restore_flags(flags);
}
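/*
 * Hardware header note: for a gatewayed route rt_cache_add() looks up
 * the gateway's own cache entry and shares its hh_cache by bumping
 * hh_refcnt, so e.g. a hundred cached destinations behind one router
 * reuse a single prebuilt ethernet header (and its ARP binding)
 * instead of keeping a copy apiece.
 */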
/*
 * RT should be already locked.
 *
 * We could improve this by keeping a chain of say 32 struct rtable's
 * last freed for fast recycling.
 */

struct rtable * ip_rt_slow_route (__u32 daddr, int local)
{
	unsigned hash = ip_rt_hash_code(daddr)^local;
	struct rtable * rth;
	struct fib_node * f;
	struct fib_info * fi;
	__u32 saddr;

#if RT_CACHE_DEBUG >= 2
	printk("rt_cache miss @%08x\n", daddr);
#endif

	rth = kmalloc(sizeof(struct rtable), GFP_ATOMIC);
	if (!rth)
	{
		ip_rt_unlock();
		return NULL;
	}

	if (local)
		f = fib_lookup_local(daddr);
	else
		f = fib_lookup (daddr);

	if (f)
	{
		fi = f->fib_info;
		f->fib_use++;
	}

	if (!f || (fi->fib_flags & RTF_REJECT))
	{
#ifdef CONFIG_KERNELD
		char wanted_route[20];
#endif
#if RT_CACHE_DEBUG >= 2
		printk("rt_route failed @%08x\n", daddr);
#endif
		ip_rt_unlock();
		kfree_s(rth, sizeof(struct rtable));
#ifdef CONFIG_KERNELD
		daddr=ntohl(daddr);
		sprintf(wanted_route, "%d.%d.%d.%d",
			(int)(daddr >> 24) & 0xff, (int)(daddr >> 16) & 0xff,
			(int)(daddr >> 8) & 0xff, (int)daddr & 0xff);
		kerneld_route(wanted_route);	/* Dynamic route request */
#endif
		return NULL;
	}

	saddr = fi->fib_dev->pa_addr;

	if (daddr == fi->fib_dev->pa_addr)
	{
		f->fib_use--;
		if ((f = fib_loopback) != NULL)
		{
			f->fib_use++;
			fi = f->fib_info;
		}
	}

	if (!f)
	{
		ip_rt_unlock();
		kfree_s(rth, sizeof(struct rtable));
		return NULL;
	}

	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_lastuse	= jiffies;
	rth->rt_refcnt	= 1;
	rth->rt_use	= 1;
	rth->rt_next	= NULL;
	rth->rt_hh	= NULL;
	rth->rt_gateway	= fi->fib_gateway;
	rth->rt_dev	= fi->fib_dev;
	rth->rt_mtu	= fi->fib_mtu;
	rth->rt_window	= fi->fib_window;
	rth->rt_irtt	= fi->fib_irtt;
	rth->rt_tos	= f->fib_tos;
	rth->rt_flags	= fi->fib_flags | RTF_HOST;
	if (local)
		rth->rt_flags |= RTF_LOCAL;

	if (!(rth->rt_flags & RTF_GATEWAY))
		rth->rt_gateway = rth->rt_dst;

	/*
	 * Multicast or limited broadcast is never gatewayed.
	 */
	if (MULTICAST(daddr) || daddr == 0xFFFFFFFF)
		rth->rt_gateway = rth->rt_dst;

	if (ip_rt_lock == 1)
		rt_cache_add(hash, rth);
	else
	{
		rt_free(rth);
#if RT_CACHE_DEBUG >= 1
		printk(KERN_DEBUG "rt_cache: route to %08x was born dead\n", daddr);
#endif
	}

	ip_rt_unlock();
	return rth;
}

void ip_rt_put(struct rtable * rt)
{
	if (rt)
		atomic_dec(&rt->rt_refcnt);
}

struct rtable * ip_rt_route(__u32 daddr, int local)
{
	struct rtable * rth;

	ip_rt_fast_lock();

	for (rth=ip_rt_hash_table[ip_rt_hash_code(daddr)^local]; rth; rth=rth->rt_next)
	{
		if (rth->rt_dst == daddr)
		{
			rth->rt_lastuse = jiffies;
			atomic_inc(&rth->rt_use);
			atomic_inc(&rth->rt_refcnt);
			ip_rt_unlock();
			return rth;
		}
	}
	return ip_rt_slow_route (daddr, local);
}
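/*
 * Typical caller pattern (a sketch; the IP output paths do essentially
 * this):
 *
 *	struct rtable *rt = ip_rt_route(daddr, 0);
 *	if (!rt)
 *		return -ENETUNREACH;	 (no route; time for an ICMP error)
 *	... use rt->rt_dev, rt->rt_gateway, rt->rt_mtu ...
 *	ip_rt_put(rt);			 (drop the reference taken above)
 *
 * The fast path is a single hash chain walk under ip_rt_fast_lock();
 * only a cache miss falls through to ip_rt_slow_route() and the full
 * FIB walk.
 */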
/*
 * Process a route add request from the user, or from a kernel
 * task.
 */

int ip_rt_new(struct rtentry *r)
{
	int err;
	char * devname;
	struct device * dev = NULL;
	unsigned long flags;
	__u32 daddr, mask, gw;
	short metric;

	/*
	 * If a device is specified find it.
	 */

	if ((devname = r->rt_dev) != NULL)
	{
		err = getname(devname, &devname);
		if (err)
			return err;
		dev = dev_get(devname);
		putname(devname);
		if (!dev)
			return -ENODEV;
	}

	/*
	 * If the destination is not an INET address, don't allow it.
	 */

	if (r->rt_dst.sa_family != AF_INET)
		return -EAFNOSUPPORT;

	/*
	 * Make local copies of the important bits.
	 * We decrement the metric by one for BSD compatibility.
	 */

	flags = r->rt_flags;
	daddr = (__u32) ((struct sockaddr_in *) &r->rt_dst)->sin_addr.s_addr;
	mask  = (__u32) ((struct sockaddr_in *) &r->rt_genmask)->sin_addr.s_addr;
	gw    = (__u32) ((struct sockaddr_in *) &r->rt_gateway)->sin_addr.s_addr;
	metric = r->rt_metric > 0 ? r->rt_metric - 1 : 0;

	/*
	 * BSD emulation: Permits route add someroute gw one-of-my-addresses
	 * to indicate which iface. Not as clean as the nice Linux dev technique
	 * but people keep using it...  (and gated likes it ;))
	 */

	if (!dev && (flags & RTF_GATEWAY))
	{
		struct device *dev2;

		for (dev2 = dev_base ; dev2 != NULL ; dev2 = dev2->next)
		{
			if ((dev2->flags & IFF_UP) && dev2->pa_addr == gw)
			{
				flags &= ~RTF_GATEWAY;
				dev = dev2;
				break;
			}
		}
	}

	if (flags & RTF_HOST)
		mask = 0xffffffff;
	else if (mask && r->rt_genmask.sa_family != AF_INET)
		return -EAFNOSUPPORT;

	if (flags & RTF_GATEWAY)
	{
		if (r->rt_gateway.sa_family != AF_INET)
			return -EAFNOSUPPORT;

		/*
		 * Don't try to add a gateway we can't reach..
		 * Tunnel devices are exempt from this rule.
		 */

		if (!dev)
			dev = get_gw_dev(gw);
		else if (dev != get_gw_dev(gw) && dev->type != ARPHRD_TUNNEL)
			return -EINVAL;
		if (!dev)
			return -ENETUNREACH;
	}
	else
	{
		gw = 0;
		if (!dev)
			dev = ip_dev_bynet(daddr, mask);
		if (!dev)
			return -ENETUNREACH;
		if (!mask)
		{
			if (((daddr ^ dev->pa_addr) & dev->pa_mask) == 0)
				mask = dev->pa_mask;
		}
	}

#ifndef CONFIG_IP_CLASSLESS
	if (!mask)
		mask = ip_get_mask(daddr);
#endif

	if (bad_mask(mask, daddr))
		return -EINVAL;

	/*
	 * Add the route.
	 */

	rt_add(flags, daddr, mask, gw, dev, r->rt_mss, r->rt_window, r->rt_irtt, metric);
	return 0;
}

/*
 * Remove a route, as requested by the user.
 */

int ip_rt_kill(struct rtentry *r)
{
	struct sockaddr_in *trg;
	struct sockaddr_in *msk;
	struct sockaddr_in *gtw;
	char *devname;
	int err;
	struct device * dev = NULL;

	trg = (struct sockaddr_in *) &r->rt_dst;
	msk = (struct sockaddr_in *) &r->rt_genmask;
	gtw = (struct sockaddr_in *) &r->rt_gateway;

	if ((devname = r->rt_dev) != NULL)
	{
		err = getname(devname, &devname);
		if (err)
			return err;
		dev = dev_get(devname);
		putname(devname);
		if (!dev)
			return -ENODEV;
	}

	/*
	 * The metric can become negative here if it wasn't filled in,
	 * but that's a fortunate accident: a negative metric acts as a
	 * wildcard in rt_del().
	 */

	err=rt_del((__u32)trg->sin_addr.s_addr, (__u32)msk->sin_addr.s_addr, dev,
		(__u32)gtw->sin_addr.s_addr, r->rt_flags, r->rt_metric - 1);
	return err;
}

/*
 * Handle IP routing ioctl calls. These are used to manipulate the
 * routing tables.
 */

int ip_rt_ioctl(unsigned int cmd, void *arg)
{
	int err;
	struct rtentry rt;

	switch(cmd)
	{
		case SIOCADDRT:		/* Add a route */
		case SIOCDELRT:		/* Delete a route */
			if (!suser())
				return -EPERM;
			err=verify_area(VERIFY_READ, arg, sizeof(struct rtentry));
			if (err)
				return err;
			memcpy_fromfs(&rt, arg, sizeof(struct rtentry));
			return (cmd == SIOCDELRT) ? ip_rt_kill(&rt) : ip_rt_new(&rt);
	}

	return -EINVAL;
}

void ip_rt_advice(struct rtable **rp, int advice)
{
	/* Thanks! */
	return;
}

void ip_rt_update(int event, struct device *dev)
{
/*
 * This causes too much grief to do now.
 */
#ifdef COMING_IN_2_1
	if (event == NETDEV_UP)
		rt_add(RTF_HOST|RTF_UP, dev->pa_addr, ~0, 0, dev, 0, 0, 0, 0);
	else if (event == NETDEV_DOWN)
		rt_del(dev->pa_addr, ~0, dev, 0, RTF_HOST|RTF_UP, 0);
#endif
}
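#if 0
/*
 * User level view (a sketch only, never compiled here; build it as a
 * separate user program).  This is how SIOCADDRT reaches ip_rt_ioctl()
 * above; the 193.233.7.x addresses are the placeholder values used in
 * the comments throughout this file.
 */
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <net/route.h>
#include <string.h>

int add_net_route(int fd)	/* fd: any AF_INET socket */
{
	struct rtentry rt;
	struct sockaddr_in *sin;

	memset(&rt, 0, sizeof(rt));

	sin = (struct sockaddr_in *) &rt.rt_dst;
	sin->sin_family = AF_INET;
	sin->sin_addr.s_addr = inet_addr("193.233.7.0");

	sin = (struct sockaddr_in *) &rt.rt_genmask;
	sin->sin_family = AF_INET;
	sin->sin_addr.s_addr = inet_addr("255.255.255.0");

	sin = (struct sockaddr_in *) &rt.rt_gateway;
	sin->sin_family = AF_INET;
	sin->sin_addr.s_addr = inet_addr("193.233.7.65");

	rt.rt_flags = RTF_UP | RTF_GATEWAY;
	rt.rt_metric = 1;	/* ip_rt_new() stores metric-1 == 0 */

	/* rt_dev left NULL: the kernel picks the device via the gateway. */
	return ioctl(fd, SIOCADDRT, &rt);	/* needs superuser */
}
#endif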