/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Version:     @(#)route.c     1.0.14  05/31/93
 *
 * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K 
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD;
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *              
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#include <linux/config.h>
#include <asm/segment.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/icmp.h>
#include <net/netlink.h>
#ifdef CONFIG_KERNELD
#include <linux/kerneld.h>
#endif

/*
 * Forwarding Information Base definitions.
 */

struct fib_node
{
        struct fib_node         *fib_next;      /* next entry in this zone's chain/list     */
        __u32                   fib_dst;        /* destination address (network byte order) */
        unsigned long           fib_use;        /* use count, reported via /proc/net/route  */
        struct fib_info         *fib_info;      /* shared gateway/device/metric-of-path data */
        short                   fib_metric;     /* route metric; chains kept sorted on it   */
        unsigned char           fib_tos;        /* type of service (always set to 0 here)   */
};

/*
 * This structure contains data shared by many routes.
 */     

struct fib_info
{
        struct fib_node         *fib_next;      /* doubly linked global fib_info_list   */
        struct fib_info         *fib_prev;
        __u32                   fib_gateway;    /* gateway address (network byte order) */
        struct device           *fib_dev;       /* output interface                     */
        int                     fib_refcnt;     /* number of fib_nodes sharing this     */
        unsigned long           fib_window;     /* TCP window clamp (0 = default)       */
        unsigned short          fib_flags;      /* RTF_* flags                          */
        unsigned short          fib_mtu;        /* path MTU / MSS for this route        */
        unsigned short          fib_irtt;       /* initial TCP round trip time          */
};

/*
 * One zone per netmask length.  Small zones are a plain linked list
 * (fz_list); past RTZ_HASHING_LIMIT entries they are converted to a
 * hash table (fz_hash_table) -- exactly one of the two is in use.
 */
struct fib_zone
{
        struct fib_zone *fz_next;       /* next zone, longest netmask first     */
        struct fib_node **fz_hash_table;/* hash buckets, NULL while list-based  */
        struct fib_node *fz_list;       /* linear list, NULL once hashed        */
        int             fz_nent;        /* number of routes in this zone        */
        int             fz_logmask;     /* number of host bits (0..32)          */
        __u32           fz_mask;        /* netmask (network byte order)         */
};

static struct fib_zone  *fib_zones[33];         /* zone per mask length, indexed by logmask     */
static struct fib_zone  *fib_zone_list;         /* zones chained longest netmask first          */
static struct fib_node  *fib_loopback = NULL;   /* cached pointer to the loopback route         */
static struct fib_info  *fib_info_list;         /* head of the shared fib_info list             */

/*
 * Backlogging.
 */

#define RT_BH_REDIRECT          0
#define RT_BH_GARBAGE_COLLECT   1
#define RT_BH_FREE              2

/*
 * A deferred routing request queued on rt_backlog (presumably ICMP
 * redirect work handled by rt_kick_backlog -- the consumer is outside
 * this chunk; confirm against the RT_BH_REDIRECT handler).
 */
struct rt_req
{
        struct rt_req * rtr_next;       /* next queued request          */
        struct device *dev;             /* interface the event came in on */
        __u32 dst;                      /* destination address          */
        __u32 gw;                       /* (new) gateway address        */
        unsigned char tos;              /* type of service              */
};

int                     ip_rt_lock;     /* routing lock; >0 while held (see ip_rt_fast_lock)    */
unsigned                ip_rt_bh_mask;  /* pending bottom-half work, RT_BH_* bits               */
static struct rt_req    *rt_backlog;    /* queue of deferred requests (drained by backlog bh)   */

/*
 * Route cache.
 */

struct rtable           *ip_rt_hash_table[RT_HASH_DIVISOR];     /* the route cache buckets      */
static int              rt_cache_size;                          /* current number of cached routes */
static struct rtable    *rt_free_queue;                         /* entries awaiting deferred free */
struct wait_queue       *rt_wait;                               /* sleepers waiting for ip_rt_lock */

static void rt_kick_backlog(void);
static void rt_cache_add(unsigned hash, struct rtable * rth);
static void rt_cache_flush(void);
static void rt_garbage_collect_1(void);

/* 
 * Evaluate mask length.
 */

/*
 * Number of host bits implied by a netmask (given in network byte
 * order): 32 for the empty mask, otherwise the count of trailing
 * zero bits of the host-order value.
 */

static __inline__ int rt_logmask(__u32 mask)
{
        __u32 host_mask = ntohl(mask);

        if (host_mask == 0)
                return 32;
        return ffz(~host_mask);
}

/* 
 * Create mask from length.
 */

/*
 * Build a network-byte-order netmask from a host-bit count
 * (the inverse of rt_logmask).
 */

static __inline__ __u32 rt_mask(int logmask)
{
        return (logmask >= 32) ? 0 : htonl(~((1<<logmask)-1));
}

/*
 * Hash a destination within a zone: drop the host bits, then use the
 * common route-cache hash on the remaining network part.
 */
static __inline__ unsigned fz_hash_code(__u32 dst, int logmask)
{
        __u32 net_part = ntohl(dst) >> logmask;

        return ip_rt_hash_code(net_part);
}

/*
 * Free FIB node.
 */

static void fib_free_node(struct fib_node * f)
{
        struct fib_info * fi = f->fib_info;
        if (!--fi->fib_refcnt)
        {
#if RT_CACHE_DEBUG >= 2
                printk("fib_free_node: fi %08x/%s is free\n", fi->fib_gateway, fi->fib_dev->name);
#endif
                if (fi->fib_next)
                        fi->fib_next->fib_prev = fi->fib_prev;
                if (fi->fib_prev)
                        fi->fib_prev->fib_next = fi->fib_next;
                if (fi == fib_info_list)
                        fib_info_list = fi->fib_next;
        }
        kfree_s(f, sizeof(struct fib_node));
}

/*
 * Find gateway route by address.
 */

/*
 *      Find the route used to reach a gateway address.  Zones are
 *      scanned longest netmask first; the first match decides.  A
 *      match that itself points at a gateway means the "gateway" is
 *      not directly reachable, so the lookup fails.
 */

static struct fib_node * fib_lookup_gateway(__u32 dst)
{
        struct fib_zone * zone;

        for (zone = fib_zone_list; zone; zone = zone->fz_next) 
        {
                struct fib_node * node;

                node = zone->fz_hash_table ?
                        zone->fz_hash_table[fz_hash_code(dst, zone->fz_logmask)] :
                        zone->fz_list;

                while (node)
                {
                        if (!((dst ^ node->fib_dst) & zone->fz_mask))
                        {
                                /* First (longest) match is authoritative. */
                                if (node->fib_info->fib_flags & RTF_GATEWAY)
                                        return NULL;
                                return node;
                        }
                        node = node->fib_next;
                }
        }
        return NULL;
}

/*
 * Find local route by address.
 * FIXME: I use "longest match" principle. If destination
 *        has some non-local route, I'll not search shorter matches.
 *        It's possible, I'm wrong, but I wanted to prevent following
 *        situation:
 *      route add 193.233.7.128 netmask 255.255.255.192 gw xxxxxx
 *      route add 193.233.7.0   netmask 255.255.255.0 eth1
 *        (Two ethernets connected by serial line, one is small and other is large)
 *        Host 193.233.7.129 is locally unreachable,
 *        but old (<=1.3.37) code will send packets destined for it to eth1.
 *
 */

/*
 *      Find a direct (non-gateway) route to dst, longest match first.
 *      If the best-matching zone only holds gateway routes for dst we
 *      stop: a shorter prefix must not override a longer one (see the
 *      FIXME discussion above the original for the motivating case).
 */

static struct fib_node * fib_lookup_local(__u32 dst)
{
        struct fib_zone * zone;

        for (zone = fib_zone_list; zone; zone = zone->fz_next) 
        {
                struct fib_node * node;
                int gateway_match_seen = 0;

                if (zone->fz_hash_table)
                        node = zone->fz_hash_table[fz_hash_code(dst, zone->fz_logmask)];
                else
                        node = zone->fz_list;

                for ( ; node; node = node->fib_next)
                {
                        if ((dst ^ node->fib_dst) & zone->fz_mask)
                                continue;
                        if (!(node->fib_info->fib_flags & RTF_GATEWAY))
                                return node;
                        gateway_match_seen = 1;
                }
                /* Longest prefix matched, but only via gateways: give up. */
                if (gateway_match_seen)
                        return NULL;
        }
        return NULL;
}

/*
 * Main lookup routine.
 *      IMPORTANT NOTE: this algorithm has small difference from <=1.3.37 visible
 *      by user. It doesn't route non-CIDR broadcasts by default.
 *
 *      F.e.
 *              ifconfig eth0 193.233.7.65 netmask 255.255.255.192 broadcast 193.233.7.255
 *      is valid, but if you really are not able (not allowed, do not want) to
 *      use CIDR compliant broadcast 193.233.7.127, you should add host route:
 *              route add -host 193.233.7.255 eth0
 */

/*
 *      General-purpose FIB lookup: return the first (longest-prefix)
 *      route matching dst, gateway or not.
 */

static struct fib_node * fib_lookup(__u32 dst)
{
        struct fib_zone * zone;
        struct fib_node * node;

        for (zone = fib_zone_list; zone; zone = zone->fz_next) 
        {
                node = zone->fz_hash_table ?
                        zone->fz_hash_table[fz_hash_code(dst, zone->fz_logmask)] :
                        zone->fz_list;

                while (node)
                {
                        if (!((dst ^ node->fib_dst) & zone->fz_mask))
                                return node;
                        node = node->fib_next;
                }
        }
        return NULL;
}

/*
 *      Device a gateway is reachable through, or NULL when the
 *      gateway has no direct route.
 */
static __inline__ struct device * get_gw_dev(__u32 gw)
{
        struct fib_node * node = fib_lookup_gateway(gw);

        return node ? node->fib_info->fib_dev : NULL;
}

/*
 *      Check if a mask is acceptable.
 */
 
/*
 *      Validate a netmask against an address: the address must have
 *      no bits outside the mask, and the mask itself (in host order)
 *      must be a contiguous run of high bits.  Returns 1 when bad.
 */
 
static inline int bad_mask(__u32 mask, __u32 addr)
{
        __u32 host_bits = ~mask;

        if (addr & host_bits)
                return 1;
        host_bits = ntohl(host_bits);
        /* A contiguous mask inverts to 2^k - 1; anything else fails. */
        return (host_bits & (host_bits+1)) ? 1 : 0;
}


/*
 *      Delete from one chain every route to dst that also matches the
 *      optional gateway / metric / device filters (a zero/negative
 *      filter value means "any").  Returns the number removed.
 *      The mask argument is only reported in the netlink message.
 */
static int fib_del_list(struct fib_node **fp, __u32 dst,
                struct device * dev, __u32 gtw, short flags, short metric, __u32 mask)
{
        struct fib_node *f;
        int found=0;

        while((f = *fp) != NULL) 
        {
                struct fib_info * fi = f->fib_info;

                /*
                 *      Make sure the destination and netmask match.
                 *      metric, gateway and device are also checked
                 *      if they were specified.
                 */
                if (f->fib_dst != dst ||
                    (gtw && fi->fib_gateway != gtw) ||
                    (metric >= 0 && f->fib_metric != metric) ||
                    (dev && fi->fib_dev != dev) )
                {
                        fp = &f->fib_next;
                        continue;
                }
                /* Unlink with interrupts off so lookups never see a
                   half-removed entry; clear the loopback shortcut too. */
                cli();
                *fp = f->fib_next;
                if (fib_loopback == f)
                        fib_loopback = NULL;
                sti();
                ip_netlink_msg(RTMSG_DELROUTE, dst, gtw, mask, flags, metric, fi->fib_dev->name);
                fib_free_node(f);
                found++;
        }
        return found;
}

/*
 *      Delete matching routes.  With a netmask only the corresponding
 *      zone is searched; without one, every zone is.  Returns 0 when
 *      at least one route was removed (and flushes the route cache),
 *      -ESRCH otherwise.
 */
static __inline__ int fib_del_1(__u32 dst, __u32 mask,
                struct device * dev, __u32 gtw, short flags, short metric)
{
        struct fib_zone *fz;
        int found = 0;

        if (!mask)
        {
                /* No mask given: try the matching chain of every zone. */
                for (fz = fib_zone_list; fz; fz = fz->fz_next)
                {
                        struct fib_node **head;
                        int n;

                        head = fz->fz_hash_table ?
                                &fz->fz_hash_table[fz_hash_code(dst, fz->fz_logmask)] :
                                &fz->fz_list;
                        n = fib_del_list(head, dst, dev, gtw, flags, metric, mask);
                        fz->fz_nent -= n;
                        found += n;
                }
        }
        else if ((fz = fib_zones[rt_logmask(mask)]) != NULL)
        {
                struct fib_node **head;

                head = fz->fz_hash_table ?
                        &fz->fz_hash_table[fz_hash_code(dst, fz->fz_logmask)] :
                        &fz->fz_list;
                found = fib_del_list(head, dst, dev, gtw, flags, metric, mask);
                fz->fz_nent -= found;
        }

        if (!found)
                return -ESRCH;

        rt_cache_flush();
        return 0;
}


/*
 *      Look up or create the fib_info describing a gateway/device pair
 *      plus per-route MSS, window and irtt.  Identical entries are
 *      shared: an existing match just gains a reference.  Returns NULL
 *      on allocation failure.
 */
static struct fib_info * fib_create_info(__u32 gw, struct device * dev,
                                         unsigned short flags, unsigned short mss,
                                         unsigned long window, unsigned short irtt)
{
        struct fib_info * fi;

        /* Fall back to the device MTU when no MSS was supplied. */
        if (!(flags & RTF_MSS))
        {
                mss = dev->mtu;
#ifdef CONFIG_NO_PATH_MTU_DISCOVERY
                /*
                 *      If MTU was not specified, use default.
                 *      If you want to increase MTU for some net (local subnet)
                 *      use "route add .... mss xxx".
                 *
                 *      The MTU isn't currently always used and computed as it
                 *      should be as far as I can tell. [Still verifying this is right]
                 */
                if ((flags & RTF_GATEWAY) && mss > 576)
                        mss = 576;
#endif
        }
        /* Unspecified window/irtt default to 0 ("use system default"). */
        if (!(flags & RTF_WINDOW))
                window = 0;
        if (!(flags & RTF_IRTT))
                irtt = 0;

        /* Share an existing fib_info if every parameter matches. */
        for (fi=fib_info_list; fi; fi = fi->fib_next)
        {
                if (fi->fib_gateway != gw ||
                    fi->fib_dev != dev  ||
                    fi->fib_flags != flags ||
                    fi->fib_mtu != mss ||
                    fi->fib_window != window ||
                    fi->fib_irtt != irtt)
                        continue;
                fi->fib_refcnt++;
#if RT_CACHE_DEBUG >= 2
                printk("fib_create_info: fi %08x/%s is duplicate\n", fi->fib_gateway, fi->fib_dev->name);
#endif
                return fi;
        }
        /* No match: allocate a fresh entry and link it at the list head. */
        fi = (struct fib_info*)kmalloc(sizeof(struct fib_info), GFP_KERNEL);
        if (!fi)
                return NULL;
        memset(fi, 0, sizeof(struct fib_info));
        fi->fib_flags = flags;
        fi->fib_dev = dev;
        fi->fib_gateway = gw;
        fi->fib_mtu = mss;
        fi->fib_window = window;
        fi->fib_refcnt++;
        fi->fib_next = fib_info_list;
        fi->fib_prev = NULL;
        fi->fib_irtt = irtt;
        if (fib_info_list)
                fib_info_list->fib_prev = fi;
        fib_info_list = fi;
#if RT_CACHE_DEBUG >= 2
        printk("fib_create_info: fi %08x/%s is created\n", fi->fib_gateway, fi->fib_dev->name);
#endif
        return fi;
}


/*
 *      Add a route (dst/mask via gw on dev) to the forwarding table.
 *      Creates the netmask's zone on demand, converts a zone to a
 *      hash table once it outgrows RTZ_HASHING_LIMIT, keeps chains
 *      sorted by metric, and removes a superseded route with the
 *      same destination and gateway.  Failures are silent (kmalloc
 *      returning NULL simply drops the request).
 */
static __inline__ void fib_add_1(short flags, __u32 dst, __u32 mask,
        __u32 gw, struct device *dev, unsigned short mss,
        unsigned long window, unsigned short irtt, short metric)
{
        struct fib_node *f, *f1;
        struct fib_node **fp;
        struct fib_node **dup_fp = NULL;
        struct fib_zone * fz;
        struct fib_info * fi;
        int logmask;

        /*
         *      Allocate an entry and fill it in.
         */
         
        f = (struct fib_node *) kmalloc(sizeof(struct fib_node), GFP_KERNEL);
        if (f == NULL)
                return;

        memset(f, 0, sizeof(struct fib_node));
        f->fib_dst = dst;
        f->fib_metric = metric;
        f->fib_tos    = 0;

        if  ((fi = fib_create_info(gw, dev, flags, mss, window, irtt)) == NULL)
        {
                kfree_s(f, sizeof(struct fib_node));
                return;
        }
        f->fib_info = fi;

        logmask = rt_logmask(mask);
        fz = fib_zones[logmask];


        if (!fz)
        {
                int i;
                fz = kmalloc(sizeof(struct fib_zone), GFP_KERNEL);
                if (!fz)
                {
                        fib_free_node(f);
                        return;
                }
                memset(fz, 0, sizeof(struct fib_zone));
                fz->fz_logmask = logmask;
                fz->fz_mask = mask;
                /*
                 *      Link the new zone so that fib_zone_list stays
                 *      ordered longest netmask first (longest match wins
                 *      in the lookup routines above).
                 */
                for (i=logmask-1; i>=0; i--)
                        if (fib_zones[i])
                                break;
                cli();
                if (i<0)
                {
                        fz->fz_next = fib_zone_list;
                        fib_zone_list = fz;
                }
                else
                {
                        fz->fz_next = fib_zones[i]->fz_next;
                        fib_zones[i]->fz_next = fz;
                }
                fib_zones[logmask] = fz;
                sti();
        }

        /*
         * If zone overgrows RTZ_HASHING_LIMIT, create hash table.
         */

        if (fz->fz_nent >= RTZ_HASHING_LIMIT && !fz->fz_hash_table && logmask<32)
        {
                struct fib_node ** ht;
#if RT_CACHE_DEBUG >= 2
                printk("fib_add_1: hashing for zone %d started\n", logmask);
#endif
                /*
                 *      Fix: the allocation used sizeof(struct rtable*)
                 *      although the table holds struct fib_node pointers
                 *      (same size on all supported targets, wrong type).
                 */
                ht = kmalloc(RTZ_HASH_DIVISOR*sizeof(struct fib_node*), GFP_KERNEL);

                if (ht)
                {
                        memset(ht, 0, RTZ_HASH_DIVISOR*sizeof(struct fib_node*));
                        cli();
                        /* Rehash every node of the linear list into the table. */
                        f1 = fz->fz_list;
                        while (f1)
                        {
                                struct fib_node * next;
                                unsigned hash = fz_hash_code(f1->fib_dst, logmask);
                                next = f1->fib_next;
                                f1->fib_next = ht[hash];
                                ht[hash] = f1;
                                f1 = next;
                        }
                        fz->fz_list = NULL;
                        fz->fz_hash_table = ht; 
                        sti();
                }
        }

        if (fz->fz_hash_table)
                fp = &fz->fz_hash_table[fz_hash_code(dst, logmask)];
        else
                fp = &fz->fz_list;

        /*
         * Scan list to find the first route with the same destination
         */
        while ((f1 = *fp) != NULL)
        {
                if (f1->fib_dst == dst)
                        break;
                fp = &f1->fib_next;
        }

        /*
         * Find route with the same destination and less (or equal) metric.
         */
        while ((f1 = *fp) != NULL && f1->fib_dst == dst)
        {
                if (f1->fib_metric >= metric)
                        break;
                /*
                 *      Record route with the same destination and gateway,
                 *      but less metric. We'll delete it 
                 *      after instantiation of new route.
                 */
                if (f1->fib_info->fib_gateway == gw &&
                    (gw || f1->fib_info->fib_dev == dev))
                        dup_fp = fp;
                fp = &f1->fib_next;
        }

        /*
         * Is it already present?
         */

        if (f1 && f1->fib_metric == metric && f1->fib_info == fi)
        {
                fib_free_node(f);
                return;
        }
        
        /*
         * Insert new entry to the list.
         */

        cli();
        f->fib_next = f1;
        *fp = f;
        if (!fib_loopback && (fi->fib_dev->flags & IFF_LOOPBACK))
                fib_loopback = f;
        sti();
        fz->fz_nent++;
        ip_netlink_msg(RTMSG_NEWROUTE, dst, gw, mask, flags, metric, fi->fib_dev->name);

        /*
         *      Delete route with the same destination and gateway.
         *      Note that we should have at most one such route.
         */
        if (dup_fp)
                fp = dup_fp;
        else
                fp = &f->fib_next;

        while ((f1 = *fp) != NULL && f1->fib_dst == dst)
        {
                if (f1->fib_info->fib_gateway == gw &&
                    (gw || f1->fib_info->fib_dev == dev))
                {
                        cli();
                        *fp = f1->fib_next;
                        if (fib_loopback == f1)
                                fib_loopback = NULL;
                        sti();
                        ip_netlink_msg(RTMSG_DELROUTE, dst, gw, mask, flags, metric, f1->fib_info->fib_dev->name);
                        fib_free_node(f1);
                        fz->fz_nent--;
                        break;
                }
                fp = &f1->fib_next;
        }
        rt_cache_flush();
        return;
}

/*
 *      Remove from one chain every route using dev (or the "magic"
 *      loopback route for dev's own address).  Returns the number of
 *      entries removed.
 */
static int rt_flush_list(struct fib_node ** fp, struct device *dev)
{
        int found = 0;
        struct fib_node *f;

        while ((f = *fp) != NULL) {
/*
 *      "Magic" device route is allowed to point to loopback,
 *      discard it too.
 */
                if (f->fib_info->fib_dev != dev &&
                    (f->fib_info->fib_dev != &loopback_dev || f->fib_dst != dev->pa_addr)) {
                        fp = &f->fib_next;
                        continue;
                }
                /* Unlink atomically w.r.t. interrupts before freeing. */
                cli();
                *fp = f->fib_next;
                if (fib_loopback == f)
                        fib_loopback = NULL;
                sti();
                fib_free_node(f);
                found++;
        }
        return found;
}

static __inline__ void fib_flush_1(struct device *dev)
{
        struct fib_zone *fz;
        int found = 0;

        for (fz = fib_zone_list; fz; fz = fz->fz_next)
        {
                if (fz->fz_hash_table)
                {
                        int i;
                        int tmp = 0;
                        for (i=0; i<RTZ_HASH_DIVISOR; i++)
                                tmp += rt_flush_list(&fz->fz_hash_table[i], dev);
                        fz->fz_nent -= tmp;
                        found += tmp;
                }
                else
                {
                        int tmp;
                        tmp = rt_flush_list(&fz->fz_list, dev);
                        fz->fz_nent -= tmp;
                        found += tmp;
                }
        }
                
        if (found)
                rt_cache_flush();
}


/* 
 *      Called from the PROCfs module. This outputs /proc/net/route.
 *
 *      We preserve the old format but pad the buffers out. This means that
 *      we can spin over the other entries as we read them. Remember the
 *      gated BGP4 code could need to read 60,000+ routes on occasion (that's
 *      about 7Mb of data). To do that ok we will need to also cache the
 *      last route we got to (reads will generally be following on from
 *      one another without gaps).
 */
 
/*
 *      /proc/net/route read handler.  Every record (header included)
 *      is padded to exactly 128 bytes so that `offset' arithmetic can
 *      skip whole entries without formatting them.
 */
int rt_get_info(char *buffer, char **start, off_t offset, int length, int dummy)
{
        struct fib_zone *fz;
        struct fib_node *f;
        int len=0;
        off_t pos=0;
        char temp[129];
        int i;
        
        pos = 128;

        /* Header line only when the read starts at the file top. */
        if (offset<128)
        {
                sprintf(buffer,"%-127s\n","Iface\tDestination\tGateway \tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU\tWindow\tIRTT");
                len = 128;
        }
        
        /* Take the routing lock (sleeping if a writer holds it). */
        while  (ip_rt_lock)
                sleep_on(&rt_wait);
        ip_rt_fast_lock();

        for (fz=fib_zone_list; fz; fz = fz->fz_next)
        {
                int maxslot;
                struct fib_node ** fp;

                if (fz->fz_nent == 0)
                        continue;

                /* Skip whole zones that lie before the requested offset. */
                if (pos + 128*fz->fz_nent <= offset)
                {
                        pos += 128*fz->fz_nent;
                        len = 0;
                        continue;
                }

                if (fz->fz_hash_table)
                {
                        maxslot = RTZ_HASH_DIVISOR;
                        fp      = fz->fz_hash_table;
                }
                else
                {
                        maxslot = 1;
                        fp      = &fz->fz_list;
                }
                        
                for (i=0; i < maxslot; i++, fp++)
                {
                        
                        for (f = *fp; f; f = f->fib_next) 
                        {
                                struct fib_info * fi;
                                /*
                                 *      Spin through entries until we are ready
                                 */
                                pos += 128;

                                if (pos <= offset)
                                {
                                        len=0;
                                        continue;
                                }
                                        
                                fi = f->fib_info;
                                sprintf(temp, "%s\t%08lX\t%08lX\t%02X\t%d\t%lu\t%d\t%08lX\t%d\t%lu\t%u",
                                        fi->fib_dev->name, (unsigned long)f->fib_dst, (unsigned long)fi->fib_gateway,
                                        fi->fib_flags, 0, f->fib_use, f->fib_metric,
                                        (unsigned long)fz->fz_mask, (int)fi->fib_mtu, fi->fib_window, (int)fi->fib_irtt);
                                sprintf(buffer+len,"%-127s\n",temp);

                                len += 128;
                                if (pos >= offset+length)
                                        goto done;
                        }
                }
        }

done:
        /* Release the lock and wake any sleeping readers/writers. */
        ip_rt_unlock();
        wake_up(&rt_wait);
        
        /* Standard /proc bookkeeping: where the caller's window starts. */
        *start = buffer+len-(pos-offset);
        len = pos - offset;
        if (len>length)
                len = length;
        return len;
}

/*
 *      /proc/net/rt_cache read handler.  Same fixed 128-byte record
 *      scheme as rt_get_info, but walks the route cache hash table.
 */
int rt_cache_get_info(char *buffer, char **start, off_t offset, int length, int dummy)
{
        int len=0;
        off_t pos=0;
        char temp[129];
        struct rtable *r;
        int i;

        pos = 128;

        /* Header line only when the read starts at the file top. */
        if (offset<128)
        {
                sprintf(buffer,"%-127s\n","Iface\tDestination\tGateway \tFlags\tRefCnt\tUse\tMetric\tSource\t\tMTU\tWindow\tIRTT\tHH\tARP");
                len = 128;
        }
        
        
        /* Take the routing lock (sleeping if a writer holds it). */
        while  (ip_rt_lock)
                sleep_on(&rt_wait);
        ip_rt_fast_lock();

        for (i = 0; i<RT_HASH_DIVISOR; i++)
        {
                for (r = ip_rt_hash_table[i]; r; r = r->rt_next) 
                {
                        /*
                         *      Spin through entries until we are ready
                         */
                        pos += 128;

                        if (pos <= offset)
                        {
                                len = 0;
                                continue;
                        }
                                        
                        sprintf(temp, "%s\t%08lX\t%08lX\t%02X\t%d\t%u\t%d\t%08lX\t%d\t%lu\t%u\t%d\t%1d",
                                r->rt_dev->name, (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
                                r->rt_flags, r->rt_refcnt, r->rt_use, 0,
                                (unsigned long)r->rt_src, (int)r->rt_mtu, r->rt_window, (int)r->rt_irtt, r->rt_hh ? r->rt_hh->hh_refcnt : -1, r->rt_hh ? r->rt_hh->hh_uptodate : 0);
                        sprintf(buffer+len,"%-127s\n",temp);
                        len += 128;
                        if (pos >= offset+length)
                                goto done;
                }
        }

done:
        /* Release the lock and wake any sleeping readers/writers. */
        ip_rt_unlock();
        wake_up(&rt_wait);
        
        *start = buffer+len-(pos-offset);
        len = pos-offset;
        if (len>length)
                len = length;
        return len;
}


/*
 *      Release one route cache entry.  With no outstanding references
 *      the entry (and its hardware-header cache, if unshared) is freed
 *      immediately; otherwise it is parked on rt_free_queue and the
 *      RT_BH_FREE bottom half reaps it once the refcount drops.
 *      Interrupt-safe: runs under cli().
 */
static void rt_free(struct rtable * rt)
{
        unsigned long flags;

        save_flags(flags);
        cli();
        if (!rt->rt_refcnt)
        {
                struct hh_cache * hh = rt->rt_hh;
                rt->rt_hh = NULL;
                restore_flags(flags);
                if (hh && atomic_dec_and_test(&hh->hh_refcnt))
                        kfree_s(hh, sizeof(struct hh_cache));
                /* NOTE(review): sizeof(struct rt_table) -- the object was
                   allocated as struct rtable; assumed to be the same type
                   (alias in net/route.h) -- confirm. */
                kfree_s(rt, sizeof(struct rt_table));
                return;
        }
        /* Still referenced: queue for deferred freeing, mark it dead. */
        rt->rt_next = rt_free_queue;
        rt->rt_flags &= ~RTF_UP;
        rt_free_queue = rt;
        ip_rt_bh_mask |= RT_BH_FREE;
#if RT_CACHE_DEBUG >= 2
        printk("rt_free: %08x\n", rt->rt_dst);
#endif
        restore_flags(flags);
}

/*
 * RT "bottom half" handlers. Called with masked interrupts.
 */

/*
 *      Reap rt_free_queue: free every queued entry whose refcount has
 *      dropped to zero.  Called with interrupts masked; interrupts are
 *      briefly re-enabled around each kfree_s, so the queue is
 *      re-read from the unlink point after every free.
 */
static __inline__ void rt_kick_free_queue(void)
{
        struct rtable *rt, **rtp;

        rtp = &rt_free_queue;

        while ((rt = *rtp) != NULL)
        {
                if  (!rt->rt_refcnt)
                {
                        struct hh_cache * hh = rt->rt_hh;
#if RT_CACHE_DEBUG >= 2
                        __u32 daddr = rt->rt_dst;
#endif
                        /* Unlink while still masked, then free with
                           interrupts enabled. */
                        *rtp = rt->rt_next;
                        rt->rt_hh = NULL;
                        sti();
                        if (hh && atomic_dec_and_test(&hh->hh_refcnt))
                                kfree_s(hh, sizeof(struct hh_cache));
                        kfree_s(rt, sizeof(struct rt_table));
#if RT_CACHE_DEBUG >= 2
                        printk("rt_kick_free_queue: %08x is free\n", daddr);
#endif
                        cli();
                        continue;
                }
                rtp = &rt->rt_next;
        }
}

/*
 *      Routing bottom half: run whatever deferred work is flagged in
 *      ip_rt_bh_mask (redirects, garbage collection, deferred frees),
 *      but only when nobody holds the routing lock.  Garbage
 *      collection runs with interrupts enabled under the fast lock.
 */
void ip_rt_run_bh()
{
        unsigned long flags;
        save_flags(flags);
        cli();
        if (ip_rt_bh_mask && !ip_rt_lock)
        {
                if (ip_rt_bh_mask & RT_BH_REDIRECT)
                        rt_kick_backlog();

                if (ip_rt_bh_mask & RT_BH_GARBAGE_COLLECT)
                {
                        ip_rt_fast_lock();
                        ip_rt_bh_mask &= ~RT_BH_GARBAGE_COLLECT;
                        sti();
                        rt_garbage_collect_1();
                        cli();
                        ip_rt_fast_unlock();
                }

                if (ip_rt_bh_mask & RT_BH_FREE)
                        rt_kick_free_queue();
        }
        restore_flags(flags);
}


/*
 *      Periodic route cache maintenance: age out unreferenced entries
 *      older than RT_CACHE_TIMEOUT and bubble busier entries toward
 *      the front of their hash chains (cheap LRU).  Runs only if no
 *      one else holds the lock (ip_rt_lock == 1 means we are the sole
 *      holder after ip_rt_fast_lock).
 */
void ip_rt_check_expire()
{
        ip_rt_fast_lock();
        if (ip_rt_lock == 1)
        {
                int i;
                struct rtable *rth, **rthp;
                unsigned long flags;
                unsigned long now = jiffies;

                save_flags(flags);
                for (i=0; i<RT_HASH_DIVISOR; i++)
                {
                        rthp = &ip_rt_hash_table[i];

                        while ((rth = *rthp) != NULL)
                        {
                                struct rtable * rth_next = rth->rt_next;

                                /*
                                 * Cleanup aged off entries.
                                 */

                                cli();
                                if (!rth->rt_refcnt && rth->rt_lastuse + RT_CACHE_TIMEOUT < now)
                                {
                                        *rthp = rth_next;
                                        sti();
                                        rt_cache_size--;
#if RT_CACHE_DEBUG >= 2
                                        printk("rt_check_expire clean %02x@%08x\n", i, rth->rt_dst);
#endif
                                        rt_free(rth);
                                        continue;
                                }
                                sti();

                                if (!rth_next)
                                        break;

                                /*
                                 * LRU ordering.
                                 */

                                /* Swap rth with its successor when the
                                   successor is clearly more recently / more
                                   heavily used. */
                                if (rth->rt_lastuse + RT_CACHE_BUBBLE_THRESHOLD < rth_next->rt_lastuse ||
                                    (rth->rt_lastuse < rth_next->rt_lastuse &&
                                     rth->rt_use < rth_next->rt_use))
                                {
#if RT_CACHE_DEBUG >= 2
                                        printk("rt_check_expire bubbled %02x@%08x<->%08x\n", i, rth->rt_dst, rth_next->rt_dst);
#endif
                                        cli();
                                        *rthp = rth_next;
                                        rth->rt_next = rth_next->rt_next;
                                        rth_next->rt_next = rth;
                                        sti();
                                        rthp = &rth_next->rt_next;
                                        continue;
                                }
                                rthp = &rth->rt_next;
                        }
                }
                restore_flags(flags);
                rt_kick_free_queue();
        }
        ip_rt_unlock();
}

/*
 *	Install a host route to 'dst' via the new gateway 'gw' on 'dev',
 *	as instructed by a (pre-validated) ICMP redirect.  The entry goes
 *	straight into the routing cache, marked dynamic/modified.
 */

static void rt_redirect_1(__u32 dst, __u32 gw, struct device *dev)
{
        unsigned long hash = ip_rt_hash_code(dst);
        struct rtable *rt;

        /*
         * A redirect pointing back at our own address is bogus, and the
         * proposed gateway must really be reachable through this device.
         */
        if (gw == dev->pa_addr || dev != get_gw_dev(gw))
                return;

        rt = kmalloc(sizeof(struct rtable), GFP_ATOMIC);
        if (!rt)
                return;
        memset(rt, 0, sizeof(struct rtable));

        rt->rt_dst      = dst;
        rt->rt_src      = dev->pa_addr;
        rt->rt_gateway  = gw;
        rt->rt_dev      = dev;
        rt->rt_flags    = RTF_DYNAMIC | RTF_MODIFIED | RTF_HOST | RTF_GATEWAY | RTF_UP;
        rt->rt_mtu      = dev->mtu;
#ifdef CONFIG_NO_PATH_MTU_DISCOVERY
        /* Without PMTU discovery, clamp to the classical 576 byte minimum. */
        if (dev->mtu > 576)
                rt->rt_mtu = 576;
#endif
        rt->rt_lastuse  = jiffies;
        rt->rt_refcnt   = 1;

        rt_cache_add(hash, rt);
        ip_rt_put(rt);
}

/*
 *	Throw away every entry in the routing cache.  Each hash chain is
 *	detached atomically (under cli()) and then freed with interrupts
 *	enabled again, so interrupt-context lookups never observe a
 *	half-unlinked chain.
 */

static void rt_cache_flush(void)
{
        int i;
        struct rtable * rth, * next;

        for (i=0; i<RT_HASH_DIVISOR; i++)
        {
                int nr=0;	/* entries freed from this chain (debug stats) */

                cli();
                if (!(rth = ip_rt_hash_table[i]))
                {
                        sti();
                        continue;
                }

                /* Detach the whole chain in one atomic step. */
                ip_rt_hash_table[i] = NULL;
                sti();

                /* Now free the detached entries at leisure. */
                for (; rth; rth=next)
                {
                        next = rth->rt_next;
                        rt_cache_size--;
                        nr++;
                        rth->rt_next = NULL;
                        rt_free(rth);
                }
#if RT_CACHE_DEBUG >= 2
                if (nr > 0)
                        printk("rt_cache_flush: %d@%02x\n", nr, i);
#endif
        }
#if RT_CACHE_DEBUG >= 1
        /* Sanity check: the counter must match the chains we just emptied. */
        if (rt_cache_size)
        {
                printk("rt_cache_flush: bug rt_cache_size=%d\n", rt_cache_size);
                rt_cache_size = 0;
        }
#endif
}

/*
 *	Synchronous garbage collection of the route cache.  Starting from
 *	an expiry horizon of half RT_CACHE_TIMEOUT, free at most one stale
 *	entry per hash chain per sweep, halving the horizon after each full
 *	sweep until the cache has shrunk below RT_CACHE_SIZE_MAX.
 *
 *	NOTE(review): if the cache stays full of recently used entries the
 *	horizon keeps halving towards zero, eventually making even
 *	referenced entries eligible; this relies on rt_free() deferring the
 *	actual release of in-use entries -- confirm against rt_free().
 */

static void rt_garbage_collect_1(void)
{
        int i;
        unsigned expire = RT_CACHE_TIMEOUT>>1;
        struct rtable * rth, **rthp;
        unsigned long now = jiffies;

        for (;;)
        {
                for (i=0; i<RT_HASH_DIVISOR; i++)
                {
                        if (!ip_rt_hash_table[i])
                                continue;
                        for (rthp=&ip_rt_hash_table[i]; (rth=*rthp); rthp=&rth->rt_next)
                        {
                                /* Referenced entries get a proportionally
                                 * longer grace period (expire * (refcnt+1)).
                                 */
                                if (rth->rt_lastuse + expire*(rth->rt_refcnt+1) > now)
                                        continue;
                                rt_cache_size--;
                                cli();
                                *rthp=rth->rt_next;
                                rth->rt_next = NULL;
                                sti();
                                rt_free(rth);
                                /* Only one victim per chain per sweep. */
                                break;
                        }
                }
                if (rt_cache_size < RT_CACHE_SIZE_MAX)
                        return;
                /* Not enough freed: be twice as aggressive next sweep. */
                expire >>= 1;
        }
}

/*
 *	Append a redirect request to a queue.  The queue is a circular
 *	singly linked list: *q points at the tail element, whose rtr_next
 *	is the head.  Safe against interrupts (save_flags/cli).
 */

static __inline__ void rt_req_enqueue(struct rt_req **q, struct rt_req *rtr)
{
        struct rt_req * old_tail;
        unsigned long flags;

        save_flags(flags);
        cli();
        old_tail = *q;
        if (old_tail)
        {
                /* Splice in behind the current tail. */
                rtr->rtr_next = old_tail->rtr_next;
                old_tail->rtr_next = rtr;
        }
        else
                rtr->rtr_next = rtr;	/* first element: points at itself */
        *q = rtr;			/* new element becomes the tail */
        restore_flags(flags);
}

/*
 * Caller should mask interrupts.
 */

static __inline__ struct rt_req * rt_req_dequeue(struct rt_req **q)
{
        struct rt_req * rtr;

        if (*q)
        {
                rtr = (*q)->rtr_next;
                (*q)->rtr_next = rtr->rtr_next;
                if (rtr->rtr_next == rtr)
                        *q = NULL;
                rtr->rtr_next = NULL;
                return rtr;
        }
        return NULL;
}

/*
   Called with masked interrupts
 */

/*
 *	Drain the backlog of deferred ICMP redirect requests.  Called with
 *	interrupts masked; proceeds only when nobody else holds the route
 *	lock, taking the fast lock itself for the duration.  Interrupts
 *	are re-enabled around the actual redirect processing and restored
 *	before each dequeue.
 */

static void rt_kick_backlog()
{
        if (!ip_rt_lock)
        {
                struct rt_req * rtr;

                ip_rt_fast_lock();

                while ((rtr = rt_req_dequeue(&rt_backlog)) != NULL)
                {
                        /* Redirect handling may take a while; allow
                         * interrupts during it.
                         */
                        sti();
                        rt_redirect_1(rtr->dst, rtr->gw, rtr->dev);
                        kfree_s(rtr, sizeof(struct rt_req));
                        cli();
                }

                /* Backlog is empty: clear the pending-work flag. */
                ip_rt_bh_mask &= ~RT_BH_REDIRECT;

                ip_rt_fast_unlock();
        }
}

/*
 * rt_{del|add|flush} called only from USER process. Waiting is OK.
 */

/*
 *	Delete a routing table entry on behalf of a user process.  Sleeps
 *	until the routing lock is free, so this must never be called from
 *	interrupt or bottom-half context.  Returns whatever fib_del_1()
 *	reports.
 */

static int rt_del(__u32 dst, __u32 mask,
                struct device * dev, __u32 gtw, short rt_flags, short metric)
{
        int retval;

        /* Wait for exclusive access, then hold the fast lock while the
         * FIB is modified.
         */
        while (ip_rt_lock)
                sleep_on(&rt_wait);
        ip_rt_fast_lock();
        retval = fib_del_1(dst, mask, dev, gtw, rt_flags, metric);
        ip_rt_unlock();
        wake_up(&rt_wait);
        return retval;
}

/*
 *	Add a routing table entry on behalf of a user (or kernel) process.
 *	Sleeps until the routing lock is free -- process context only.
 *	The actual insertion is done by fib_add_1() under the fast lock.
 */

static void rt_add(short flags, __u32 dst, __u32 mask,
        __u32 gw, struct device *dev, unsigned short mss,
        unsigned long window, unsigned short irtt, short metric)
{
        while (ip_rt_lock)
                sleep_on(&rt_wait);
        ip_rt_fast_lock();
        fib_add_1(flags, dst, mask, gw, dev, mss, window, irtt, metric);
        ip_rt_unlock();
        wake_up(&rt_wait);
}

/*
 *	Flush all FIB entries referring to 'dev' (e.g. when the interface
 *	goes down).  Sleeps for the routing lock -- process context only.
 */

void ip_rt_flush(struct device *dev)
{
        while (ip_rt_lock)
                sleep_on(&rt_wait);
        ip_rt_fast_lock();
        fib_flush_1(dev);
        ip_rt_unlock();
        wake_up(&rt_wait);
}

/*
   Called by ICMP module.
 */

/*
 *	Handle an ICMP redirect: 'src' is the router that sent it, 'dst'
 *	the destination being redirected, 'gw' the suggested new gateway.
 *	The redirect is honoured only if it comes from the router we are
 *	currently using for 'dst', arrived on the same device, and the new
 *	gateway lies on that device's subnet and is not one of our own
 *	addresses.  If the route lock is contended the request is queued
 *	on the backlog for the bottom half to process.
 */

void ip_rt_redirect(__u32 src, __u32 dst, __u32 gw, struct device *dev)
{
        struct rt_req * rtr;
        struct rtable * rt;

        rt = ip_rt_route(dst, 0);
        if (!rt)
                return;

        /* Validate the redirect before believing it. */
        if (rt->rt_gateway != src ||
            rt->rt_dev != dev ||
            ((gw^dev->pa_addr)&dev->pa_mask) ||
            ip_chk_addr(gw))
        {
                ip_rt_put(rt);
                return;
        }
        ip_rt_put(rt);

        ip_rt_fast_lock();
        if (ip_rt_lock == 1)
        {
                /* We own the cache: apply the redirect immediately. */
                rt_redirect_1(dst, gw, dev);
                ip_rt_unlock();
                return;
        }

        /* Lock contended: defer the work to the backlog queue. */
        rtr = kmalloc(sizeof(struct rt_req), GFP_ATOMIC);
        if (rtr)
        {
                rtr->dst = dst;
                rtr->gw = gw;
                rtr->dev = dev;
                rt_req_enqueue(&rt_backlog, rtr);
                ip_rt_bh_mask |= RT_BH_REDIRECT;
        }
        ip_rt_unlock();
}


/*
 *	Shrink the route cache: collect synchronously when we own the
 *	table outright, otherwise flag the bottom half to do it later.
 */

static __inline__ void rt_garbage_collect(void)
{
        if (ip_rt_lock != 1)
        {
                ip_rt_bh_mask |= RT_BH_GARBAGE_COLLECT;
                return;
        }
        rt_garbage_collect_1();
}

/*
 *	Insert 'rth' at the head of hash chain 'hash', then sweep the rest
 *	of the chain removing any older entry for the same destination and
 *	any aged-out unreferenced entries.  The caller must hold the route
 *	lock exclusively (ip_rt_lock == 1).
 *
 *	If the device supports hardware header caching, a hh entry is
 *	bound here: built directly for on-link destinations, or shared
 *	with the gateway's own cache entry.  Note the lock is dropped and
 *	retaken around the recursive ip_rt_route() call for the gateway.
 */

static void rt_cache_add(unsigned hash, struct rtable * rth)
{
        unsigned long   flags;
        struct rtable   **rthp;
        __u32           daddr = rth->rt_dst;
        unsigned long   now = jiffies;

#if RT_CACHE_DEBUG >= 2
        if (ip_rt_lock != 1)
        {
                printk("rt_cache_add: ip_rt_lock==%d\n", ip_rt_lock);
                return;
        }
#endif

        save_flags(flags);

        if (rth->rt_dev->header_cache_bind)
        {
                struct rtable * rtg = rth;

                if (rth->rt_gateway != daddr)
                {
                        /* Gatewayed route: the hardware header belongs to
                         * the gateway, so look up (or build) its entry.
                         * The fast lock must be released around the
                         * recursive route lookup.
                         */
                        ip_rt_fast_unlock();
                        rtg = ip_rt_route(rth->rt_gateway, 0);
                        ip_rt_fast_lock();
                }

                if (rtg)
                {
                        if (rtg == rth)
                                rtg->rt_dev->header_cache_bind(&rtg->rt_hh, rtg->rt_dev, ETH_P_IP, rtg->rt_dst);
                        else
                        {
                                /* Share the gateway's hh entry, taking an
                                 * extra reference on it.
                                 */
                                if (rtg->rt_hh)
                                        atomic_inc(&rtg->rt_hh->hh_refcnt);
                                rth->rt_hh = rtg->rt_hh;
                                ip_rt_put(rtg);
                        }
                }
        }

        if (rt_cache_size >= RT_CACHE_SIZE_MAX)
                rt_garbage_collect();

        /* Link the new entry at the head of its chain, atomically. */
        cli();
        rth->rt_next = ip_rt_hash_table[hash];
#if RT_CACHE_DEBUG >= 2
        if (rth->rt_next)
        {
                struct rtable * trth;
                printk("rt_cache @%02x: %08x", hash, daddr);
                for (trth=rth->rt_next; trth; trth=trth->rt_next)
                        printk(" . %08x", trth->rt_dst);
                printk("\n");
        }
#endif
        ip_rt_hash_table[hash] = rth;
        rthp = &rth->rt_next;
        sti();
        rt_cache_size++;

        /*
         * Cleanup duplicate (and aged off) entries.
         */

        while ((rth = *rthp) != NULL)
        {

                cli();
                if ((!rth->rt_refcnt && rth->rt_lastuse + RT_CACHE_TIMEOUT < now)
                    || rth->rt_dst == daddr)
                {
                        /* Either stale and unreferenced, or superseded by
                         * the entry we just inserted.
                         */
                        *rthp = rth->rt_next;
                        rt_cache_size--;
                        sti();
#if RT_CACHE_DEBUG >= 2
                        printk("rt_cache clean %02x@%08x\n", hash, rth->rt_dst);
#endif
                        rt_free(rth);
                        continue;
                }
                sti();
                rthp = &rth->rt_next;
        }
        restore_flags(flags);
}

/*
   RT should be already locked.
   
   We could improve this by keeping a chain of say 32 struct rtable's
   last freed for fast recycling.
   
 */

/*
 *	Cache-miss path of routing: consult the FIB, build a fresh cache
 *	entry for 'daddr' and insert it (unless the lock got contended in
 *	the meantime, in which case the entry is returned but not cached).
 *	Called with the route lock already held (see comment above); this
 *	function releases it on every exit path.  The returned entry
 *	carries a reference owned by the caller (drop with ip_rt_put()).
 *	Returns NULL on allocation failure or when no route exists.
 */

struct rtable * ip_rt_slow_route (__u32 daddr, int local)
{
        unsigned hash = ip_rt_hash_code(daddr)^local;
        struct rtable * rth;
        struct fib_node * f;
        struct fib_info * fi;
        __u32 saddr;

#if RT_CACHE_DEBUG >= 2
        printk("rt_cache miss @%08x\n", daddr);
#endif

        rth = kmalloc(sizeof(struct rtable), GFP_ATOMIC);
        if (!rth)
        {
                ip_rt_unlock();
                return NULL;
        }

        if (local)
                f = fib_lookup_local(daddr);
        else
                f = fib_lookup (daddr);

        if (f)
        {
                fi = f->fib_info;
                f->fib_use++;
        }

        /* No route, or a route explicitly configured to reject traffic. */
        if (!f || (fi->fib_flags & RTF_REJECT))
        {
#ifdef CONFIG_KERNELD
                char wanted_route[20];
#endif
#if RT_CACHE_DEBUG >= 2
                printk("rt_route failed @%08x\n", daddr);
#endif
                ip_rt_unlock();
                kfree_s(rth, sizeof(struct rtable));
#ifdef CONFIG_KERNELD
                /* Give user space a chance to install a route on demand. */
                daddr=ntohl(daddr);
                sprintf(wanted_route, "%d.%d.%d.%d",
                        (int)(daddr >> 24) & 0xff, (int)(daddr >> 16) & 0xff,
                        (int)(daddr >> 8) & 0xff, (int)daddr & 0xff);
                kerneld_route(wanted_route);    /* Dynamic route request */
#endif
                return NULL;
        }

        saddr = fi->fib_dev->pa_addr;

        /* Destination is one of our own addresses: reroute the traffic
         * through the loopback entry instead, keeping the original saddr.
         */
        if (daddr == fi->fib_dev->pa_addr)
        {
                f->fib_use--;
                if ((f = fib_loopback) != NULL)
                {
                        f->fib_use++;
                        fi = f->fib_info;
                }
        }
        
        if (!f)
        {
                ip_rt_unlock();
                kfree_s(rth, sizeof(struct rtable));
                return NULL;
        }

        rth->rt_dst     = daddr;
        rth->rt_src     = saddr;
        rth->rt_lastuse = jiffies;
        rth->rt_refcnt  = 1;
        rth->rt_use     = 1;
        rth->rt_next    = NULL;
        rth->rt_hh      = NULL;
        rth->rt_gateway = fi->fib_gateway;
        rth->rt_dev     = fi->fib_dev;
        rth->rt_mtu     = fi->fib_mtu;
        rth->rt_window  = fi->fib_window;
        rth->rt_irtt    = fi->fib_irtt;
        rth->rt_tos     = f->fib_tos;
        rth->rt_flags   = fi->fib_flags | RTF_HOST;
        if (local)
                rth->rt_flags   |= RTF_LOCAL;

        /* Direct routes talk straight to the destination. */
        if (!(rth->rt_flags & RTF_GATEWAY))
                rth->rt_gateway = rth->rt_dst;
        /*
         *      Multicast or limited broadcast is never gatewayed.
         */
        if (MULTICAST(daddr) || daddr == 0xFFFFFFFF)
                rth->rt_gateway = rth->rt_dst;

        if (ip_rt_lock == 1)
                rt_cache_add(hash, rth);
        else
        {
                /* Somebody grabbed the lock while we worked: this entry
                 * cannot be cached safely and is freed after use.
                 */
                rt_free(rth);
#if RT_CACHE_DEBUG >= 1
                printk(KERN_DEBUG "rt_cache: route to %08x was born dead\n", daddr);
#endif
        }

        ip_rt_unlock();
        return rth;
}

void ip_rt_put(struct rtable * rt)
{
        if (rt)
                atomic_dec(&rt->rt_refcnt);
}

/*
 *	Main routing entry point: look 'daddr' up in the routing cache,
 *	falling back to the slow FIB path on a miss.  'local' selects the
 *	local-delivery variant of the lookup (it is XORed into the hash so
 *	local and forwarding entries live in different buckets).  The
 *	returned entry carries a reference owned by the caller (release
 *	with ip_rt_put()); NULL means no route.
 */

struct rtable * ip_rt_route(__u32 daddr, int local)
{
        struct rtable * rth;

        ip_rt_fast_lock();

        for (rth=ip_rt_hash_table[ip_rt_hash_code(daddr)^local]; rth; rth=rth->rt_next)
        {
                if (rth->rt_dst == daddr)
                {
                        /* Cache hit: refresh LRU data and take a ref. */
                        rth->rt_lastuse = jiffies;
                        atomic_inc(&rth->rt_use);
                        atomic_inc(&rth->rt_refcnt);
                        ip_rt_unlock();
                        return rth;
                }
        }
        /* Miss: the slow path inherits the lock and releases it itself. */
        return ip_rt_slow_route (daddr, local);
}

/*
 *      Process a route add request from the user, or from a kernel
 *      task.
 */
 
/*
 *	Validate a user-supplied rtentry and install the described route.
 *	Returns 0 on success or a negative errno (-ENODEV, -EAFNOSUPPORT,
 *	-EINVAL, -ENETUNREACH, or an error from getname()).
 */

int ip_rt_new(struct rtentry *r)
{
        int err;
        char * devname;
        struct device * dev = NULL;
        unsigned long flags;
        __u32 daddr, mask, gw;
        short metric;

        /*
         *      If a device is specified find it.
         */
         
        if ((devname = r->rt_dev) != NULL) 
        {
                /* rt_dev points into user space: copy the name in before
                 * using it, and release the kernel copy right after.
                 */
                err = getname(devname, &devname);
                if (err)
                        return err;
                dev = dev_get(devname);
                putname(devname);
                if (!dev)
                        return -ENODEV;
        }
        
        /*
         *      If the device isn't INET, don't allow it
         */

        if (r->rt_dst.sa_family != AF_INET)
                return -EAFNOSUPPORT;

        /*
         *      Make local copies of the important bits
         *      We decrement the metric by one for BSD compatibility.
         */
         
        flags = r->rt_flags;
        daddr = (__u32) ((struct sockaddr_in *) &r->rt_dst)->sin_addr.s_addr;
        mask  = (__u32) ((struct sockaddr_in *) &r->rt_genmask)->sin_addr.s_addr;
        gw    = (__u32) ((struct sockaddr_in *) &r->rt_gateway)->sin_addr.s_addr;
        metric = r->rt_metric > 0 ? r->rt_metric - 1 : 0;

        /*
         *      BSD emulation: Permits route add someroute gw one-of-my-addresses
         *      to indicate which iface. Not as clean as the nice Linux dev technique
         *      but people keep using it...  (and gated likes it ;))
         */
         
        if (!dev && (flags & RTF_GATEWAY)) 
        {
                struct device *dev2;
                for (dev2 = dev_base ; dev2 != NULL ; dev2 = dev2->next) 
                {
                        if ((dev2->flags & IFF_UP) && dev2->pa_addr == gw) 
                        {
                                /* "Gateway" was really one of our own
                                 * addresses: treat it as a device route.
                                 */
                                flags &= ~RTF_GATEWAY;
                                dev = dev2;
                                break;
                        }
                }
        }

        /* Host routes always use an all-ones mask; any explicit mask must
         * itself be an INET address.
         */
        if (flags & RTF_HOST) 
                mask = 0xffffffff;
        else if (mask && r->rt_genmask.sa_family != AF_INET)
                return -EAFNOSUPPORT;

        if (flags & RTF_GATEWAY) 
        {
                if (r->rt_gateway.sa_family != AF_INET)
                        return -EAFNOSUPPORT;

                /*
                 *      Don't try to add a gateway we can't reach.. 
                 *      Tunnel devices are exempt from this rule.
                 */

                if (!dev)
                        dev = get_gw_dev(gw);
                else if (dev != get_gw_dev(gw) && dev->type != ARPHRD_TUNNEL)
                        return -EINVAL;
                if (!dev)
                        return -ENETUNREACH;
        } 
        else
        {
                /* Direct route: ignore any gateway and pick the device by
                 * network if none was named.
                 */
                gw = 0;
                if (!dev)
                        dev = ip_dev_bynet(daddr, mask);
                if (!dev)
                        return -ENETUNREACH;
                if (!mask)
                {
                        /* Destination on the device's own subnet: borrow
                         * the interface mask.
                         */
                        if (((daddr ^ dev->pa_addr) & dev->pa_mask) == 0)
                                mask = dev->pa_mask;
                }
        }

#ifndef CONFIG_IP_CLASSLESS
        /* Classful fallback: derive the mask from the address class. */
        if (!mask)
                mask = ip_get_mask(daddr);
#endif
        
        if (bad_mask(mask, daddr))
                return -EINVAL;

        /*
         *      Add the route
         */

        rt_add(flags, daddr, mask, gw, dev, r->rt_mss, r->rt_window, r->rt_irtt, metric);
        return 0;
}


/*
 *      Remove a route, as requested by the user.
 */

/*
 *	Validate a user-supplied rtentry and delete the matching route.
 *	Returns 0 on success or a negative errno (from getname(), -ENODEV,
 *	or whatever rt_del()/fib_del_1() reports).
 */

int ip_rt_kill(struct rtentry *r)
{
        struct sockaddr_in *trg;
        struct sockaddr_in *msk;
        struct sockaddr_in *gtw;
        char *devname;
        int err;
        struct device * dev = NULL;

        trg = (struct sockaddr_in *) &r->rt_dst;
        msk = (struct sockaddr_in *) &r->rt_genmask;
        gtw = (struct sockaddr_in *) &r->rt_gateway;
        if ((devname = r->rt_dev) != NULL) 
        {
                /* Device name lives in user space: copy it in, look up
                 * the device, then free the kernel copy.
                 */
                err = getname(devname, &devname);
                if (err)
                        return err;
                dev = dev_get(devname);
                putname(devname);
                if (!dev)
                        return -ENODEV;
        }
        /*
         * metric can become negative here if it wasn't filled in
         * but that's a fortunate accident; we really use that in rt_del.
         */
        err=rt_del((__u32)trg->sin_addr.s_addr, (__u32)msk->sin_addr.s_addr, dev,
                (__u32)gtw->sin_addr.s_addr, r->rt_flags, r->rt_metric - 1);
        return err;
}

/*
 *      Handle IP routing ioctl calls. These are used to manipulate the routing tables
 */
 
int ip_rt_ioctl(unsigned int cmd, void *arg)
{
        int err;
        struct rtentry rt;

        switch(cmd) 
        {
                case SIOCADDRT:         /* Add a route */
                case SIOCDELRT:         /* Delete a route */
                        if (!suser())
                                return -EPERM;
                        err=verify_area(VERIFY_READ, arg, sizeof(struct rtentry));
                        if (err)
                                return err;
                        memcpy_fromfs(&rt, arg, sizeof(struct rtentry));
                        return (cmd == SIOCDELRT) ? ip_rt_kill(&rt) : ip_rt_new(&rt);
        }

        return -EINVAL;
}

/*
 *	Accept (and ignore) routing advice about a cached route.  This is
 *	a deliberate no-op in this version of the stack.
 */
void ip_rt_advice(struct rtable **rp, int advice)
{
}

/*
 *	React to a device state change by adding/removing a host route for
 *	the interface address.  Currently compiled out (COMING_IN_2_1), so
 *	this function is a no-op in this release.
 */

void ip_rt_update(int event, struct device *dev)
{
/*
 *      This causes too much grief to do now.
 */
#ifdef COMING_IN_2_1
        if (event == NETDEV_UP)
                rt_add(RTF_HOST|RTF_UP, dev->pa_addr, ~0, 0, dev, 0, 0, 0, 0);
        else if (event == NETDEV_DOWN)
                rt_del(dev->pa_addr, ~0, dev, 0, RTF_HOST|RTF_UP, 0);
#endif          
}