/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
* interface as the means of communication with the user level.
*
* Implementation of the Transmission Control Protocol(TCP).
*
* Version: @(#)tcp.c 1.0.16 05/25/93
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
* Mark Evans, <evansmp@uhura.aston.ac.uk>
* Corey Minyard <wf-rch!minyard@relay.EU.net>
* Florian La Roche, <flla@stud.uni-sb.de>
* Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
* Linus Torvalds, <torvalds@cs.helsinki.fi>
* Alan Cox, <gw4pts@gw4pts.ampr.org>
* Matthew Dillon, <dillon@apollo.west.oic.com>
* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
* Jorge Cwik, <jorge@laser.satlink.net>
*
* Fixes:
* Alan Cox : Numerous verify_area() calls
* Alan Cox : Set the ACK bit on a reset
* Alan Cox : Stopped it crashing if it closed while
* sk->inuse=1 and was trying to connect
* (tcp_err()).
* Alan Cox : All icmp error handling was broken
* pointers passed where wrong and the
* socket was looked up backwards. Nobody
* tested any icmp error code obviously.
* Alan Cox : tcp_err() now handled properly. It
* wakes people on errors. select
* behaves and the icmp error race
* has gone by moving it into sock.c
* Alan Cox : tcp_send_reset() fixed to work for
* everything not just packets for
* unknown sockets.
* Alan Cox : tcp option processing.
* Alan Cox : Reset tweaked (still not 100%) [Had
* syn rule wrong]
* Herp Rosmanith : More reset fixes
* Alan Cox : No longer acks invalid rst frames.
* Acking any kind of RST is right out.
* Alan Cox : Sets an ignore me flag on an rst
* receive otherwise odd bits of prattle
* escape still
* Alan Cox : Fixed another acking RST frame bug.
* Should stop LAN workplace lockups.
* Alan Cox : Some tidyups using the new skb list
* facilities
* Alan Cox : sk->keepopen now seems to work
* Alan Cox : Pulls options out correctly on accepts
* Alan Cox : Fixed assorted sk->rqueue->next errors
* Alan Cox : PSH doesn't end a TCP read. Switched a
* bit to skb ops.
* Alan Cox : Tidied tcp_data to avoid a potential
* nasty.
* Alan Cox : Added some better commenting, as the
* tcp is hard to follow
* Alan Cox : Removed incorrect check for 20 * psh
* Michael O'Reilly : ack < copied bug fix.
* Johannes Stille : Misc tcp fixes (not all in yet).
* Alan Cox : FIN with no memory -> CRASH
* Alan Cox : Added socket option proto entries.
* Also added awareness of them to accept.
* Alan Cox : Added TCP options (SOL_TCP)
* Alan Cox : Switched wakeup calls to callbacks,
* so the kernel can layer network
* sockets.
* Alan Cox : Use ip_tos/ip_ttl settings.
* Alan Cox : Handle FIN (more) properly (we hope).
* Alan Cox : RST frames sent on unsynchronised
* state ack error.
* Alan Cox : Put in missing check for SYN bit.
* Alan Cox : Added tcp_select_window() aka NET2E
* window non shrink trick.
* Alan Cox : Added a couple of small NET2E timer
* fixes
* Charles Hedrick : TCP fixes
* Toomas Tamm : TCP window fixes
* Alan Cox : Small URG fix to rlogin ^C ack fight
* Charles Hedrick : Rewrote most of it to actually work
* Linus : Rewrote tcp_read() and URG handling
* completely
* Gerhard Koerting: Fixed some missing timer handling
* Matthew Dillon : Reworked TCP machine states as per RFC
* Gerhard Koerting: PC/TCP workarounds
* Adam Caldwell : Assorted timer/timing errors
* Matthew Dillon : Fixed another RST bug
* Alan Cox : Move to kernel side addressing changes.
* Alan Cox : Beginning work on TCP fastpathing
* (not yet usable)
* Arnt Gulbrandsen: Turbocharged tcp_check() routine.
* Alan Cox : TCP fast path debugging
* Alan Cox : Window clamping
* Michael Riepe : Bug in tcp_check()
* Matt Dillon : More TCP improvements and RST bug fixes
* Matt Dillon : Yet more small nasties remove from the
* TCP code (Be very nice to this man if
* tcp finally works 100%) 8)
* Alan Cox : BSD accept semantics.
* Alan Cox : Reset on closedown bug.
* Peter De Schrijver : ENOTCONN check missing in tcp_sendto().
* Michael Pall : Handle select() after URG properly in
* all cases.
* Michael Pall : Undo the last fix in tcp_read_urg()
* (multi URG PUSH broke rlogin).
* Michael Pall : Fix the multi URG PUSH problem in
* tcp_readable(), select() after URG
* works now.
* Michael Pall : recv(...,MSG_OOB) never blocks in the
* BSD api.
* Alan Cox : Changed the semantics of sk->socket to
* fix a race and a signal problem with
* accept() and async I/O.
* Alan Cox : Relaxed the rules on tcp_sendto().
* Yury Shevchuk : Really fixed accept() blocking problem.
* Craig I. Hagan : Allow for BSD compatible TIME_WAIT for
* clients/servers which listen in on
* fixed ports.
* Alan Cox : Cleaned the above up and shrank it to
* a sensible code size.
* Alan Cox : Self connect lockup fix.
* Alan Cox : No connect to multicast.
* Ross Biro : Close unaccepted children on master
* socket close.
* Alan Cox : Reset tracing code.
* Alan Cox : Spurious resets on shutdown.
* Alan Cox : Giant 15 minute/60 second timer error
* Alan Cox : Small whoops in selecting before an
* accept.
* Alan Cox : Kept the state trace facility since
* it's handy for debugging.
* Alan Cox : More reset handler fixes.
* Alan Cox : Started rewriting the code based on
* the RFC's for other useful protocol
* references see: Comer, KA9Q NOS, and
* for a reference on the difference
* between specifications and how BSD
* works see the 4.4lite source.
* A.N.Kuznetsov : Don't time wait on completion of tidy
* close.
* Linus Torvalds : Fin/Shutdown & copied_seq changes.
* Linus Torvalds : Fixed BSD port reuse to work first syn
* Alan Cox : Reimplemented timers as per the RFC
* and using multiple timers for sanity.
* Alan Cox : Small bug fixes, and a lot of new
* comments.
* Alan Cox : Fixed dual reader crash by locking
* the buffers (much like datagram.c)
* Alan Cox : Fixed stuck sockets in probe. A probe
* now gets fed up of retrying without
* (even a no space) answer.
* Alan Cox : Extracted closing code better
* Alan Cox : Fixed the closing state machine to
* resemble the RFC.
* Alan Cox : More 'per spec' fixes.
* Jorge Cwik : Even faster checksumming.
* Alan Cox : tcp_data() doesn't ack illegal PSH
* only frames. At least one pc tcp stack
* generates them.
* Alan Cox : Cache last socket.
* Alan Cox : Per route irtt.
* Matt Day : Select() match BSD precisely on error
* Alan Cox : New buffers
* Marc Tamsky : Various sk->prot->retransmits and
* sk->retransmits misupdating fixed.
* Fixed tcp_write_timeout: stuck close,
* and TCP syn retries gets used now.
* Mark Yarvis : In tcp_read_wakeup(), don't send an
* ack if stat is TCP_CLOSED.
* Alan Cox : Look up device on a retransmit - routes may
* change. Doesn't yet cope with MSS shrink right
* but it's a start!
* Marc Tamsky : Closing in closing fixes.
* Mike Shaver : RFC1122 verifications.
* Alan Cox : rcv_saddr errors.
* Alan Cox : Block double connect().
* Alan Cox : Small hooks for enSKIP.
* Alexey Kuznetsov: Path MTU discovery.
* Alan Cox : Support soft errors.
* Alan Cox : Fix MTU discovery pathological case
* when the remote claims no mtu!
* Marc Tamsky : TCP_CLOSE fix.
* Colin (G3TNE) : Send a reset on syn ack replies in
* window but wrong (fixes NT lpd problems)
* Pedro Roque : Better TCP window handling, delayed ack.
* Joerg Reuter : No modification of locked buffers in
* tcp_do_retransmit()
* Eric Schenk : Changed receiver side silly window
* avoidance algorithm to BSD style
* algorithm. This doubles throughput
* against machines running Solaris,
* and seems to result in general
* improvement.
* Eric Schenk : Changed receiver side silly window
* avoidance algorithm to BSD style
* algorithm. This doubles throughput
* against machines running Solaris,
* and seems to result in general
* improvement.
* Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD
* Willy Konynenberg : Transparent proxying support.
*
* To Fix:
* Fast path the code. Two things here - fix the window calculation
* so it doesn't iterate over the queue, also spot packets with no funny
* options arriving in order and process directly.
*
* Rewrite output state machine to use a single queue.
* Speed up input assembly algorithm.
* RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we
* could do with it working on IPv4
* User settable/learned rtt/max window/mtu
*
* Change the fundamental structure to a single send queue maintained
* by TCP (removing the bogus ip stuff [thus fixing mtu drops on
* active routes too]). Cut the queue off in tcp_retransmit/
* tcp_transmit.
* Change the receive queue to assemble as it goes. This lets us
* dispose of most of tcp_sequence, half of tcp_ack and chunks of
* tcp_data/tcp_read as well as the window shrink crud.
* Separate out duplicated code - tcp_alloc_skb, tcp_build_ack
* tcp_queue_skb seem obvious routines to extract.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or(at your option) any later version.
*
* Description of States:
*
* TCP_SYN_SENT sent a connection request, waiting for ack
*
* TCP_SYN_RECV received a connection request, sent ack,
* waiting for final ack in three-way handshake.
*
* TCP_ESTABLISHED connection established
*
* TCP_FIN_WAIT1 our side has shutdown, waiting to complete
* transmission of remaining buffered data
*
* TCP_FIN_WAIT2 all buffered data sent, waiting for remote
* to shutdown
*
* TCP_CLOSING both sides have shutdown but we still have
* data we have to finish sending
*
* TCP_TIME_WAIT timeout to catch resent junk before entering
* closed, can only be entered from FIN_WAIT2
* or CLOSING. Required because the other end
* may not have gotten our last ACK causing it
* to retransmit the data packet (which we ignore)
*
* TCP_CLOSE_WAIT remote side has shutdown and is waiting for
* us to finish writing our data and to shutdown
* (we have to close() to move on to LAST_ACK)
*
* TCP_LAST_ACK out side has shutdown after remote has
* shutdown. There may still be data in our
* buffer that we have to finish sending
*
* TCP_CLOSE socket is finished
*/
/*
* RFC1122 status:
* NOTE: I'm not going to be doing comments in the code for this one except
* for violations and the like. tcp.c is just too big... If I say something
* "does?" or "doesn't?", it means I'm not sure, and will have to hash it out
* with Alan. -- MS 950903
*
* Use of PSH (4.2.2.2)
* MAY aggregate data sent without the PSH flag. (does)
* MAY queue data received without the PSH flag. (does)
* SHOULD collapse successive PSH flags when it packetizes data. (doesn't)
* MAY implement PSH on send calls. (doesn't, thus:)
* MUST NOT buffer data indefinitely (doesn't [1 second])
* MUST set PSH on last segment (does)
* MAY pass received PSH to application layer (doesn't)
* SHOULD send maximum-sized segment whenever possible. (almost always does)
*
* Window Size (4.2.2.3, 4.2.2.16)
* MUST treat window size as an unsigned number (does)
* SHOULD treat window size as a 32-bit number (does not)
* MUST NOT shrink window once it is offered (does not normally)
*
* Urgent Pointer (4.2.2.4)
* **MUST point urgent pointer to last byte of urgent data (not right
* after). (doesn't, to be like BSD)
* MUST inform application layer asynchronously of incoming urgent
* data. (does)
* MUST provide application with means of determining the amount of
* urgent data pending. (does)
* **MUST support urgent data sequence of arbitrary length. (doesn't, but
* it's sort of tricky to fix, as urg_ptr is a 16-bit quantity)
* [Follows BSD 1 byte of urgent data]
*
* TCP Options (4.2.2.5)
* MUST be able to receive TCP options in any segment. (does)
* MUST ignore unsupported options (does)
*
* Maximum Segment Size Option (4.2.2.6)
* MUST implement both sending and receiving MSS. (does)
* SHOULD send an MSS with every SYN where receive MSS != 536 (MAY send
* it always). (does, even when MSS == 536, which is legal)
* MUST assume MSS == 536 if no MSS received at connection setup (does)
* MUST calculate "effective send MSS" correctly:
* min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts)
* (does - but allows operator override)
*
* TCP Checksum (4.2.2.7)
* MUST generate and check TCP checksum. (does)
*
* Initial Sequence Number Selection (4.2.2.8)
* MUST use the RFC 793 clock selection mechanism. (doesn't, but it's
* OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is
* necessary for 10Mbps networks - and harder than BSD to spoof!)
*
* Simultaneous Open Attempts (4.2.2.10)
* MUST support simultaneous open attempts (does)
*
* Recovery from Old Duplicate SYN (4.2.2.11)
* MUST keep track of active vs. passive open (does)
*
* RST segment (4.2.2.12)
* SHOULD allow an RST segment to contain data (does, but doesn't do
* anything with it, which is standard)
*
* Closing a Connection (4.2.2.13)
* MUST inform application of whether connection was closed by RST or
* normal close. (does)
* MAY allow "half-duplex" close (treat connection as closed for the
* local app, even before handshake is done). (does)
* MUST linger in TIME_WAIT for 2 * MSL (does)
*
* Retransmission Timeout (4.2.2.15)
* MUST implement Jacobson's slow start and congestion avoidance
* stuff. (does)
*
* Probing Zero Windows (4.2.2.17)
* MUST support probing of zero windows. (does)
* MAY keep offered window closed indefinitely. (does)
* MUST allow remote window to stay closed indefinitely. (does)
*
* Passive Open Calls (4.2.2.18)
* MUST NOT let new passive open affect other connections. (doesn't)
* MUST support passive opens (LISTENs) concurrently. (does)
*
* Time to Live (4.2.2.19)
* MUST make TCP TTL configurable. (does - IP_TTL option)
*
* Event Processing (4.2.2.20)
* SHOULD queue out-of-order segments. (does)
* MUST aggregate ACK segments whenever possible. (does but badly)
*
* Retransmission Timeout Calculation (4.2.3.1)
* MUST implement Karn's algorithm and Jacobson's algorithm for RTO
* calculation. (does, or at least explains them in the comments 8*b)
* SHOULD initialize RTO to 0 and RTT to 3. (does)
*
* When to Send an ACK Segment (4.2.3.2)
* SHOULD implement delayed ACK. (does)
* MUST keep ACK delay < 0.5 sec. (does)
*
* When to Send a Window Update (4.2.3.3)
* MUST implement receiver-side SWS. (does)
*
* When to Send Data (4.2.3.4)
* MUST implement sender-side SWS. (does)
* SHOULD implement Nagle algorithm. (does)
*
* TCP Connection Failures (4.2.3.5)
* MUST handle excessive retransmissions "properly" (see the RFC). (does)
* SHOULD inform application layer of soft errors. (does)
*
* TCP Keep-Alives (4.2.3.6)
* MAY provide keep-alives. (does)
* MUST make keep-alives configurable on a per-connection basis. (does)
* MUST default to no keep-alives. (does)
* **MUST make keep-alive interval configurable. (doesn't)
* **MUST make default keep-alive interval > 2 hours. (doesn't)
* MUST NOT interpret failure to ACK keep-alive packet as dead
* connection. (doesn't)
* SHOULD send keep-alive with no data. (does)
*
* TCP Multihoming (4.2.3.7)
* MUST get source address from IP layer before sending first
* SYN. (does)
* MUST use same local address for all segments of a connection. (does)
*
* IP Options (4.2.3.8)
* MUST ignore unsupported IP options. (does)
* MAY support Time Stamp and Record Route. (does)
* MUST allow application to specify a source route. (does)
* MUST allow received Source Route option to set route for all future
* segments on this connection. (does not (security issues))
*
* ICMP messages (4.2.3.9)
* MUST act on ICMP errors. (does)
* MUST slow transmission upon receipt of a Source Quench. (does)
* MUST NOT abort connection upon receipt of soft Destination
* Unreachables (0, 1, 5), Time Exceededs and Parameter
* Problems. (doesn't)
* SHOULD report soft Destination Unreachables etc. to the
* application. (does)
* SHOULD abort connection upon receipt of hard Destination Unreachable
* messages (2, 3, 4). (does)
*
* Remote Address Validation (4.2.3.10)
* MUST reject as an error OPEN for invalid remote IP address. (does)
* MUST ignore SYN with invalid source address. (does)
* MUST silently discard incoming SYN for broadcast/multicast
* address. (does)
*
* Asynchronous Reports (4.2.4.1)
* MUST provide mechanism for reporting soft errors to application
* layer. (does)
*
* Type of Service (4.2.4.2)
* MUST allow application layer to set Type of Service. (does IP_TOS)
*
* (Whew. -- MS 950903)
**/
#include <linux/config.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <net/icmp.h>
#include <net/tcp.h>
#include <asm/segment.h>
unsigned long seq_offset;
struct tcp_mib tcp_statistics;
static void tcp_close(struct sock *sk, unsigned long timeout);
/*
* Find someone to 'accept'. Must be called with
* the socket locked or with interrupts disabled
*/
static struct sk_buff *tcp_find_established(struct sock *s)
{
struct sk_buff *p=skb_peek(&s->receive_queue);
if(p==NULL)
return NULL;
do
{
if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
return p;
p=p->next;
}
while(p!=(struct sk_buff *)&s->receive_queue);
return NULL;
}
/*
* This routine closes sockets which have been at least partially
* opened, but not yet accepted. Currently it is only called by
* tcp_close, and timeout mirrors the value there.
*/
static void tcp_close_pending (struct sock *sk)
{
struct sk_buff *skb;
while ((skb = skb_dequeue(&sk->receive_queue)) != NULL)
{
tcp_close(skb->sk, 0);
kfree_skb(skb, FREE_READ);
}
return;
}
/*
* Enter the time wait state.
*/
void tcp_time_wait(struct sock *sk)
{
tcp_set_state(sk,TCP_TIME_WAIT);
sk->shutdown = SHUTDOWN_MASK;
if (!sk->dead)
sk->state_change(sk);
tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
}
/*
* This routine is called by the ICMP module when it gets some
* sort of error condition. If err < 0 then the socket should
* be closed and the error returned to the user. If err > 0
* it's just the icmp type << 8 | icmp code. After adjustment
* header points to the first 8 bytes of the tcp header. We need
* to find the appropriate port.
*/
void tcp_err(int type, int code, unsigned char *header, __u32 daddr,
__u32 saddr, struct inet_protocol *protocol)
{
struct tcphdr *th = (struct tcphdr *)header;
struct sock *sk;
/*
* This one is _WRONG_. FIXME urgently.
*/
#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
struct iphdr *iph=(struct iphdr *)(header-sizeof(struct iphdr));
#endif
th =(struct tcphdr *)header;
sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr, 0, 0);
if (sk == NULL)
return;
if (type == ICMP_SOURCE_QUENCH)
{
/*
* FIXME:
* Follow BSD for now and just reduce cong_window to 1 again.
* It is possible that we just want to reduce the
* window by 1/2, or that we want to reduce ssthresh by 1/2
* here as well.
*/
sk->cong_window = 1;
sk->high_seq = sk->sent_seq;
return;
}
if (type == ICMP_PARAMETERPROB)
{
sk->err=EPROTO;
sk->error_report(sk);
}
#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
{
struct rtable * rt;
/*
* Ugly trick to pass MTU to protocol layer.
* Really we should add argument "info" to error handler.
*/
unsigned short new_mtu = ntohs(iph->id);
if ((rt = sk->ip_route_cache) != NULL)
if (rt->rt_mtu > new_mtu)
rt->rt_mtu = new_mtu;
if (sk->mtu > new_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr)
&& new_mtu > sizeof(struct iphdr)+sizeof(struct tcphdr))
sk->mtu = new_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
return;
}
#endif
/*
* If we've already connected we will keep trying
* until we time out, or the user gives up.
*/
if (code < 13)
{
if(icmp_err_convert[code].fatal || sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
{
sk->err = icmp_err_convert[code].errno;
if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
{
tcp_statistics.TcpAttemptFails++;
tcp_set_state(sk,TCP_CLOSE);
sk->error_report(sk); /* Wake people up to see the error (see connect in sock.c) */
}
}
else /* Only an error on timeout */
sk->err_soft = icmp_err_convert[code].errno;
}
}
/*
* Walk down the receive queue counting readable data until we hit the end or we find a gap
* in the received data queue (ie a frame missing that needs sending to us). Not
* sorting using two queues as data arrives makes life so much harder.
*/
static int tcp_readable(struct sock *sk)
{
unsigned long counted;
unsigned long amount;
struct sk_buff *skb;
int sum;
unsigned long flags;
if(sk && sk->debug)
printk("tcp_readable: %p - ",sk);
save_flags(flags);
cli();
if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
{
restore_flags(flags);
if(sk && sk->debug)
printk("empty\n");
return(0);
}
counted = sk->copied_seq; /* Where we are at the moment */
amount = 0;
/*
* Do until a push or until we are out of data.
*/
do
{
if (before(counted, skb->seq)) /* Found a hole so stops here */
break;
sum = skb->len - (counted - skb->seq); /* Length - header but start from where we are up to (avoid overlaps) */
if (skb->h.th->syn)
sum++;
if (sum > 0)
{ /* Add it up, move on */
amount += sum;
if (skb->h.th->syn)
amount--;
counted += sum;
}
/*
* Don't count urg data ... but do it in the right place!
* Consider: "old_data (ptr is here) URG PUSH data"
* The old code would stop at the first push because
* it counted the urg (amount==1) and then does amount--
* *after* the loop. This means tcp_readable() always
* returned zero if any URG PUSH was in the queue, even
* though there was normal data available. If we subtract
* the urg data right here, we even get it to work for more
* than one URG PUSH skb without normal data.
* This means that select() finally works now with urg data
* in the queue. Note that rlogin was never affected
* because it doesn't use select(); it uses two processes
* and a blocking read(). And the queue scan in tcp_read()
* was correct. Mike <pall@rz.uni-karlsruhe.de>
*/
if (skb->h.th->urg)
amount--; /* don't count urg data */
/* if (amount && skb->h.th->psh) break;*/
skb = skb->next;
}
while(skb != (struct sk_buff *)&sk->receive_queue);
restore_flags(flags);
if(sk->debug)
printk("got %lu bytes.\n",amount);
return(amount);
}
/*
* LISTEN is a special case for select..
*/
static int tcp_listen_select(struct sock *sk, int sel_type, select_table *wait)
{
if (sel_type == SEL_IN) {
struct sk_buff * skb;
lock_sock(sk);
skb = tcp_find_established(sk);
release_sock(sk);
if (skb)
return 1;
select_wait(sk->sleep,wait);
return 0;
}
return 0;
}
/*
* Wait for a TCP event.
*
* Note that we don't need to lock the socket, as the upper select layers
* take care of normal races (between the test and the event) and we don't
* go look at any of the socket buffers directly.
*/
static int tcp_select(struct sock *sk, int sel_type, select_table *wait)
{
if (sk->state == TCP_LISTEN)
return tcp_listen_select(sk, sel_type, wait);
switch(sel_type) {
case SEL_IN:
if (sk->err)
return 1;
if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
break;
if (sk->shutdown & RCV_SHUTDOWN)
return 1;
if (sk->acked_seq == sk->copied_seq)
break;
if (sk->urg_seq != sk->copied_seq ||
sk->acked_seq != sk->copied_seq+1 ||
sk->urginline || !sk->urg_data)
return 1;
break;
case SEL_OUT:
if (sk->err)
return 1;
if (sk->shutdown & SEND_SHUTDOWN)
return 0;
if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
break;
/*
* This is now right thanks to a small fix
* by Matt Dillon.
*/
if (sock_wspace(sk) < sk->mtu+128+sk->prot->max_header)
break;
return 1;
case SEL_EX:
if (sk->urg_data)
return 1;
break;
}
select_wait(sk->sleep, wait);
return 0;
}
int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
int err;
switch(cmd)
{
case TIOCINQ:
#ifdef FIXME /* FIXME: */
case FIONREAD:
#endif
{
unsigned long amount;
if (sk->state == TCP_LISTEN)
return(-EINVAL);
lock_sock(sk);
amount = tcp_readable(sk);
release_sock(sk);
err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
if(err)
return err;
put_user(amount, (int *)arg);
return(0);
}
case SIOCATMARK:
{
int answ = sk->urg_data && sk->urg_seq == sk->copied_seq;
err = verify_area(VERIFY_WRITE,(void *) arg, sizeof(int));
if (err)
return err;
put_user(answ,(int *) arg);
return(0);
}
case TIOCOUTQ:
{
unsigned long amount;
if (sk->state == TCP_LISTEN) return(-EINVAL);
amount = sock_wspace(sk);
err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(int));
if(err)
return err;
put_user(amount, (int *)arg);
return(0);
}
default:
return(-EINVAL);
}
}
/*
* This routine computes a TCP checksum.
*
* Modified January 1995 from a go-faster DOS routine by
* Jorge Cwik <jorge@laser.satlink.net>
*/
#undef DEBUG_TCP_CHECK
void tcp_send_check(struct tcphdr *th, unsigned long saddr,
unsigned long daddr, int len, struct sk_buff *skb)
{
#ifdef DEBUG_TCP_CHECK
u16 check;
#endif
th->check = 0;
th->check = tcp_check(th, len, saddr, daddr,
csum_partial((char *)th,sizeof(*th),skb->csum));
#ifdef DEBUG_TCP_CHECK
check = th->check;
th->check = 0;
th->check = tcp_check(th, len, saddr, daddr,
csum_partial((char *)th,len,0));
if (check != th->check) {
static int count = 0;
if (++count < 10) {
printk("Checksum %x (%x) from %p\n", th->check, check,
(&th)[-1]);
printk("TCP=<off:%d a:%d s:%d f:%d>\n", th->doff*4, th->ack, th->syn, th->fin);
}
}
#endif
}
/*
* This routine builds a generic TCP header.
*/
static inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push)
{
memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
th->psh = (push == 0) ? 1 : 0;
th->seq = htonl(sk->write_seq);
th->ack_seq = htonl(sk->acked_seq);
th->window = htons(tcp_select_window(sk));
return(sizeof(*th));
}
/*
* Wait for a socket to get into the connected state
*/
static void wait_for_tcp_connect(struct sock * sk)
{
release_sock(sk);
cli();
if (sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT && sk->err == 0)
{
interruptible_sleep_on(sk->sleep);
}
sti();
lock_sock(sk);
}
/*
* Wait for more memory for a socket
*/
static void wait_for_tcp_memory(struct sock * sk)
{
release_sock(sk);
cli();
if (sk->wmem_alloc*2 > sk->sndbuf &&
(sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
&& sk->err == 0)
{
sk->socket->flags &= ~SO_NOSPACE;
interruptible_sleep_on(sk->sleep);
}
sti();
lock_sock(sk);
}
/*
* This routine copies from a user buffer into a socket,
* and starts the transmit system.
*/
static int do_tcp_sendmsg(struct sock *sk,
int iovlen, struct iovec *iov,
int len, int nonblock, int flags)
{
int copied = 0;
struct device *dev = NULL;
/*
* Wait for a connection to finish.
*/
while (sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)
{
if (sk->err)
return sock_error(sk);
if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
{
if (sk->keepopen)
send_sig(SIGPIPE, current, 0);
return -EPIPE;
}
if (nonblock)
return -EAGAIN;
if (current->signal & ~current->blocked)
return -ERESTARTSYS;
wait_for_tcp_connect(sk);
}
/*
* Ok commence sending
*/
while (--iovlen >= 0)
{
int seglen=iov->iov_len;
unsigned char * from=iov->iov_base;
iov++;
while(seglen > 0)
{
int copy, delay;
int tmp;
struct sk_buff *skb;
/*
* Stop on errors
*/
if (sk->err)
{
if (copied)
return copied;
return sock_error(sk);
}
/*
* Make sure that we are established.
*/
if (sk->shutdown & SEND_SHUTDOWN)
{
if (copied)
return copied;
return -EPIPE;
}
/*
* The following code can result in copy <= if sk->mss is ever
* decreased. It shouldn't be. sk->mss is min(sk->mtu, sk->max_window).
* sk->mtu is constant once SYN processing is finished. I.e. we
* had better not get here until we've seen his SYN and at least one
* valid ack. (The SYN sets sk->mtu and the ack sets sk->max_window.)
* But ESTABLISHED should guarantee that. sk->max_window is by definition
* non-decreasing. Note that any ioctl to set user_mss must be done
* before the exchange of SYN's. If the initial ack from the other
* end has a window of 0, max_window and thus mss will both be 0.
*/
/*
* Now we need to check if we have a half built packet.
*/
#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
/*
* FIXME: I'm almost sure that this fragment is BUG,
* but it works... I do not know why 8) --ANK
*
* Really, we should rebuild all the queues...
* It's difficult. Temporary hack is to send all
* queued segments with allowed fragmentation.
*/
{
int new_mss = min(sk->mtu, sk->max_window);
if (new_mss < sk->mss)
{
tcp_send_partial(sk);
sk->mss = new_mss;
}
}
#endif
if ((skb = tcp_dequeue_partial(sk)) != NULL)
{
int tcp_size;
tcp_size = skb->tail - (unsigned char *)(skb->h.th + 1);
/* Add more stuff to the end of skb->len */
if (!(flags & MSG_OOB))
{
copy = min(sk->mss - tcp_size, seglen);
if (copy <= 0)
{
printk(KERN_CRIT "TCP: **bug**: \"copy\" <= 0\n");
return -EFAULT;
}
tcp_size += copy;
memcpy_fromfs(skb_put(skb,copy), from, copy);
skb->csum = csum_partial(skb->tail - tcp_size, tcp_size, 0);
from += copy;
copied += copy;
len -= copy;
sk->write_seq += copy;
seglen -= copy;
}
if (tcp_size >= sk->mss || (flags & MSG_OOB) || !sk->packets_out)
tcp_send_skb(sk, skb);
else
tcp_enqueue_partial(skb, sk);
continue;
}
/*
* We also need to worry about the window.
* If window < 1/2 the maximum window we've seen from this
* host, don't use it. This is sender side
* silly window prevention, as specified in RFC1122.
* (Note that this is different than earlier versions of
* SWS prevention, e.g. RFC813.). What we actually do is
* use the whole MSS. Since the results in the right
* edge of the packet being outside the window, it will
* be queued for later rather than sent.
*/
copy = sk->window_seq - sk->write_seq;
if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
copy = sk->mss;
if (copy > seglen)
copy = seglen;
if (copy <= 0)
{
printk(KERN_CRIT "TCP: **bug**: copy=%d, sk->mss=%d\n", copy, sk->mss);
return -EFAULT;
}
/*
* We should really check the window here also.
*/
delay = 0;
tmp = copy + sk->prot->max_header + 15;
if (copy < sk->mss && !(flags & MSG_OOB) && sk->packets_out)
{
tmp = tmp - copy + sk->mtu + 128;
delay = 1;
}
skb = sock_wmalloc(sk, tmp, 0, GFP_KERNEL);
/*
* If we didn't get any memory, we need to sleep.
*/
if (skb == NULL)
{
sk->socket->flags |= SO_NOSPACE;
if (nonblock)
{
if (copied)
return copied;
return -EAGAIN;
}
if (current->signal & ~current->blocked)
{
if (copied)
return copied;
return -ERESTARTSYS;
}
wait_for_tcp_memory(sk);
continue;
}
skb->sk = sk;
skb->free = 0;
skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);
/*
* FIXME: we need to optimize this.
* Perhaps some hints here would be good.
*/
tmp = sk->prot->build_header(skb, sk->saddr, sk->daddr, &dev,
IPPROTO_TCP, sk->opt, skb->truesize,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
if (tmp < 0 )
{
sock_wfree(sk, skb);
if (copied)
return(copied);
return(tmp);
}
#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
skb->ip_hdr->frag_off |= htons(IP_DF);
#endif
skb->dev = dev;
skb->h.th =(struct tcphdr *)skb_put(skb,sizeof(struct tcphdr));
tmp = tcp_build_header(skb->h.th, sk, seglen-copy);
if (tmp < 0)
{
sock_wfree(sk, skb);
if (copied)
return(copied);
return(tmp);
}
if (flags & MSG_OOB)
{
skb->h.th->urg = 1;
skb->h.th->urg_ptr = ntohs(copy);
}
skb->csum = csum_partial_copy_fromuser(from,
skb_put(skb,copy), copy, 0);
from += copy;
copied += copy;
len -= copy;
seglen -= copy;
skb->free = 0;
sk->write_seq += copy;
if (delay)
{
tcp_enqueue_partial(skb, sk);
continue;
}
tcp_send_skb(sk, skb);
}
}
sk->err = 0;
return copied;
}
static int tcp_sendmsg(struct sock *sk, struct msghdr *msg,
int len, int nonblock, int flags)
{
int retval = -EINVAL;
/*
* Do sanity checking for sendmsg/sendto/send
*/
if (flags & ~(MSG_OOB|MSG_DONTROUTE))
goto out;
if (msg->msg_name) {
struct sockaddr_in *addr=(struct sockaddr_in *)msg->msg_name;
if (msg->msg_namelen < sizeof(*addr))
goto out;
if (addr->sin_family && addr->sin_family != AF_INET)
goto out;
retval = -ENOTCONN;
if(sk->state == TCP_CLOSE)
goto out;
retval = -EISCONN;
if (addr->sin_port != sk->dummy_th.dest)
goto out;
if (addr->sin_addr.s_addr != sk->daddr)
goto out;
}
lock_sock(sk);
retval = do_tcp_sendmsg(sk, msg->msg_iovlen, msg->msg_iov, len, nonblock, flags);
/*
* Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
* interactive fast network servers. It's meant to be on and
* it really improves the throughput though not the echo time
* on my slow slip link - Alan
*
* If not nagling we can send on the before case too..
*/
if (sk->partial) {
if (!sk->packets_out ||
(sk->nonagle && before(sk->write_seq , sk->window_seq))) {
tcp_send_partial(sk);
}
}
release_sock(sk);
out:
return retval;
}
/*
* Send an ack if one is backlogged at this point.
*/
void tcp_read_wakeup(struct sock *sk)
{
if (!sk->ack_backlog)
return;
/*
* If we're closed, don't send an ack, or we'll get a RST
* from the closed destination.
*/
if ((sk->state == TCP_CLOSE) || (sk->state == TCP_TIME_WAIT))
return;
tcp_send_ack(sk);
}
/*
* Handle reading urgent data. BSD has very simple semantics for
* this, no blocking and very strange errors 8)
*/
static int tcp_recv_urg(struct sock * sk, int nonblock,
struct msghdr *msg, int len, int flags, int *addr_len)
{
/*
* No URG data to read
*/
if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
return -EINVAL; /* Yes this is right ! */
if (sk->err)
return sock_error(sk);
if (sk->state == TCP_CLOSE || sk->done)
{
if (!sk->done)
{
sk->done = 1;
return 0;
}
return -ENOTCONN;
}
if (sk->shutdown & RCV_SHUTDOWN)
{
sk->done = 1;
return 0;
}
lock_sock(sk);
if (sk->urg_data & URG_VALID)
{
char c = sk->urg_data;
if (!(flags & MSG_PEEK))
sk->urg_data = URG_READ;
memcpy_toiovec(msg->msg_iov, &c, 1);
if(msg->msg_name)
{
struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
sin->sin_family=AF_INET;
sin->sin_addr.s_addr=sk->daddr;
sin->sin_port=sk->dummy_th.dest;
}
if(addr_len)
*addr_len=sizeof(struct sockaddr_in);
release_sock(sk);
return 1;
}
release_sock(sk);
/*
* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
* the available implementations agree in this case:
* this call should never block, independent of the
* blocking state of the socket.
* Mike <pall@rz.uni-karlsruhe.de>
*/
return -EAGAIN;
}
/*
* Release a skb if it is no longer needed. This routine
* must be called with interrupts disabled or with the
* socket locked so that the sk_buff queue operation is ok.
*/
static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb)
{
skb->sk = sk;
__skb_unlink(skb, &sk->receive_queue);
kfree_skb(skb, FREE_READ);
}
/*
* FIXME:
* This routine frees used buffers.
* It should consider sending an ACK to let the
* other end know we now have a bigger window.
*/
static void cleanup_rbuf(struct sock *sk)
{
/*
* NOTE! The socket must be locked, so that we don't get
* a messed-up receive queue.
*/
while (!skb_queue_empty(&sk->receive_queue)) {
struct sk_buff *skb = sk->receive_queue.next;
if (!skb->used || skb->users)
break;
tcp_eat_skb(sk, skb);
}
/*
* Tell the world if we raised the window.
*/
if (tcp_raise_window(sk))
tcp_send_ack(sk);
}
/*
* This routine copies from a sock struct into the user buffer.
*/
static int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
int len, int nonblock, int flags, int *addr_len)
{
struct wait_queue wait = { current, NULL };
int copied = 0;
u32 peek_seq;
volatile u32 *seq; /* So gcc doesn't overoptimise */
unsigned long used;
/*
* This error should be checked.
*/
if (sk->state == TCP_LISTEN)
return -ENOTCONN;
/*
* Urgent data needs to be handled specially.
*/
if (flags & MSG_OOB)
return tcp_recv_urg(sk, nonblock, msg, len, flags, addr_len);
/*
* Copying sequence to update. This is volatile to handle
* the multi-reader case neatly (memcpy_to/fromfs might be
* inline and thus not flush cached variables otherwise).
*/
peek_seq = sk->copied_seq;
seq = &sk->copied_seq;
if (flags & MSG_PEEK)
seq = &peek_seq;
add_wait_queue(sk->sleep, &wait);
lock_sock(sk);
while (len > 0)
{
struct sk_buff * skb;
u32 offset;
/*
* Are we at urgent data? Stop if we have read anything.
*/
if (copied && sk->urg_data && sk->urg_seq == *seq)
break;
/*
* We need to check signals first, to get correct SIGURG
* handling.
*/
if (current->signal & ~current->blocked) {
if (copied)
break;
copied = -ERESTARTSYS;
break;
}
/*
* Next get a buffer.
*/
current->state = TASK_INTERRUPTIBLE;
skb = skb_peek(&sk->receive_queue);
do
{
if (!skb)
break;
if (before(*seq, skb->seq))
break;
offset = *seq - skb->seq;
if (skb->h.th->syn)
offset--;
if (offset < skb->len)
goto found_ok_skb;
if (skb->h.th->fin)
goto found_fin_ok;
if (!(flags & MSG_PEEK))
skb->used = 1;
skb = skb->next;
}
while (skb != (struct sk_buff *)&sk->receive_queue);
if (copied)
break;
if (sk->err)
{
copied = sock_error(sk);
break;
}
if (sk->state == TCP_CLOSE)
{
if (!sk->done)
{
sk->done = 1;
break;
}
copied = -ENOTCONN;
break;
}
if (sk->shutdown & RCV_SHUTDOWN)
{
sk->done = 1;
break;
}
if (nonblock)
{
copied = -EAGAIN;
break;
}
cleanup_rbuf(sk);
release_sock(sk);
sk->socket->flags |= SO_WAITDATA;
schedule();
sk->socket->flags &= ~SO_WAITDATA;
lock_sock(sk);
continue;
found_ok_skb:
/*
* Lock the buffer. We can be fairly relaxed as
* an interrupt will never steal a buffer we are
* using unless I've missed something serious in
* tcp_data.
*/
skb->users++;
/*
* Ok so how much can we use ?
*/
used = skb->len - offset;
if (len < used)
used = len;
/*
* Do we have urgent data here?
*/
if (sk->urg_data)
{
u32 urg_offset = sk->urg_seq - *seq;
if (urg_offset < used)
{
if (!urg_offset)
{
if (!sk->urginline)
{
++*seq;
offset++;
used--;
}
}
else
used = urg_offset;
}
}
/*
* Copy it - We _MUST_ update *seq first so that we
* don't ever double read when we have dual readers
*/
*seq += used;
/*
* This memcpy_tofs can sleep. If it sleeps and we
* do a second read it relies on the skb->users to avoid
* a crash when cleanup_rbuf() gets called.
*/
memcpy_toiovec(msg->msg_iov,((unsigned char *)skb->h.th) +
skb->h.th->doff*4 + offset, used);
copied += used;
len -= used;
/*
* We now will not sleep again until we are finished
* with skb. Sorry if you are doing the SMP port
* but you'll just have to fix it neatly ;)
*/
skb->users --;
if (after(sk->copied_seq,sk->urg_seq))
sk->urg_data = 0;
if (used + offset < skb->len)
continue;
/*
* Process the FIN.
*/
if (skb->h.th->fin)
goto found_fin_ok;
if (flags & MSG_PEEK)
continue;
skb->used = 1;
if (!skb->users)
tcp_eat_skb(sk, skb);
continue;
found_fin_ok:
++*seq;
if (flags & MSG_PEEK)
break;
/*
* All is done
*/
skb->used = 1;
sk->shutdown |= RCV_SHUTDOWN;
break;
}
if(copied>0 && msg->msg_name)
{
struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
sin->sin_family=AF_INET;
sin->sin_addr.s_addr=sk->daddr;
sin->sin_port=sk->dummy_th.dest;
}
if(addr_len)
*addr_len=sizeof(struct sockaddr_in);
remove_wait_queue(sk->sleep, &wait);
current->state = TASK_RUNNING;
/* Clean up data we have read: This will do ACK frames */
cleanup_rbuf(sk);
release_sock(sk);
return copied;
}
/*
* State processing on a close. This implements the state shift for
* sending our FIN frame. Note that we only send a FIN for some
* states. A shutdown() may have already sent the FIN, or we may be
* closed.
*/
static int tcp_close_state(struct sock *sk, int dead)
{
int ns=TCP_CLOSE;
int send_fin=0;
switch(sk->state)
{
case TCP_SYN_SENT: /* No SYN back, no FIN needed */
break;
case TCP_SYN_RECV:
case TCP_ESTABLISHED: /* Closedown begin */
ns=TCP_FIN_WAIT1;
send_fin=1;
break;
case TCP_FIN_WAIT1: /* Already closing, or FIN sent: no change */
case TCP_FIN_WAIT2:
case TCP_CLOSING:
ns=sk->state;
break;
case TCP_CLOSE:
case TCP_LISTEN:
break;
case TCP_CLOSE_WAIT: /* They have FIN'd us. We send our FIN and
wait only for the ACK */
ns=TCP_LAST_ACK;
send_fin=1;
}
tcp_set_state(sk,ns);
/*
* This is a (useful) BSD violating of the RFC. There is a
* problem with TCP as specified in that the other end could
* keep a socket open forever with no application left this end.
* We use a 3 minute timeout (about the same as BSD) then kill
* our end. If they send after that then tough - BUT: long enough
* that we won't make the old 4*rto = almost no time - whoops
* reset mistake.
*/
if(dead && ns==TCP_FIN_WAIT2)
{
int timer_active=del_timer(&sk->timer);
if(timer_active)
add_timer(&sk->timer);
else
tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
}
return send_fin;
}
/*
* Shutdown the sending side of a connection. Much like close except
* that we don't receive shut down or set sk->dead.
*/
void tcp_shutdown(struct sock *sk, int how)
{
/*
* We need to grab some memory, and put together a FIN,
* and then put it into the queue to be sent.
* Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
*/
if (!(how & SEND_SHUTDOWN))
return;
/*
* If we've already sent a FIN, or it's a closed state
*/
if (sk->state == TCP_FIN_WAIT1 ||
sk->state == TCP_FIN_WAIT2 ||
sk->state == TCP_CLOSING ||
sk->state == TCP_LAST_ACK ||
sk->state == TCP_TIME_WAIT ||
sk->state == TCP_CLOSE ||
sk->state == TCP_LISTEN
)
{
return;
}
lock_sock(sk);
/*
* flag that the sender has shutdown
*/
sk->shutdown |= SEND_SHUTDOWN;
/*
* Clear out any half completed packets.
*/
if (sk->partial)
tcp_send_partial(sk);
/*
* FIN if needed
*/
if (tcp_close_state(sk,0))
tcp_send_fin(sk);
release_sock(sk);
}
/*
* Return 1 if we still have things to send in our buffers.
*/
static inline int closing(struct sock * sk)
{
switch (sk->state) {
case TCP_FIN_WAIT1:
case TCP_CLOSING:
case TCP_LAST_ACK:
return 1;
}
return 0;
}
static void tcp_close(struct sock *sk, unsigned long timeout)
{
struct sk_buff *skb;
/*
* We need to grab some memory, and put together a FIN,
* and then put it into the queue to be sent.
*/
lock_sock(sk);
tcp_cache_zap();
if(sk->state == TCP_LISTEN)
{
/* Special case */
tcp_set_state(sk, TCP_CLOSE);
tcp_close_pending(sk);
release_sock(sk);
sk->dead = 1;
return;
}
sk->keepopen = 1;
sk->shutdown = SHUTDOWN_MASK;
if (!sk->dead)
sk->state_change(sk);
/*
* We need to flush the recv. buffs. We do this only on the
* descriptor close, not protocol-sourced closes, because the
* reader process may not have drained the data yet!
*/
while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
kfree_skb(skb, FREE_READ);
/*
* Get rid off any half-completed packets.
*/
if (sk->partial)
tcp_send_partial(sk);
/*
* Timeout is not the same thing - however the code likes
* to send both the same way (sigh).
*/
if (tcp_close_state(sk,1)==1)
{
tcp_send_fin(sk);
}
if (timeout) {
cli();
release_sock(sk);
current->timeout = timeout;
while(closing(sk) && current->timeout)
{
interruptible_sleep_on(sk->sleep);
if (current->signal & ~current->blocked)
{
break;
}
}
current->timeout=0;
lock_sock(sk);
sti();
}
/*
* This will destroy it. The timers will take care of actually
* free'ing up the memory.
*/
tcp_cache_zap(); /* Kill the cache again. */
/* Now that the socket is dead, if we are in the FIN_WAIT2 state
* we may need to set up a timer.
*/
if (sk->state==TCP_FIN_WAIT2)
{
int timer_active=del_timer(&sk->timer);
if(timer_active)
add_timer(&sk->timer);
else
tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
}
release_sock(sk);
sk->dead = 1;
}
/*
* Wait for a incoming connection, avoid race
* conditions. This must be called with the socket
* locked.
*/
static struct sk_buff * wait_for_connect(struct sock * sk)
{
struct wait_queue wait = { current, NULL };
struct sk_buff * skb = NULL;
add_wait_queue(sk->sleep, &wait);
for (;;) {
current->state = TASK_INTERRUPTIBLE;
release_sock(sk);
schedule();
lock_sock(sk);
skb = tcp_find_established(sk);
if (skb)
break;
if (current->signal & ~current->blocked)
break;
}
remove_wait_queue(sk->sleep, &wait);
return skb;
}
/*
* This will accept the next outstanding connection.
*
* Be careful about race conditions here - this is subtle.
*/
static struct sock *tcp_accept(struct sock *sk, int flags)
{
int error;
struct sk_buff *skb;
struct sock *newsk = NULL;
/*
* We need to make sure that this socket is listening,
* and that it has something pending.
*/
error = EINVAL;
if (sk->state != TCP_LISTEN)
goto no_listen;
lock_sock(sk);
skb = tcp_find_established(sk);
if (skb) {
got_new_connect:
__skb_unlink(skb, &sk->receive_queue);
newsk = skb->sk;
kfree_skb(skb, FREE_READ);
sk->ack_backlog--;
error = 0;
out:
release_sock(sk);
no_listen:
sk->err = error;
return newsk;
}
error = EAGAIN;
if (flags & O_NONBLOCK)
goto out;
skb = wait_for_connect(sk);
if (skb)
goto got_new_connect;
error = ERESTARTSYS;
goto out;
}
/*
* This will initiate an outgoing connection.
*/
static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
{
struct sk_buff *buff;
struct device *dev=NULL;
unsigned char *ptr;
int tmp;
int atype;
struct tcphdr *t1;
struct rtable *rt;
if (sk->state != TCP_CLOSE)
return(-EISCONN);
/*
* Don't allow a double connect.
*/
if(sk->daddr)
return -EINVAL;
if (addr_len < 8)
return(-EINVAL);
if (usin->sin_family && usin->sin_family != AF_INET)
return(-EAFNOSUPPORT);
/*
* connect() to INADDR_ANY means loopback (BSD'ism).
*/
if(usin->sin_addr.s_addr==INADDR_ANY)
usin->sin_addr.s_addr=ip_my_addr();
/*
* Don't want a TCP connection going to a broadcast address
*/
if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST)
return -ENETUNREACH;
lock_sock(sk);
sk->daddr = usin->sin_addr.s_addr;
sk->write_seq = tcp_init_seq();
sk->window_seq = sk->write_seq;
sk->rcv_ack_seq = sk->write_seq -1;
sk->rcv_ack_cnt = 1;
sk->err = 0;
sk->dummy_th.dest = usin->sin_port;
release_sock(sk);
buff = sock_wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
if (buff == NULL)
{
return(-ENOMEM);
}
lock_sock(sk);
buff->sk = sk;
buff->free = 0;
buff->localroute = sk->localroute;
/*
* Put in the IP header and routing stuff.
*/
tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
IPPROTO_TCP, sk->opt, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
if (tmp < 0)
{
sock_wfree(sk, buff);
release_sock(sk);
return(-ENETUNREACH);
}
if ((rt = sk->ip_route_cache) != NULL && !sk->saddr)
sk->saddr = rt->rt_src;
sk->rcv_saddr = sk->saddr;
t1 = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr));
memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
buff->seq = sk->write_seq++;
t1->seq = htonl(buff->seq);
sk->sent_seq = sk->write_seq;
buff->end_seq = sk->write_seq;
t1->ack = 0;
t1->window = 2;
t1->syn = 1;
t1->doff = 6;
/* use 512 or whatever user asked for */
if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
sk->window_clamp=rt->rt_window;
else
sk->window_clamp=0;
if (sk->user_mss)
sk->mtu = sk->user_mss;
else if (rt)
sk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
else
sk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
/*
* but not bigger than device MTU
*/
if(sk->mtu <32)
sk->mtu = 32; /* Sanity limit */
sk->mtu = min(sk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
#ifdef CONFIG_SKIP
/*
* SKIP devices set their MTU to 65535. This is so they can take packets
* unfragmented to security process then fragment. They could lie to the
* TCP layer about a suitable MTU, but it's easier to let skip sort it out
* simply because the final package we want unfragmented is going to be
*
* [IPHDR][IPSP][Security data][Modified TCP data][Security data]
*/
if(skip_pick_mtu!=NULL) /* If SKIP is loaded.. */
sk->mtu=skip_pick_mtu(sk->mtu,dev);
#endif
/*
* Put in the TCP options to say MTU.
*/
ptr = skb_put(buff,4);
ptr[0] = 2;
ptr[1] = 4;
ptr[2] = (sk->mtu) >> 8;
ptr[3] = (sk->mtu) & 0xff;
buff->csum = csum_partial(ptr, 4, 0);
tcp_send_check(t1, sk->saddr, sk->daddr,
sizeof(struct tcphdr) + 4, buff);
/*
* This must go first otherwise a really quick response will get reset.
*/
tcp_cache_zap();
tcp_set_state(sk,TCP_SYN_SENT);
if(rt&&rt->rt_flags&RTF_IRTT)
sk->rto = rt->rt_irtt;
else
sk->rto = TCP_TIMEOUT_INIT;
sk->delack_timer.function = tcp_delack_timer;
sk->delack_timer.data = (unsigned long) sk;
sk->retransmit_timer.function = tcp_retransmit_timer;
sk->retransmit_timer.data = (unsigned long)sk;
sk->retransmits = 0;
sk->prot->queue_xmit(sk, dev, buff, 0);
tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
tcp_statistics.TcpActiveOpens++;
tcp_statistics.TcpOutSegs++;
release_sock(sk);
return(0);
}
/*
* Socket option code for TCP.
*/
int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen)
{
int val,err;
if(level!=SOL_TCP)
return ip_setsockopt(sk,level,optname,optval,optlen);
if (optval == NULL)
return(-EINVAL);
err=verify_area(VERIFY_READ, optval, sizeof(int));
if(err)
return err;
val = get_user((int *)optval);
switch(optname)
{
case TCP_MAXSEG:
/*
* values greater than interface MTU won't take effect. however at
* the point when this call is done we typically don't yet know
* which interface is going to be used
*/
if(val<1||val>MAX_WINDOW)
return -EINVAL;
sk->user_mss=val;
return 0;
case TCP_NODELAY:
sk->nonagle=(val==0)?0:1;
return 0;
default:
return(-ENOPROTOOPT);
}
}
int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen)
{
int val,err;
if(level!=SOL_TCP)
return ip_getsockopt(sk,level,optname,optval,optlen);
switch(optname)
{
case TCP_MAXSEG:
val=sk->user_mss;
break;
case TCP_NODELAY:
val=sk->nonagle;
break;
default:
return(-ENOPROTOOPT);
}
err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
if(err)
return err;
put_user(sizeof(int),(int *) optlen);
err=verify_area(VERIFY_WRITE, optval, sizeof(int));
if(err)
return err;
put_user(val,(int *)optval);
return(0);
}
struct proto tcp_prot = {
tcp_close,
ip_build_header,
tcp_connect,
tcp_accept,
ip_queue_xmit,
tcp_retransmit,
tcp_write_wakeup,
tcp_read_wakeup,
tcp_rcv,
tcp_select,
tcp_ioctl,
NULL,
tcp_shutdown,
tcp_setsockopt,
tcp_getsockopt,
tcp_sendmsg,
tcp_recvmsg,
NULL, /* No special bind() */
128,
0,
"TCP",
0, 0,
{NULL,}
};