diff -Nuarp linux-2.4.25-web100/include/linux/skbuff.h linux-2.4.25-web100-cubic/include/linux/skbuff.h --- linux-2.4.25-web100/include/linux/skbuff.h 2003-08-25 06:44:44.000000000 -0500 +++ linux-2.4.25-web100-cubic/include/linux/skbuff.h 2004-06-30 22:58:33.000000000 -0500 @@ -566,8 +566,13 @@ static inline struct sk_buff *__skb_dequ { struct sk_buff *next, *prev, *result; + if (list==NULL) return NULL; /*From H-TCP*/ + prev = (struct sk_buff *) list; next = prev->next; + + if (next==NULL) return NULL; /*From H-TCP*/ + result = NULL; if (next != prev) { result = next; diff -Nuarp linux-2.4.25-web100/include/linux/sysctl.h linux-2.4.25-web100-cubic/include/linux/sysctl.h --- linux-2.4.25-web100/include/linux/sysctl.h 2004-07-02 14:38:30.000000000 -0500 +++ linux-2.4.25-web100-cubic/include/linux/sysctl.h 2004-06-30 22:58:27.000000000 -0500 @@ -325,7 +325,18 @@ enum NET_IPV4_WAD_MAX_BURST, NET_IPV4_WAD_FLOYD_AIMD, NET_IPV4_WEB100_SCALABLE_TCP, + NET_IPV4_BICTCP, + NET_IPV4_BICTCP_1024TIMES_BETA, + NET_IPV4_BICTCP_MAX_INCREMENT, + NET_IPV4_BICTCP_1024TIMES_CUBIC_SCALE, + NET_IPV4_BICTCP_1024TIMES_LOW_UTILIZATION_THRESHOLD, + NET_IPV4_BICTCP_LOW_UTILIZATION_CHECKING_PERIOD, + NET_IPV4_BICTCP_SSTHRESH, + NET_IPV4_BICTCP_FAST_CONVERGENCE, + NET_IPV4_BICTCP_TCP_FRIENDLINESS, #endif + NET_IPV4_BICTCP_THROTTLE, + NET_IPV4_BICTCP_MODERATION, #ifdef CONFIG_WEB100_STATS NET_IPV4_WEB100_FPERMS, NET_IPV4_WEB100_GID, diff -Nuarp linux-2.4.25-web100/include/net/sock.h linux-2.4.25-web100-cubic/include/net/sock.h --- linux-2.4.25-web100/include/net/sock.h 2004-07-02 14:38:30.000000000 -0500 +++ linux-2.4.25-web100-cubic/include/net/sock.h 2004-07-01 03:49:30.000000000 -0500 @@ -258,6 +258,10 @@ struct tcp_sack_block { __u32 end_seq; }; +extern int sysctl_bictcp_throttle; /*disable throttle action*/ +#define bictcp_HZ (10) /*BIC HZ 2^10=1024*/ + + struct tcp_opt { int tcp_header_len; /* Bytes of tcp header to send */ @@ -339,6 +343,46 @@ struct tcp_opt { __u32 snd_cwnd_used; __u32 
snd_cwnd_stamp; + /*bictcp Parameters*/ + __u32 bictcp_cnt; /* increase cwnd by 1 after this number of ACKs */ + __u32 bictcp_last_max_cwnd; /* Wmax used by CUBIC */ + __u32 bictcp_last_loss_cwnd; /* cwnd at the last loss event */ + __u32 bictcp_last_cwnd; /* last cwnd when we called bic_update */ + __u32 bictcp_last_time; /* last time when we called bic_update */ + + __u32 bictcp_cubic_origin_point; /* origin point of cubic function*/ + __u32 bictcp_cubic_K; /* time to origin point from the beginning of the current epoch */ + + __u32 bictcp_delay_min; /* min delay */ + __u32 bictcp_delay_max; /* max delay */ + __u32 bictcp_last_delay; /* last delay sample */ + __u32 bictcp_low_utilization_indication; /* 0: high; 1: low */ + __u32 bictcp_low_utilization_start; /* starting time of low utilization detection*/ + + __u32 bictcp_epoch_start; /* beginning of an epoch */ + __u32 bictcp_tcp_8times_scale; /* used to estimate tcp's increment per rtt */ + __u32 bictcp_ack_cnt; /* number of acks */ + __u32 bictcp_tcp_cwnd; /* estimated tcp cwnd */ + + __u32 bictcp_delayed_ack; /* estimate the ratio of Packets/ACKs */ + __u32 bictcp_max_packets_in_flight; /* for burst moderation */ + + /* from STCP, retrans queue hinting */ + struct sk_buff* mark_head_lost_skb_hint; + int mark_head_lost_cnt_hint; + + struct sk_buff* update_scoreboard_skb_hint; + + struct sk_buff* xmit_retransmit_queue_lost_skb_hint; + int xmit_retransmit_queue_lost_cnt_hint; + struct sk_buff* xmit_retransmit_queue_forward_skb_hint; + int xmit_retransmit_queue_forward_cnt_hint; + + /* from STCP, SACK fastpath */ + struct tcp_sack_block recv_sack_cache[4]; + int sackfastpath_facket_cnt_hint; + struct sk_buff* sackfastpath_skb_hint; + /* Two commonly used timers in both sender and receiver paths. 
*/ unsigned long timeout; struct timer_list retransmit_timer; /* Resend (no ack) */ diff -Nuarp linux-2.4.25-web100/include/net/tcp.h linux-2.4.25-web100-cubic/include/net/tcp.h --- linux-2.4.25-web100/include/net/tcp.h 2004-07-02 14:38:30.000000000 -0500 +++ linux-2.4.25-web100-cubic/include/net/tcp.h 2004-07-02 14:49:45.000000000 -0500 @@ -476,7 +476,19 @@ extern int sysctl_WAD_IFQ; extern int sysctl_WAD_MaxBurst; extern int sysctl_WAD_FloydAIMD; extern int sysctl_web100_scalable_tcp; +/*bictcp Variables*/ +extern int sysctl_bictcp; +extern int sysctl_bictcp_1024times_beta; +extern int sysctl_bictcp_max_increment; +extern int sysctl_bictcp_1024times_cubic_scale; +extern int sysctl_bictcp_1024times_low_utilization_threshold; +extern int sysctl_bictcp_low_utilization_checking_period; +extern int sysctl_bictcp_ssthresh; +extern int sysctl_bictcp_fast_convergence; +extern int sysctl_bictcp_tcp_friendliness; #endif +extern int sysctl_bictcp_moderation; + #ifdef CONFIG_WEB100_STATS extern int sysctl_web100_sbufmode; extern int sysctl_web100_rbufmode; @@ -1075,6 +1087,12 @@ struct tcp_skb_cb { (skb != (struct sk_buff *)&(sk)->write_queue); \ skb=skb->next) +/*from STCP for fast SACK Process*/ +#define for_retrans_queue_from(skb, skb_init, sk, tp) \ + for (skb = (struct sk_buff *)(skb_init); \ + (skb != (tp)->send_head) && \ + (skb != (struct sk_buff *)&(sk)->write_queue); \ + skb=skb->next) #include @@ -1127,6 +1145,22 @@ static inline __u32 tcp_recalc_ssthresh( __u32 tmp = tp->snd_cwnd - ((tp->snd_cwnd*NET100_WAD(tp, WAD_MD,128))>>8); /* 8b fraction*/ return max(tmp, 2U); } + if (sysctl_bictcp) { + tp->bictcp_epoch_start = 0; + + if ((tp->bictcp_delay_min > 0) && (tp->bictcp_delay_max > tp->bictcp_delay_min)) /* in case of wrong delay_max*/ + tp->bictcp_delay_max = tp->bictcp_delay_min + ((tp->bictcp_delay_max - tp->bictcp_delay_min)* 90) / 100; + + if ((tp->snd_cwnd < tp->bictcp_last_max_cwnd) && (sysctl_bictcp_fast_convergence)) + tp->bictcp_last_max_cwnd = 
(tp->snd_cwnd * (1024+sysctl_bictcp_1024times_beta))>>11; + else + tp->bictcp_last_max_cwnd = tp->snd_cwnd; + + tp->bictcp_last_loss_cwnd = tp->snd_cwnd; + + return max((tp->snd_cwnd*sysctl_bictcp_1024times_beta)>>10, 2U); + } + /* otherwise do standard stuff ... */ #endif return max(tp->snd_cwnd >> 1U, 2U); @@ -1178,8 +1212,12 @@ static inline void __tcp_enter_cwr(struc { tp->undo_marker = 0; tp->snd_ssthresh = tcp_recalc_ssthresh(tp); - tp->snd_cwnd = min(tp->snd_cwnd, - tcp_packets_in_flight(tp) + 1U); + if (sysctl_bictcp_moderation){ + __u32 cap; + cap = max(tp->snd_ssthresh, tcp_packets_in_flight(tp) + 1U); + tp->snd_cwnd = min(tp->snd_cwnd, cap); + } else + tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1U); tp->snd_cwnd_cnt = 0; tp->high_seq = tp->snd_nxt; tp->snd_cwnd_stamp = tcp_time_stamp; @@ -1273,6 +1311,19 @@ static __inline__ int tcp_snd_wait(struc if ((tcp_packets_in_flight(tp) >= tp->snd_cwnd) && !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) return WC_SNDLIM_CWND; + + if (sysctl_bictcp_moderation && tp->bictcp_max_packets_in_flight){ /* burst moderation*/ + __u32 cap; + if (tp->ca_state == TCP_CA_Recovery) + cap = tp->bictcp_max_packets_in_flight; /* in recovery */ + else + cap = tp->bictcp_max_packets_in_flight + tcp_max_burst(tp) + (tp->snd_cwnd>>7); /* in other states*/ + + if ((tcp_packets_in_flight(tp) >= cap) && + !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) + return WC_SNDLIM_CWND; + } + if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd)) return WC_SNDLIM_RWIN; if (!(nonagle == 1 || tp->urg_mode || @@ -1960,4 +2011,41 @@ static inline void tcp_mib_init(void) TCP_ADD_STATS_USER(TcpMaxConn, -1); } +static inline void bictcp_init(struct tcp_opt *tp) +{ + tp->bictcp_cnt = 0; + + tp->bictcp_last_max_cwnd = 0; + tp->bictcp_last_loss_cwnd = 0; + tp->bictcp_last_cwnd = 0; + tp->bictcp_last_time = 0; + + tp->bictcp_cubic_origin_point = 0; + tp->bictcp_cubic_K = 0; + + tp->bictcp_delay_min = 0; + tp->bictcp_delay_max = 0; + 
tp->bictcp_last_delay = 0; + tp->bictcp_low_utilization_indication = 0; + tp->bictcp_low_utilization_start = 0; + + tp->bictcp_delayed_ack = 2*1024; /* Linux default */ + tp->bictcp_epoch_start = 0; + + tp->bictcp_ack_cnt = 0; + tp->bictcp_tcp_cwnd = 0; + tp->bictcp_tcp_8times_scale = 8*(1024+sysctl_bictcp_1024times_beta)/3/(1024-sysctl_bictcp_1024times_beta); + /* increment per RTT is 3*(1-beta)/(1+beta)*/ +} + +/*from STCP */ +static inline void clear_all_retrans_hints(struct tcp_opt *tp){ + tp->mark_head_lost_skb_hint = NULL; + tp->update_scoreboard_skb_hint = NULL; + tp->xmit_retransmit_queue_lost_skb_hint = NULL; + tp->xmit_retransmit_queue_forward_skb_hint = NULL; + tp->sackfastpath_skb_hint = NULL; +} + + #endif /* _TCP_H */ diff -Nuarp linux-2.4.25-web100/Makefile linux-2.4.25-web100-cubic/Makefile --- linux-2.4.25-web100/Makefile 2004-07-02 14:38:30.000000000 -0500 +++ linux-2.4.25-web100-cubic/Makefile 2004-06-26 23:49:39.000000000 -0500 @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 4 SUBLEVEL = 25 -EXTRAVERSION = -web100 +EXTRAVERSION = -web100-cubic KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION) diff -Nuarp linux-2.4.25-web100/net/core/dev.c linux-2.4.25-web100-cubic/net/core/dev.c --- linux-2.4.25-web100/net/core/dev.c 2004-02-18 07:36:32.000000000 -0600 +++ linux-2.4.25-web100-cubic/net/core/dev.c 2004-06-16 15:52:49.000000000 -0500 @@ -1291,8 +1291,12 @@ int netif_rx(struct sk_buff *skb) netdev_rx_stat[this_cpu].total++; if (queue->input_pkt_queue.qlen <= netdev_max_backlog) { if (queue->input_pkt_queue.qlen) { - if (queue->throttle) - goto drop; + + //Enable or Disable Throttle Action + if (!sysctl_bictcp_throttle){ + if (queue->throttle) + goto drop; + } enqueue: dev_hold(skb->dev); diff -Nuarp linux-2.4.25-web100/net/ipv4/sysctl_net_ipv4.c linux-2.4.25-web100-cubic/net/ipv4/sysctl_net_ipv4.c --- linux-2.4.25-web100/net/ipv4/sysctl_net_ipv4.c 2004-07-02 14:38:30.000000000 -0500 +++ linux-2.4.25-web100-cubic/net/ipv4/sysctl_net_ipv4.c 
2004-06-30 11:40:13.000000000 -0500 @@ -252,7 +252,30 @@ ctl_table ipv4_table[] = { &sysctl_WAD_FloydAIMD, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_IPV4_WEB100_SCALABLE_TCP, "web100_scalable_tcp", &sysctl_web100_scalable_tcp, sizeof (int), 0644, NULL, &proc_dointvec}, + /*BICTCP sysctl variables*/ + {NET_IPV4_BICTCP, "bictcp", + &sysctl_bictcp, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IPV4_BICTCP_1024TIMES_BETA, "bictcp_1024times_beta", + &sysctl_bictcp_1024times_beta, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IPV4_BICTCP_MAX_INCREMENT, "bictcp_max_increment", + &sysctl_bictcp_max_increment, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IPV4_BICTCP_1024TIMES_CUBIC_SCALE, "bictcp_1024times_cubic_scale", + &sysctl_bictcp_1024times_cubic_scale, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IPV4_BICTCP_1024TIMES_LOW_UTILIZATION_THRESHOLD, "bictcp_1024times_low_utilization_threshold", + &sysctl_bictcp_1024times_low_utilization_threshold, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IPV4_BICTCP_LOW_UTILIZATION_CHECKING_PERIOD, "bictcp_low_utilization_checking_period", + &sysctl_bictcp_low_utilization_checking_period, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IPV4_BICTCP_SSTHRESH, "bictcp_ssthresh", + &sysctl_bictcp_ssthresh, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IPV4_BICTCP_FAST_CONVERGENCE, "bictcp_fast_convergence", + &sysctl_bictcp_fast_convergence, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IPV4_BICTCP_TCP_FRIENDLINESS, "bictcp_tcp_friendliness", + &sysctl_bictcp_tcp_friendliness, sizeof(int), 0644, NULL, &proc_dointvec}, #endif + {NET_IPV4_BICTCP_THROTTLE, "bictcp_throttle", + &sysctl_bictcp_throttle, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IPV4_BICTCP_MODERATION, "bictcp_moderation", + &sysctl_bictcp_moderation, sizeof(int), 0644, NULL, &proc_dointvec}, #ifdef CONFIG_WEB100_STATS {NET_IPV4_WEB100_FPERMS, "web100_fperms", &sysctl_web100_fperms, sizeof(int), 0644, NULL, &web100_proc_dointvec_update}, 
diff -Nuarp linux-2.4.25-web100/net/ipv4/tcp.c linux-2.4.25-web100-cubic/net/ipv4/tcp.c --- linux-2.4.25-web100/net/ipv4/tcp.c 2004-07-02 14:38:30.000000000 -0500 +++ linux-2.4.25-web100-cubic/net/ipv4/tcp.c 2004-07-02 10:12:11.000000000 -0500 @@ -286,11 +286,24 @@ int sysctl_web100_rcvbuf_emu = 1; #endif #ifdef CONFIG_WEB100_NET100 int sysctl_web100_no_metrics_save = 0; -int sysctl_WAD_IFQ = 0; +int sysctl_WAD_IFQ = 1; int sysctl_WAD_MaxBurst = 3; int sysctl_WAD_FloydAIMD = 0; int sysctl_web100_scalable_tcp = 0; +/*BICTCP Variables*/ +int sysctl_bictcp = 1; +int sysctl_bictcp_1024times_beta = 819; /* so beta = 819/1024 = 0.8*/ +int sysctl_bictcp_max_increment = 16; /* Smax = 16 */ +int sysctl_bictcp_1024times_cubic_scale = 41; /* so 41/1024 = 0.04 */ +int sysctl_bictcp_1024times_low_utilization_threshold = 153; /* threshold = 153/1024 = 15% */ +int sysctl_bictcp_low_utilization_checking_period = 2; /* delay < threshold for 2 second, then low util*/ +int sysctl_bictcp_ssthresh = 100; /* after 100, go to congestion avoidance */ +int sysctl_bictcp_fast_convergence = 1; /* turn on fast convergence */ +int sysctl_bictcp_tcp_friendliness = 1; /* be tcp friendly */ #endif +int sysctl_bictcp_throttle = 1; /* accept new packets as long as there are rooms in the backlog*/ +int sysctl_bictcp_moderation = 1; /* a new burst moderation*/ + #ifdef CONFIG_WEB100_STATS int sysctl_web100_fperms = CONFIG_WEB100_FPERMS; int sysctl_web100_gid = CONFIG_WEB100_GID; diff -Nuarp linux-2.4.25-web100/net/ipv4/tcp_input.c linux-2.4.25-web100-cubic/net/ipv4/tcp_input.c --- linux-2.4.25-web100/net/ipv4/tcp_input.c 2004-07-02 14:38:30.000000000 -0500 +++ linux-2.4.25-web100-cubic/net/ipv4/tcp_input.c 2004-07-02 14:51:04.000000000 -0500 @@ -881,6 +881,7 @@ tcp_sacktag_write_queue(struct sock *sk, int prior_fackets; u32 lost_retrans = 0; int flag = 0; + int dup_sack = 0; int i; WEB100_VAR_INC(tp, SACKsRcvd); @@ -890,12 +891,25 @@ tcp_sacktag_write_queue(struct sock *sk, tp->fackets_out = 0; 
prior_fackets = tp->fackets_out; - for (i=0; istart_seq); - __u32 end_seq = ntohl(sp->end_seq); - int fack_count = 0; - int dup_sack = 0; + /* SACK fastpath: + * if the only SACK change is the increase of the end_seq of + * the first block then only apply that SACK block + * and use retrans queue hinting otherwise slowpath */ + flag = 1; + for ( i=0; irecv_sack_cache[i].start_seq != start_seq) + flag = 0; + }else { + if ((tp->recv_sack_cache[i].start_seq != start_seq) || + (tp->recv_sack_cache[i].end_seq != end_seq)) + flag = 0; + } + tp->recv_sack_cache[i].start_seq = start_seq; + tp->recv_sack_cache[i].end_seq = end_seq; /* Check for D-SACK. */ if (i == 0) { @@ -930,15 +944,58 @@ tcp_sacktag_write_queue(struct sock *sk, if (before(ack, prior_snd_una-tp->max_window)) return 0; } + } + + if(flag) { + num_sacks=1; + }else{ + int j; + tp->sackfastpath_skb_hint = NULL; + + /* order SACK blocks to allow in order walk of the retrans queue */ + for(i=num_sacks-1; i > 0; i--){ + for(j=0; jrecv_sack_cache[j+1].start_seq); + sp[j].end_seq = htonl(tp->recv_sack_cache[j+1].end_seq); + sp[j+1].start_seq = htonl(tp->recv_sack_cache[j].start_seq); + sp[j+1].end_seq = htonl(tp->recv_sack_cache[j].end_seq); + } + + } + } + } + + /* clear flag as used for different purpose in following code */ + flag = 0; + + for (i=0; istart_seq); + __u32 end_seq = ntohl(sp->end_seq); + int fack_count; + + /* Use SACK fastpath hint if valid */ + if( tp->sackfastpath_skb_hint != NULL){ + skb = tp->sackfastpath_skb_hint; + fack_count = tp->sackfastpath_facket_cnt_hint; + }else{ + skb = sk->write_queue.next; + fack_count = 0; + } + /* Event "B" in the comment above. 
*/ if (after(end_seq, tp->high_seq)) flag |= FLAG_DATA_LOST; - for_retrans_queue(skb, sk, tp) { + for_retrans_queue_from(skb, skb, sk, tp) { u8 sacked = TCP_SKB_CB(skb)->sacked; int in_sack; + tp->sackfastpath_skb_hint = skb; + tp->sackfastpath_facket_cnt_hint = fack_count; + /* The retransmission queue is always in order, so * we can short-circuit the walk early. */ @@ -991,8 +1048,11 @@ tcp_sacktag_write_queue(struct sock *sk, TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); tp->lost_out--; tp->retrans_out--; - } - } else { + + /* clear lost hint */ + tp->xmit_retransmit_queue_lost_skb_hint = NULL; + } + } else { /* New sack for not retransmitted frame, * which was in hole. It is reordering. */ @@ -1003,6 +1063,9 @@ tcp_sacktag_write_queue(struct sock *sk, if (sacked & TCPCB_LOST) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; tp->lost_out--; + + /* clear lost hint */ + tp->xmit_retransmit_queue_lost_skb_hint = NULL; } } @@ -1026,6 +1089,7 @@ tcp_sacktag_write_queue(struct sock *sk, (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS)) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out--; + tp->xmit_retransmit_queue_lost_skb_hint = NULL; } } } @@ -1051,6 +1115,9 @@ tcp_sacktag_write_queue(struct sock *sk, TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out--; + /* clear lost hint */ + tp->xmit_retransmit_queue_lost_skb_hint = NULL; + if (!(TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_SACKED_ACKED))) { tp->lost_out++; TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; @@ -1156,6 +1223,11 @@ void tcp_enter_frto_loss(struct sock *sk tp->ca_state = TCP_CA_Loss; tp->high_seq = tp->frto_highmark; TCP_ECN_queue_cwr(tp); + +#ifdef CONFIG_WEB100_NET100 + bictcp_init(tp); +#endif + clear_all_retrans_hints(tp); } void tcp_clear_retrans(struct tcp_opt *tp) @@ -1190,6 +1262,7 @@ void tcp_enter_loss(struct sock *sk, int tp->prior_ssthresh = tcp_current_ssthresh(tp); tp->snd_ssthresh = tcp_recalc_ssthresh(tp); } + tp->snd_cwnd = 1; tp->snd_cwnd_cnt = 0; 
tp->snd_cwnd_stamp = tcp_time_stamp; @@ -1222,6 +1295,11 @@ void tcp_enter_loss(struct sock *sk, int tp->ca_state = TCP_CA_Loss; tp->high_seq = tp->snd_nxt; TCP_ECN_queue_cwr(tp); + +#ifdef CONFIG_WEB100_NET100 + bictcp_init(tp); +#endif + clear_all_retrans_hints(tp); } static int tcp_check_sack_reneging(struct sock *sk, struct tcp_opt *tp) @@ -1237,8 +1315,7 @@ static int tcp_check_sack_reneging(struc if ((skb = skb_peek(&sk->write_queue)) != NULL && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { NET_INC_STATS_BH(TCPSACKReneging); - - tcp_enter_loss(sk, 1); + tcp_enter_loss(sk, 1); tp->retransmits++; tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)); tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); @@ -1438,19 +1515,40 @@ static inline void tcp_reset_reno_sack(s static void tcp_mark_head_lost(struct sock *sk, struct tcp_opt *tp, int packets, u32 high_seq) { - struct sk_buff *skb; - int cnt = packets; + struct sk_buff *skb; + int cnt; - BUG_TRAP(cnt <= tp->packets_out); + BUG_TRAP(packets <= tp->packets_out); + + if ( tp->mark_head_lost_skb_hint != NULL ) { + skb = tp->mark_head_lost_skb_hint; + cnt = tp->mark_head_lost_cnt_hint; + }else{ + skb = sk->write_queue.next; + cnt = 0; + } + + for_retrans_queue_from(skb, skb, sk, tp) { + /* TODO: do this better */ + /* this is not the most efficient way to do this... 
*/ + tp->mark_head_lost_skb_hint = skb; + tp->mark_head_lost_cnt_hint = cnt; + if (++cnt > packets || after(TCP_SKB_CB(skb)->end_seq, high_seq)) + break; + if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { + TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; + tp->lost_out++; + + /* clear xmit_retransmit_queue hints + * if this is beyond hint */ + if(tp->xmit_retransmit_queue_lost_skb_hint != NULL && + before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->xmit_retransmit_queue_lost_skb_hint)->seq) ){ + + tp->xmit_retransmit_queue_lost_skb_hint = NULL; + } + } + } - for_retrans_queue(skb, sk, tp) { - if (--cnt < 0 || after(TCP_SKB_CB(skb)->end_seq, high_seq)) - break; - if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { - TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; - tp->lost_out++; - } - } tcp_sync_left_out(tp); } @@ -1475,13 +1573,32 @@ static void tcp_update_scoreboard(struct if (tcp_head_timedout(sk, tp)) { struct sk_buff *skb; - for_retrans_queue(skb, sk, tp) { - if (tcp_skb_timedout(tp, skb) && - !(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { - TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; - tp->lost_out++; - } - } + if( tp->update_scoreboard_skb_hint != NULL ) { + skb = tp->update_scoreboard_skb_hint; + }else{ + skb = sk->write_queue.next; + } + + for_retrans_queue_from(skb,skb, sk, tp) { + if(tcp_skb_timedout(tp, skb)){ + if(!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)){ + TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; + tp->lost_out++; + /* clear xmit_retrans hint */ + if(tp->xmit_retransmit_queue_lost_skb_hint != NULL && + before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->xmit_retransmit_queue_lost_skb_hint)->seq) ){ + + tp->xmit_retransmit_queue_lost_skb_hint = NULL; + } + } + }else{ + break; + } + + } + + tp->update_scoreboard_skb_hint = skb; + tcp_sync_left_out(tp); } } @@ -1491,6 +1608,11 @@ static void tcp_update_scoreboard(struct */ static __inline__ void tcp_moderate_cwnd(struct tcp_opt *tp) { + if (sysctl_bictcp_moderation) { + tp->snd_cwnd_stamp = tcp_time_stamp; + return; + } + #ifdef CONFIG_WEB100_STATS 
{ u32 t = tcp_packets_in_flight(tp) + tcp_max_burst(tp); @@ -1516,10 +1638,21 @@ static void tcp_cwnd_down(struct tcp_opt tp->snd_cwnd_cnt = decr&1; decr >>= 1; +#ifdef CONFIG_WEB100_NET100 + if (decr && tp->snd_cwnd > tp->snd_ssthresh) + tp->snd_cwnd -= decr; +#else if (decr && tp->snd_cwnd > tp->snd_ssthresh/2) tp->snd_cwnd -= decr; +#endif - tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); + if (sysctl_bictcp_moderation){ + __u32 cap; + cap = max(tp->snd_ssthresh, tcp_packets_in_flight(tp)+1); + tp->snd_cwnd = min(tp->snd_cwnd, cap); + } else + tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); + tp->snd_cwnd_stamp = tcp_time_stamp; } @@ -1551,10 +1684,16 @@ static void DBGUNDO(struct sock *sk, str static void tcp_undo_cwr(struct tcp_opt *tp, int undo) { if (tp->prior_ssthresh) { - tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1); + +#ifdef CONFIG_WEB100_NET100 + if (sysctl_bictcp) + tp->snd_cwnd = max(tp->snd_cwnd, tp->bictcp_last_max_cwnd); + else +#endif + tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1); if (undo && tp->prior_ssthresh > tp->snd_ssthresh) { - tp->snd_ssthresh = tp->prior_ssthresh; + tp->snd_ssthresh = tp->prior_ssthresh; TCP_ECN_withdraw_cwr(tp); } WEB100_VAR_INC(tp, CongestionOverCount); @@ -1563,6 +1702,10 @@ static void tcp_undo_cwr(struct tcp_opt } tcp_moderate_cwnd(tp); tp->snd_cwnd_stamp = tcp_time_stamp; + + /* There is something screwy going on with the retrans hints after + an undo */ + clear_all_retrans_hints(tp); } static inline int tcp_may_undo(struct tcp_opt *tp) @@ -1645,6 +1788,8 @@ static int tcp_try_undo_loss(struct sock for_retrans_queue(skb, sk, tp) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; } + clear_all_retrans_hints(tp); + DBGUNDO(sk, tp, "partial loss"); tp->lost_out = 0; tp->left_out = tp->sacked_out; @@ -1921,6 +2066,274 @@ tcp_ack_update_rtt(struct sock *sk, int tcp_ack_no_tstamp(sk, seq_rtt, flag); } + +/* 65536 times the cubic root of 0, 1, 2, 3, 4, 5, 6, 7*/ +static __u64 
bictcp_table[8] = {0, 65536, 82570, 94519, 104030, 112063, 119087, 125367}; + +/* calculate the cubic root of x + the basic idea is that x can be expressed as i*8^j + so cubic_root(x) = cubic_root(i)*2^j + in the following code, x is i, and y is 2^j + because of integer calculation, there are errors in calculation + so finally use binary search to find out the exact solution*/ +static __u32 bictcp_cubic_root(__u64 x) +{ + __u64 y, app, target, start, end, mid, start_diff, end_diff; + + if (x == 0) + return 0; + + target = x; + + /*first estimate lower and upper bound*/ + y = 1; + while (x >= 8){ + x = (x >> 3); + y = (y << 1); + } + start = (y*bictcp_table[x])>>16; + if (x==7) + end = (y<<1); + else + end = (y*bictcp_table[x+1]+65535)>>16; + + /*binary search for more accurate one*/ + while (start < end-1) { + mid = (start+end) >> 1; + app = mid*mid*mid; + if (app < target) + start = mid; + else if (app > target) + end = mid; + else + return mid; + } + + /*find the most accurate one from start and end*/ + app = start*start*start; + if (app < target) + start_diff = target - app; + else + start_diff = app - target; + app = end*end*end; + if (app < target) + end_diff = target - app; + else + end_diff = app - target; + + if (start_diff < end_diff) + return (__u32)start; + else + return (__u32)end; +} + +static __u32 bictcp_K(__u32 dist, __u32 srtt) +{ + __u64 d64; + __u32 d32; + __u32 count; + __u32 result; + + /* calculate the "K" for (wmax-cwnd) = c/rtt * K^3 + so K = cubic_root( (wmax-cwnd)*rtt/c ) + the unit of K is bictcp_HZ=2^10, not HZ + + c = sysctl_bictcp_1024times_cubic_scale >> 10 + rtt = (tp->srtt >> 3 ) / HZ + + the following code has been designed and tested for + cwnd < 1 million packets + RTT < 100 seconds + HZ < 1,000,00 (corresponding to 10 nano-second) + + */ + + /* 1/c * 2^(2*bictcp_HZ); the scale is a writable sysctl, so clamp it to >= 1 to avoid dividing by zero */ + d32 = (1 << (10+2*bictcp_HZ)) / max(sysctl_bictcp_1024times_cubic_scale, 1); + d64 = (__u64)d32; + + /* srtt * 2^count / HZ + 1) to get a better accuracy of the following
d32, + the larger the "count", the better the accuracy + 2) and avoid overflow of the following d64 + the larger the "count", the high possibility of oveflow + 3) so find a "count" between bictcp_hz-3 and bictcp_hz + "count" may be less than bictcp_HZ, + then d64 becomes 0. that is OK + */ + d32 = srtt; + count = 0; + while (((d32 & 0x80000000)==0) && (count < bictcp_HZ)){ + d32 = d32 << 1; + count++; + } + d32 = d32 / HZ; + + /* (wmax-cwnd) * (srtt>>3 / HZ) / c * 2^(3*bictcp_HZ) */ + d64 = (d64 * dist * d32) >> (count+3-bictcp_HZ); + + /* cubic root */ + d64 = bictcp_cubic_root(d64); + + result = (__u32)d64; + return result; +} + +/*update bictcp_cnt*/ +static void bictcp_update(struct tcp_opt *tp) +{ + __u64 d64; + __u32 d32, t, srtt, cubic_target, cubic_increment, min_cnt; + + tp->bictcp_ack_cnt++; /* count the number of ACKs */ + + /* since this function is called for every ack, so it should be very fast + therefore, we do not update tp->bictcp_cnt for every ack */ + if ((tp->bictcp_last_cwnd == tp->snd_cwnd) && ((tcp_time_stamp-tp->bictcp_last_time)<=(HZ>>5)) ) + return; /* same cwnd and time interval less than 1/32 s, then no update*/ + tp->bictcp_last_cwnd = tp->snd_cwnd; + tp->bictcp_last_time = tcp_time_stamp; + + /* In case that we still do not have RTT samples */ + //if (tp->bictcp_delay_min == 0) + // srtt = 1; + //else + // srtt = tp->bictcp_delay_min; + srtt = (HZ <<3) / 10 ; /* use real time-based growth function */ + + if (tp->bictcp_epoch_start == 0) { + tp->bictcp_epoch_start = tcp_time_stamp; /* the beginning of an epoch */ + tp->bictcp_ack_cnt = 1; /* start counting */ + tp->bictcp_tcp_cwnd = tp->snd_cwnd; /* syn with cubic */ + + if (tp->bictcp_last_max_cwnd <= tp->snd_cwnd) { /* this usually happens in slow start*/ + tp->bictcp_cubic_K = 0; + tp->bictcp_cubic_origin_point = tp->snd_cwnd; + } else { + tp->bictcp_cubic_K = bictcp_K(tp->bictcp_last_max_cwnd - tp->snd_cwnd, srtt); + tp->bictcp_cubic_origin_point = tp->bictcp_last_max_cwnd; + } + } 
+ + /* cubic function - calc*/ + /* calculate c * time^3 / rtt, + while considering overflow in calculation of time^3 (so time^3 is done by using d64) + and without the support of division of 64bit numbers (so all divisions are done by using d32) + also NOTE the units of those variables + time = (t - K) / 2^bictcp_HZ + c = sysctl_bictcp_1024times_cubic_scale >> 10 + rtt = (srtt >> 3) / HZ + !!! The following code does not have overflow problems, if the cwnd < 1 million packets !!! + */ + t = ((tcp_time_stamp+(tp->bictcp_delay_min>>3)-tp->bictcp_epoch_start) << bictcp_HZ) / HZ; /* change the unit from HZ to bictcp_HZ*/ + if (t < tp->bictcp_cubic_K) /* t - K */ + d32 = tp->bictcp_cubic_K - t ; + else + d32 = t - tp->bictcp_cubic_K ; + d64 = (__u64)d32; + d32 = (sysctl_bictcp_1024times_cubic_scale << 3) * HZ / srtt; /* 1024*c/rtt */ + d64 = (d32 * d64 * d64 * d64) >> (10+3*bictcp_HZ); /* c/rtt * (t-K)^3 */ + d32 = (__u32)d64; + if (t < tp->bictcp_cubic_K) /* below origin*/ + cubic_target = tp->bictcp_cubic_origin_point - d32; + else /* above origin*/ + cubic_target = tp->bictcp_cubic_origin_point + d32; + + /* cubic function - calc bictcp_cnt*/ + if (cubic_target > tp->snd_cwnd) { + cubic_increment = cubic_target - tp->snd_cwnd; + tp->bictcp_cnt = tp->snd_cwnd / cubic_increment; + } else { + tp->bictcp_cnt = 100*tp->snd_cwnd; /* very small increment*/ + } + + /* max increment =Smax * rtt / 0.1 */ + if ((tp->bictcp_last_loss_cwnd == 0) || /* could be aggressive in slow start */ + (tp->bictcp_delay_min == 0) || (sysctl_bictcp_max_increment <= 0) || /* no delay sample yet or unusable Smax: the Smax divisor below would be zero */ ((tp->snd_cwnd > tp->bictcp_last_loss_cwnd) && (tp->bictcp_low_utilization_indication == 1))) /* could be aggressive in low utilization */ + min_cnt = 20; /* 5% of cwnd */ + else + min_cnt = (tp->snd_cwnd*HZ*8)/(10*sysctl_bictcp_max_increment*tp->bictcp_delay_min); /* regular Smax */ + if (tp->bictcp_cnt < min_cnt) + tp->bictcp_cnt = min_cnt; + + /* TCP Friendly */ + if (sysctl_bictcp_tcp_friendliness) { + d32 = (tp->snd_cwnd*tp->bictcp_tcp_8times_scale) >> 3; /* ACKs per +1 of the emulated TCP window; may round down to 0 */ + while ((d32 > 0) && (tp->bictcp_ack_cnt > d32)) { /* update
tcp cwnd */ + tp->bictcp_ack_cnt -= d32; + tp->bictcp_tcp_cwnd++; + } + if (tp->bictcp_tcp_cwnd > tp->snd_cwnd){ /* if cubic is slower than tcp */ + d32 = tp->bictcp_tcp_cwnd - tp->snd_cwnd; + tp->bictcp_cnt = tp->snd_cwnd / d32; + } + } + + tp->bictcp_cnt = (tp->bictcp_cnt << 10)/tp->bictcp_delayed_ack; /*for linux delayed-ack*/ + + if (tp->bictcp_cnt == 0) /* cannot be zero */ + tp->bictcp_cnt = 1; +} + +/*Detect low utilization in congestion avoidance*/ +static __inline__ void bictcp_low_utilization(struct tcp_opt *tp, int flag) +{ + __u32 dist, delay; + + if (!(tp->saw_tstamp && tp->rcv_tsecr) || /* No time stamp */ + (tcp_time_stamp < tp->bictcp_epoch_start+HZ) || /* Discard delay samples right after fast recovery*/ + (flag == 0)) { /* this delay samples may not be accurate*/ + tp->bictcp_low_utilization_indication = 0; + tp->bictcp_low_utilization_start = 0; + tp->bictcp_last_delay = 0; + return; + } + + delay = (tp->bictcp_last_delay<<3); /* use the same scale as tp->srtt*/ + tp->bictcp_last_delay = tcp_time_stamp - tp->rcv_tsecr; + if (delay == 0) { /* no previous delay sample */ + tp->bictcp_low_utilization_indication = 0; + tp->bictcp_low_utilization_start = 0; + return; + } + + //first time call + if (tp->bictcp_delay_min ==0 ) { + tp->bictcp_delay_min = delay; + tp->bictcp_delay_max = delay; + tp->bictcp_low_utilization_indication = 0; + tp->bictcp_low_utilization_start = 0; + return; + } + + //update global min delay + if (tp->bictcp_delay_min > delay){ /* in case that link delay decreases */ + tp->bictcp_delay_min = delay; + tp->bictcp_delay_max = delay; + tp->bictcp_low_utilization_indication = 0; + tp->bictcp_low_utilization_start = 0; + } + + //update global max delay + if (tp->bictcp_delay_max < delay) + tp->bictcp_delay_max = delay; + + //utilization is low, if avg delay < dist*threshold for checking_period time + dist = tp->bictcp_delay_max - tp->bictcp_delay_min; + if ((dist <= (tp->bictcp_delay_min>>6)) || + ((tp->srtt - tp->bictcp_delay_min) 
>= ((dist*sysctl_bictcp_1024times_low_utilization_threshold)>>10)) ){ + tp->bictcp_low_utilization_indication = 0; + tp->bictcp_low_utilization_start = 0; + }else{ + if (tp->bictcp_low_utilization_start == 0){ + tp->bictcp_low_utilization_indication = 0; + tp->bictcp_low_utilization_start = tcp_time_stamp; + }else if ((tcp_time_stamp - tp->bictcp_low_utilization_start) > (sysctl_bictcp_low_utilization_checking_period*HZ) ) + tp->bictcp_low_utilization_indication = 1; + } + +} + /* This is Jacobson's slow start and congestion avoidance. * SIGCOMM '88, p. 328. */ @@ -1980,15 +2393,27 @@ static __inline__ void tcp_cong_avoid(st tp->snd_cwnd++; tp->snd_cwnd_cnt = 0; } + if (sysctl_bictcp) { + bictcp_update(tp); + if (tp->snd_cwnd_cnt > (tp->bictcp_cnt<<3)) { + tp->snd_cwnd_cnt = 0; + tp->snd_cwnd++; + } + } else { + while (tp->snd_cwnd_cnt > tp->snd_cwnd<<3) { + tp->snd_cwnd_cnt -= tp->snd_cwnd<<3; + tp->snd_cwnd++; + } + } + #else - tp->snd_cwnd_cnt += 1<<3; -#endif + tp->snd_cwnd_cnt += 1<<3; + while (tp->snd_cwnd_cnt > tp->snd_cwnd<<3) { + tp->snd_cwnd_cnt -= tp->snd_cwnd<<3; + tp->snd_cwnd++; + } - while (tp->snd_cwnd_cnt > tp->snd_cwnd<<3) { - tp->snd_cwnd_cnt -= tp->snd_cwnd<<3; - tp->snd_cwnd++; - } - +#endif WEB100_VAR_INC(tp, CongAvoid); } tp->snd_cwnd = min(tp->snd_cwnd, (__u32)tp->snd_cwnd_clamp); @@ -2016,6 +2441,7 @@ static int tcp_clean_rtx_queue(struct so __u32 now = tcp_time_stamp; int acked = 0; __s32 seq_rtt = -1; + __u32 cnt = 0; while((skb=skb_peek(&sk->write_queue)) && (skb != tp->send_head)) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); @@ -2066,6 +2492,8 @@ static int tcp_clean_rtx_queue(struct so tp->packets_out--; __skb_unlink(skb, skb->list); tcp_free_skb(sk, skb); + clear_all_retrans_hints(tp); + cnt++; } if (acked&FLAG_ACKED) { @@ -2073,6 +2501,13 @@ static int tcp_clean_rtx_queue(struct so tcp_ack_packets_out(sk, tp); } +#ifdef CONFIG_WEB100_NET100 + if ((tp->ca_state == TCP_CA_Open) && (cnt>0) && (sysctl_bictcp)) { + /* new_value = 
old_value*15/16 + new_sample/16 */ + tp->bictcp_delayed_ack = ((tp->bictcp_delayed_ack*15)>>4) + (cnt<<6); + } +#endif + #if FASTRETRANS_DEBUG > 0 BUG_TRAP((int)tp->sacked_out >= 0); BUG_TRAP((int)tp->lost_out >= 0); @@ -2220,6 +2655,7 @@ static int tcp_ack(struct sock *sk, stru u32 ack = TCP_SKB_CB(skb)->ack_seq; u32 prior_in_flight; int prior_packets; + int bic_flag; /* If the ack is newer than sent or older than previous acks * then we can probably ignore it. @@ -2267,13 +2703,14 @@ static int tcp_ack(struct sock *sk, stru goto no_queue; prior_in_flight = tcp_packets_in_flight(tp); - + /* See if we can take anything off of the retransmit queue. */ flag |= tcp_clean_rtx_queue(sk); if (tp->frto_counter) tcp_process_frto(sk, prior_snd_una); + bic_flag = 0; if (tcp_ack_is_dubious(tp, flag)) { /* Advanve CWND, if state allows this. */ if ((flag&FLAG_DATA_ACKED) && prior_in_flight >= tp->snd_cwnd && @@ -2283,8 +2720,15 @@ static int tcp_ack(struct sock *sk, stru } else { if ((flag&FLAG_DATA_ACKED) && prior_in_flight >= tp->snd_cwnd) tcp_cong_avoid(tp); + if (flag & FLAG_DATA_ACKED) + bic_flag = 1; } +#ifdef CONFIG_WEB100_NET100 + if (sysctl_bictcp) + bictcp_low_utilization(tp, bic_flag); +#endif + if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP)) dst_confirm(sk->dst_cache); @@ -3299,7 +3743,12 @@ void tcp_cwnd_application_limited(struct u32 win_used = max(tp->snd_cwnd_used, 2U); if (win_used < tp->snd_cwnd) { tp->snd_ssthresh = tcp_current_ssthresh(tp); - tp->snd_cwnd = (tp->snd_cwnd+win_used)>>1; + if (sysctl_bictcp_moderation){ + __u32 cap; + cap = max(tp->snd_ssthresh, (tp->snd_cwnd+win_used)>>1); + tp->snd_cwnd = min(tp->snd_cwnd, cap); + } else + tp->snd_cwnd = (tp->snd_cwnd+win_used)>>1; WEB100_VAR_INC(tp, OtherReductions); WEB100_VAR_INC(tp, X_OtherReductionsCV); } @@ -3582,6 +4031,8 @@ int tcp_rcv_established(struct sock *sk, { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + tp->bictcp_max_packets_in_flight = 0; /* reset burst moderation count */ + /* 
* Header prediction. * The code loosely follows the one in the famous diff -Nuarp linux-2.4.25-web100/net/ipv4/tcp_ipv4.c linux-2.4.25-web100-cubic/net/ipv4/tcp_ipv4.c --- linux-2.4.25-web100/net/ipv4/tcp_ipv4.c 2004-07-02 14:38:30.000000000 -0500 +++ linux-2.4.25-web100-cubic/net/ipv4/tcp_ipv4.c 2004-06-26 23:35:18.000000000 -0500 @@ -2051,6 +2051,12 @@ static int tcp_v4_init_sock(struct sock tp->snd_cwnd_clamp = ~0; tp->mss_cache = 536; +#ifdef CONFIG_WEB100_NET100 + bictcp_init(tp); + if (sysctl_bictcp && sysctl_bictcp_ssthresh) + tp->snd_ssthresh = sysctl_bictcp_ssthresh; +#endif + tp->reordering = sysctl_tcp_reordering; sk->state = TCP_CLOSE; diff -Nuarp linux-2.4.25-web100/net/ipv4/tcp_minisocks.c linux-2.4.25-web100-cubic/net/ipv4/tcp_minisocks.c --- linux-2.4.25-web100/net/ipv4/tcp_minisocks.c 2004-07-02 14:38:30.000000000 -0500 +++ linux-2.4.25-web100-cubic/net/ipv4/tcp_minisocks.c 2004-06-26 23:35:41.000000000 -0500 @@ -712,6 +712,12 @@ struct sock *tcp_create_openreq_child(st newtp->fackets_out = 0; newtp->snd_ssthresh = 0x7fffffff; +#ifdef CONFIG_WEB100_NET100 + bictcp_init(newtp); + if (sysctl_bictcp && sysctl_bictcp_ssthresh) + newtp->snd_ssthresh = sysctl_bictcp_ssthresh; +#endif + /* So many TCP implementations out there (incorrectly) count the * initial SYN frame in their delayed-ACK and congestion control * algorithms that we must have the following bandaid to talk diff -Nuarp linux-2.4.25-web100/net/ipv4/tcp_output.c linux-2.4.25-web100-cubic/net/ipv4/tcp_output.c --- linux-2.4.25-web100/net/ipv4/tcp_output.c 2004-07-02 14:38:32.000000000 -0500 +++ linux-2.4.25-web100-cubic/net/ipv4/tcp_output.c 2004-06-26 23:42:18.000000000 -0500 @@ -198,6 +198,9 @@ int tcp_transmit_skb(struct sock *sk, st int sysctl_flags; int err; + if (tp->bictcp_max_packets_in_flight == 0) + tp->bictcp_max_packets_in_flight = tcp_packets_in_flight(tp)+tcp_max_burst(tp); + #define SYSCTL_FLAG_TSTAMPS 0x1 #define SYSCTL_FLAG_WSCALE 0x2 #define SYSCTL_FLAG_SACK 0x4 @@ -450,6 
+453,8 @@ static int tcp_fragment(struct sock *sk, int nsize = skb->len - len; u16 flags; + clear_all_retrans_hints(tp); + if (skb_cloned(skb) && skb_is_nonlinear(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) @@ -778,6 +783,9 @@ static void tcp_retrans_try_collapse(str ((skb_size + next_skb_size) > mss_now)) return; + /* changing transmit queue under us so clear hints */ + clear_all_retrans_hints(tp); + /* Ok. We will be able to collapse the packet. */ __skb_unlink(next_skb, next_skb->list); @@ -848,6 +856,8 @@ void tcp_simple_retransmit(struct sock * } } + clear_all_retrans_hints(tp); + if (!lost) return; @@ -975,20 +985,39 @@ void tcp_xmit_retransmit_queue(struct so { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct sk_buff *skb; - int packet_cnt = tp->lost_out; + int packet_cnt; + + if( tp->xmit_retransmit_queue_lost_skb_hint != NULL ){ + skb = tp->xmit_retransmit_queue_lost_skb_hint; + packet_cnt = tp->xmit_retransmit_queue_lost_cnt_hint; + }else{ + skb = sk->write_queue.next; + packet_cnt = 0; + } /* First pass: retransmit lost packets. 
*/ - if (packet_cnt) { - for_retrans_queue(skb, sk, tp) { + if (tp->lost_out) { + for_retrans_queue_from(skb, skb, sk, tp) { __u8 sacked = TCP_SKB_CB(skb)->sacked; + /* we could do better than to assign each time */ + tp->xmit_retransmit_queue_lost_skb_hint = skb; + tp->xmit_retransmit_queue_lost_cnt_hint = packet_cnt; + if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) return; + if (sysctl_bictcp_moderation && tp->bictcp_max_packets_in_flight) { /* burst moderation */ + if (tcp_packets_in_flight(tp) >= tp->bictcp_max_packets_in_flight) + return; + } + if (sacked&TCPCB_LOST) { if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) { - if (tcp_retransmit_skb(sk, skb)) - return; + if (tcp_retransmit_skb(sk, skb)){ + tp->xmit_retransmit_queue_lost_skb_hint = NULL; + return; + } if (tp->ca_state != TCP_CA_Loss) NET_INC_STATS_BH(TCPFastRetrans); else @@ -998,8 +1027,8 @@ void tcp_xmit_retransmit_queue(struct so tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); } - if (--packet_cnt <= 0) - break; + if (++packet_cnt >= tp->lost_out) + break; } } } @@ -1024,21 +1053,38 @@ void tcp_xmit_retransmit_queue(struct so if (tcp_may_send_now(sk, tp)) return; - packet_cnt = 0; - - for_retrans_queue(skb, sk, tp) { + if ( tp->xmit_retransmit_queue_forward_skb_hint != NULL){ + skb = tp->xmit_retransmit_queue_forward_skb_hint; + packet_cnt = tp->xmit_retransmit_queue_forward_cnt_hint; + } else{ + skb = sk->write_queue.next; + packet_cnt = 0; + } + + for_retrans_queue_from(skb,skb, sk, tp) { + tp->xmit_retransmit_queue_forward_cnt_hint = packet_cnt; + tp->xmit_retransmit_queue_forward_skb_hint = skb; + if(++packet_cnt > tp->fackets_out) break; if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) break; + + if (sysctl_bictcp_moderation && tp->bictcp_max_packets_in_flight) { /* burst moderation */ + if (tcp_packets_in_flight(tp) >= tp->bictcp_max_packets_in_flight) + return; + } + if(TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) continue; /* Ok, retransmit it. 
*/ - if(tcp_retransmit_skb(sk, skb)) - break; + if(tcp_retransmit_skb(sk, skb)){ + tp->xmit_retransmit_queue_forward_skb_hint = NULL; + break; + } if (skb == skb_peek(&sk->write_queue)) tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);