Dissecting the Linux Kernel Implementation of TCP Congestion Control

Category: Linux Kernel  2011-12-05 17:10  Kernel version: 2.6.37

Main source file: linux-2.6.37/net/ipv4/tcp_cong.c

1. Reno and Congestion Control Basics

1.1 The Reno congestion control algorithm

===========================================================

Converting between struct sock *sk and struct tcp_sock *tp

In include/linux/tcp.h:

    static inline struct tcp_sock *tcp_sk(const struct sock *sk)
    {
        return (struct tcp_sock *)sk;
    }

So, given a struct sock *sk:

    struct tcp_sock *tp = tcp_sk(sk);

The tcp_sock structure:

    struct tcp_sock {
        ...
        u32 window_clamp;    /* Maximal window to advertise */
        u32 rcv_ssthresh;    /* Current window clamp */
        u32 rcv_wnd;         /* Current receiver window */
        ...
        /* snd_wl1 records the sequence number of the segment that caused the
         * last send-window update; it is used to decide whether the send
         * window needs updating the next time. */
        u32 snd_wl1;         /* Sequence for window update */
        u32 snd_wnd;         /* Send window size, taken directly from the TCP header of the peer's segments */
        u32 max_window;      /* Maximal window ever seen from peer */
        u32 snd_una;         /* First byte we want an ack for (left edge of the send window) */
        ...
        /*
         * Slow start and congestion control
         */
        u32 snd_ssthresh;    /* Slow start size threshold */
        u32 snd_cwnd;        /* Sending congestion window */
        u32 snd_cwnd_cnt;    /* Linear increase counter: number of segments already sent within the current congestion window */
        u32 snd_cwnd_clamp;  /* Do not allow snd_cwnd to grow above this */
        ...
        u32 mss_cache;       /* Cached effective mss, not including SACKS */
        u32 bytes_acked;     /* Appropriate Byte Counting - RFC3465 */
        ...
    };

The key part of the congestion avoidance algorithm:

    /* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w) */
    void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w)
    {
        if (tp->snd_cwnd_cnt >= w) {
            if (tp->snd_cwnd < tp->snd_cwnd_clamp)
                tp->snd_cwnd++;
            tp->snd_cwnd_cnt = 0;
        } else {
            tp->snd_cwnd_cnt++;
        }
    }
    EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);

Slow start

The meaning of the tcp_max_ssthresh parameter

tcp_max_ssthresh was introduced with the implementation of RFC 3742, about four years ago; the kernel implemented that RFC in May 2007. Oddly, the parameter is not documented anywhere in ip-sysctl.txt.

The slow start phase is the period during which the current congestion window is below the slow start threshold (snd_ssthresh). Each newly received ACK triggers tcp_slow_start(), which grows the congestion window by one. (In Linux the congestion window is counted in packets, not in bytes actually sent; the number of bytes that can be sent equals the number of sendable packets times the MSS.) Slow start continues until a packet is lost.

The tcp_max_ssthresh parameter throttles how fast the congestion window grows during slow start. It is disabled by default. If it is set to, say, 1000, then once the congestion window exceeds 1000 each ACK no longer adds a full unit to the window; instead roughly two ACKs are needed per unit of growth. Note that "two ACKs" is not a fixed figure: the actual amount depends on the current congestion window and on the tcp_max_ssthresh value.

See tcp.txt (Documentation/networking):

The following variables are used in the tcp_sock for congestion control:

snd_cwnd The size of the congestion window

snd_ssthresh Slow start threshold. We are in slow start if snd_cwnd is less than this.

snd_cwnd_cnt A counter used to slow down the rate of increase once we exceed slow start threshold.

snd_cwnd_clamp This is the maximum size that snd_cwnd can grow to.

snd_cwnd_stamp Timestamp for when congestion window last validated.

snd_cwnd_used Used as a highwater mark for how much of the congestion window is in use. It is used to adjust snd_cwnd down when the link is limited by the application rather than the network.

    void tcp_slow_start(struct tcp_sock *tp)
    {
        int cnt;    /* increase in packets */

        /* RFC3465: ABC slow start
         * Increase only after a full MSS of bytes is acked
         *
         * TCP sender SHOULD increase cwnd by the number of
         * previously unacknowledged bytes ACKed by each incoming
         * acknowledgment, provided the increase is not more than L
         */
        /* less than one MSS acked; tcp_abc is off by default */
        if (sysctl_tcp_abc && tp->bytes_acked < tp->mss_cache)
            return;

        /* current cwnd exceeds sysctl_tcp_max_ssthresh: limit cwnd growth */
        if (sysctl_tcp_max_ssthresh > 0 && tp->snd_cwnd > sysctl_tcp_max_ssthresh)
            cnt = sysctl_tcp_max_ssthresh >> 1;    /* limited slow start */
        else
            cnt = tp->snd_cwnd;                    /* exponential increase */

        /* RFC3465: ABC
         * We MAY increase by 2 if discovered delayed ack
         */
        /* if the receiver uses delayed ACKs, each ACK covers two MSS of data */
        if (sysctl_tcp_abc > 1 && tp->bytes_acked >= 2 * tp->mss_cache)
            cnt <<= 1;

        tp->bytes_acked = 0;
        tp->snd_cwnd_cnt += cnt;    /* now snd_cwnd_cnt equals snd_cwnd or 2*snd_cwnd */

        /* in the limited case, grow by at most sysctl_tcp_max_ssthresh/2;
         * in other words, at least two ACKs are needed to grow the
         * congestion window by one. */
        while (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
            tp->snd_cwnd_cnt -= tp->snd_cwnd;
            if (tp->snd_cwnd < tp->snd_cwnd_clamp)
                tp->snd_cwnd++;
        }
    }
    EXPORT_SYMBOL_GPL(tcp_slow_start);

1.2 Congestion control framework basics

1.2.1 The structure that represents a congestion control algorithm

    #define TCP_CA_NAME_MAX 16
    struct tcp_congestion_ops {
        struct list_head list;
        unsigned long flags;

        /* initialize private data (optional) */
        void (*init)(struct sock *sk);
        /* cleanup private data (optional) */
        void (*release)(struct sock *sk);
        /* return slow start threshold (required) */
        u32 (*ssthresh)(struct sock *sk);
        /* lower bound for congestion window (optional) */
        u32 (*min_cwnd)(const struct sock *sk);
        /* do new cwnd calculation (required) */
        void (*cong_avoid)(struct sock *sk, u32 ack, u32 in_flight);
        /* call before changing ca_state (optional) */
        void (*set_state)(struct sock *sk, u8 new_state);
        /* call when cwnd event occurs (optional) */
        void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev);
        /* new value of cwnd after loss (optional) */
        u32 (*undo_cwnd)(struct sock *sk);
        /* hook for packet ack accounting (optional) */
        void (*pkts_acked)(struct sock *sk, u32 num_acked, s32 rtt_us);
        /* get info for inet_diag (optional) */
        void (*get_info)(struct sock *sk, u32 ext, struct sk_buff *skb);

        char name[TCP_CA_NAME_MAX];
        struct module *owner;
    };

In tcp_cong.c there are these globals:

    int sysctl_tcp_max_ssthresh = 0;

    /* #define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x) */
    static DEFINE_SPINLOCK(tcp_cong_list_lock);

    /* list of TCP congestion control algorithms; its elements are
     * tcp_congestion_ops structures */
    static LIST_HEAD(tcp_cong_list);

A few helpers worth noting:

BUG_ON(): if the condition is true, BUG is called; it prints some information and then calls panic to halt the system.

char *strncpy(char *dest, char *src, size_t n): unlike strcpy, this copies n characters rather than the whole string (including the trailing '\0'). If src is shorter than n, the uncopied space in dest is padded with '\0'; otherwise exactly n characters are copied and no '\0' is appended, so take care to terminate dest yourself.

rcu_read_lock(): marks the point where a reader enters an RCU read-side critical section for RCU-protected shared data.
rcu_read_unlock(): paired with rcu_read_lock; marks the reader leaving the read-side critical section.

1.2.2 Operations on congestion control algorithms (querying, listing, registering, etc.)

    /* Get current default congestion control */
    void tcp_get_default_congestion_control(char *name)
    {
        struct tcp_congestion_ops *ca;

        /* We will always have reno */
        BUG_ON(list_empty(&tcp_cong_list));

        rcu_read_lock();
        ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list);
        strncpy(name, ca->name, TCP_CA_NAME_MAX);
        rcu_read_unlock();
    }

struct sock — representation of sockets
struct inet_sock — representation of INET sockets
struct inet_connection_sock — INET connection-oriented sockets
struct tcp_sock — TCP sockets

These socket types are increasingly specialized: inet_connection_sock, for example, extends inet_sock with attributes of its own. tcp_sock is the TCP-specific socket representation; it extends struct inet_connection_sock mainly with TCP-only state such as the sliding-window protocol and congestion avoidance.

    struct inet_connection_sock {
        ...
        /* Pluggable congestion control hook */
        const struct tcp_congestion_ops *icsk_ca_ops;
        ...
        u32 icsk_ca_priv[16];
    #define ICSK_CA_PRIV_SIZE (16 * sizeof(u32))
    };

Example (given an initialized struct sock *sk):

    struct inet_connection_sock *icsk = inet_csk(sk);
    /* name of the congestion control algorithm used by this connection */
    printk(KERN_INFO "%s", icsk->icsk_ca_ops->name);

    struct inet_sock {
        ...
        /* Socket demultiplex comparisons on incoming packets */
        __be32 inet_daddr;
        __be16 inet_dport;
        __be32 inet_saddr;
        __be16 inet_sport;
        __be16 inet_num;        /* local port */
        __be32 inet_rcv_saddr;  /* bound local IPv4 addr */
        ...
    };

    /* Build list of non-restricted congestion control values */
    void tcp_get_allowed_congestion_control(char *buf, size_t maxlen)
    {
        struct tcp_congestion_ops *ca;
        size_t offs = 0;

        *buf = '\0';    /* is this needed? */
        rcu_read_lock();
        list_for_each_entry(ca, &tcp_cong_list, list) {
            /* skip restricted entries (what exactly separates restricted
             * from non-restricted?) */
            if (!(ca->flags & TCP_CONG_NON_RESTRICTED))
                continue;
            offs += snprintf(buf + offs, maxlen - offs, "%s%s",
                             offs == 0 ? "" : " ", ca->name);
        }
        rcu_read_unlock();
    }

    /* Simple linear search, don't expect many entries! */
    static struct tcp_congestion_ops *tcp_ca_find(const char *name)
    {
        struct tcp_congestion_ops *e;

        list_for_each_entry_rcu(e, &tcp_cong_list, list) {
            if (strcmp(e->name, name) == 0)
                return e;
        }
        return NULL;
    }

    /*
     * Attach new congestion control algorithm to the list
     * of available options.
     */
    int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
    {
        int ret = 0;

        /* all algorithms must implement ssthresh and cong_avoid ops */
        if (!ca->ssthresh || !ca->cong_avoid) {
            printk(KERN_ERR "TCP %s does not implement required ops\n",
                   ca->name);
            return -EINVAL;
        }

        spin_lock(&tcp_cong_list_lock);
        if (tcp_ca_find(ca->name)) {
            printk(KERN_NOTICE "TCP %s already registered\n", ca->name);
            /* must not return here directly, or the lock would never be
             * released (deadlock) */
            ret = -EEXIST;
        } else {
            list_add_tail_rcu(&ca->list, &tcp_cong_list);
            printk(KERN_INFO "TCP %s registered\n", ca->name);
        }
        spin_unlock(&tcp_cong_list_lock);

        return ret;
    }

2. Other Congestion Control Algorithms

2.1 The BIC congestion control algorithm

2.1.1 Related structures and parameters

    /* BIC TCP Parameters */
    struct bictcp {
        u32 cnt;            /* increase cwnd by 1 after ACKs */
        u32 last_max_cwnd;  /* last maximum snd_cwnd */
        u32 loss_cwnd;      /* congestion window at last loss */
        u32 last_cwnd;      /* the last snd_cwnd */
        u32 last_time;      /* time when updated last_cwnd */
        u32 epoch_start;    /* beginning of an epoch */
    #define ACK_RATIO_SHIFT 4
        u32 delayed_ack;    /* estimate the ratio of Packets/ACKs << 4 */
    };

    /* Scale factor beta calculation
     * max_cwnd = snd_cwnd * beta
     */
    #define BICTCP_BETA_SCALE 1024

    /* In binary search,
     * go to point (max+min) / N
     */
    #define BICTCP_B 4    /* so not a true bisection */

2.1.2 Global variables

    static int fast_convergence = 1;  /* switch: lets BIC converge quickly to an equilibrium value */
    static int max_increment = 16;    /* cap on each increase in MSS units, to keep growth from being too violent */
    static int low_window = 14;       /* lower bound on congestion window, for TCP friendliness */
    static int beta = 819;            /* = 819 / 1024 (BICTCP_BETA_SCALE), beta for multiplicative increase */
    static int initial_ssthresh;      /* initial slow start threshold */
    static int smooth_part = 20;      /* log(B/(B*Smin))/log(B/(B-1))+B, # of RTT from Wmax-B to Wmax */

(When initial_ssthresh is not set, snd_ssthresh starts out as 2^31 - 1 = 2147483647.)

The bictcp structure lives in the connection's private congestion control area:

    struct inet_connection_sock {
        ...
        u32 icsk_ca_priv[16];
    #define ICSK_CA_PRIV_SIZE (16 * sizeof(u32))
    };

    static inline void *inet_csk_ca(const struct sock *sk)
    {
        return (void *)inet_csk(sk)->icsk_ca_priv;
    }

=========================================

I have not fully figured out the implementation of tcp_is_cwnd_limited:

    /* Slow start with delack produces 3 packets of burst, so that it is
     * safe "de facto". This will be default - same as the default reordering
     * threshold - but if reordering increases, we must be able to allow
     * cwnd to burst at least this much in order to not pull it back when
     * holes are filled.
     */
    static __inline__ __u32 tcp_max_burst(const struct tcp_sock *tp)
    {
        return tp->reordering;    /* u8 reordering: packet reordering metric */
    }

    /* RFC2861 Check whether we are limited by application or congestion
     * window. This is the inverse of cwnd check in tcp_tso_should_defer.
     */
    /* returns 0 if cwnd need not grow; 1 if we are cwnd-limited and it should grow */
    int tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight)
    {
        const struct tcp_sock *tp = tcp_sk(sk);
        u32 left;

        if (in_flight >= tp->snd_cwnd)    /* isn't in_flight supposed to stay below snd_cwnd? */
            return 1;

        left = tp->snd_cwnd - in_flight;
        if (sk_can_gso(sk) &&
            left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd &&
            left * tp->mss_cache < sk->sk_gso_max_size)
            return 1;

        return left <= tcp_max_burst(tp);
    }

==========================================

2.1.3 BIC congestion avoidance

    static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
    {
        struct tcp_sock *tp = tcp_sk(sk);
        struct bictcp *ca = inet_csk_ca(sk);

        /* if the congestion window is not the limiting factor, it must not
         * grow any further; return */
        if (!tcp_is_cwnd_limited(sk, in_flight))
            return;

        if (tp->snd_cwnd < tp->snd_ssthresh)
            tcp_slow_start(tp);
        else {
            bictcp_update(ca, tp->snd_cwnd);
            tcp_cong_avoid_ai(tp, ca->cnt);
        }
    }

As this function shows, BIC's slow start is identical to Reno's. In the congestion avoidance phase the two also behave the same while snd_cwnd <= low_window; only when snd_cwnd > low_window does BIC start to show its character.

In include/net/tcp.h:

    /* TCP timestamps are only 32-bits */
    #define tcp_time_stamp ((__u32)(jiffies))

2.1.4 Updating the bictcp structure (the heart of BIC)

    /*
     * Compute congestion window to use.
     */
    static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
    {
        /* do not update ca more than once per 31.25 ms */
        if (ca->last_cwnd == cwnd &&
            (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32)
            return;

        ca->last_cwnd = cwnd;
        ca->last_time = tcp_time_stamp;

        if (ca->epoch_start == 0)    /* record the beginning of an epoch */
            ca->epoch_start = tcp_time_stamp;

        /* start off normal */
        if (cwnd <= low_window) {    /* stay friendly to standard TCP */
            ca->cnt = cwnd;          /* so up to 14 ACKs give snd_cwnd++ */
            return;
        }

        /* binary increase */
        if (cwnd < ca->last_max_cwnd) {    /* last_max_cwnd: snd_cwnd just before the last loss */
            __u32 dist = (ca->last_max_cwnd - cwnd) / BICTCP_B;    /* a quarter of the gap */

            if (dist > max_increment)    /* linear increase */
                /* dist > 16: linear increase phase; every 16 ACKs, snd_cwnd++ */
                ca->cnt = cwnd / max_increment;
            else if (dist <= 1U)    /* binary search increase */
                /* dist <= 1: ca->cnt = 5*cwnd, so snd_cwnd grows extremely
                 * slowly -- the steady phase */
                ca->cnt = (cwnd * smooth_part) / BICTCP_B;
            else    /* binary search increase */
                /* 1 < dist <= 16: every dist ACKs, snd_cwnd++, so growth is fast */
                ca->cnt = cwnd / dist;
        } else {    /* enter the max probing phase */
            if (cwnd < ca->last_max_cwnd + BICTCP_B)
                /* cwnd < last_max_cwnd + 4: ca->cnt = 5*cwnd; slow start */
                ca->cnt = (cwnd * smooth_part) / BICTCP_B;
            else if (cwnd < ca->last_max_cwnd + max_increment * (BICTCP_B - 1))
                /* growth rate rises from 5/(3*cwnd) toward 47/(3*cwnd):
                 * snd_cwnd growth speeds up */
                ca->cnt = (cwnd * (BICTCP_B - 1)) /
                          (cwnd - ca->last_max_cwnd);
            else
                ca->cnt = cwnd / max_increment;    /* growth rate 16/cwnd: faster still */
        }

        /* if in slow start or link utilization is very low */
        if (ca->loss_cwnd == 0) {    /* no loss has happened yet, so snd_cwnd may grow faster */
            if (ca->cnt > 20)    /* increase cwnd 5% per RTT */
                ca->cnt = 20;
        }

        /* equivalent to scaling by the delayed_ack percentage: the heavier
         * the delaying, the faster snd_cwnd should grow, so delayed ACKs
         * hardly affect snd_cwnd at all */
        ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack;

        /* ca->cnt cannot be zero */
        if (ca->cnt == 0)
            ca->cnt = 1;
    }

Summary:

Compared with Reno, BIC grows snd_cwnd very quickly during congestion avoidance. When ca->last_max_cwnd - snd_cwnd >= 4, the slowest growth rate of snd_cwnd is 1/16 per ACK. When ca->last_max_cwnd - snd_cwnd < 4, the growth rate is very low, which keeps the current snd_cwnd stable for a long time, i.e. data is sent at the most suitable snd_cwnd. These two properties let BIC achieve high throughput on high-bandwidth, long-delay paths.

1. Search phase
(1) cwnd < last_max_cwnd - 64: cnt = cwnd / 16
(2) last_max_cwnd - 64 <= cwnd < last_max_cwnd - 4: cnt = cwnd / dist
(3) last_max_cwnd - 4 <= cwnd < last_max_cwnd: cnt = 5*cwnd
Overall, snd_cwnd grows quickly at first, then slows down and stabilizes.

2. Max probing phase
(1) last_max_cwnd <= cwnd < last_max_cwnd + 4: cnt = 5*cwnd
(2) last_max_cwnd + 4 <= cwnd < last_max_cwnd + 48: cnt = 3*cwnd / (cwnd - last_max_cwnd)
(3) cwnd >= last_max_cwnd + 48: cnt = cwnd / 16
Overall, snd_cwnd grows slowly at first, then faster and faster.

=======================================================================

Now let's look at initialization and reset.

    static inline void bictcp_reset(struct bictcp *ca)
    {
        ca->cnt = 0;
        ca->last_max_cwnd = 0;
        ca->loss_cwnd = 0;
        ca->last_cwnd = 0;
        ca->last_time = 0;
        ca->epoch_start = 0;
        /* default: 2 packets per ACK, i.e. 50% delayed-ACK ratio */
        ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
    }

bictcp_reset is called in two situations: at initialization (bictcp_init) and on entering loss handling (bictcp_state with new state TCP_CA_Loss).

    static void bictcp_init(struct sock *sk)
    {
        bictcp_reset(inet_csk_ca(sk));

        /* set at module load time; otherwise snd_ssthresh stays 2^31 - 1 */
        if (initial_ssthresh)
            tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
    }

=======================================================================

2.1.5 Recomputing the slow start threshold

As we know, two functions are indispensable to any congestion control algorithm: besides bictcp_cong_avoid (congestion avoidance) analyzed above, there is bictcp_recalc_ssthresh (slow start threshold recalculation). Reno simply halves the window at the moment of congestion; BIC adds a little more.

    /*
     * behave like Reno until low_window is reached,
     * then increase congestion window slowly
     */
    static u32 bictcp_recalc_ssthresh(struct sock *sk)
    {
        const struct tcp_sock *tp = tcp_sk(sk);
        struct bictcp *ca = inet_csk_ca(sk);

        ca->epoch_start = 0;    /* end of epoch: the quiet days are over */

        /* Wmax and fast convergence
         * "fast"? It seems more about being safe: if the loss point is lower
         * than last time, the path has worsened, so back off proactively;
         * if it is higher, things have improved, so adopt the larger value.
         */
        if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
            /* i.e. last_max_cwnd = 0.9 * snd_cwnd */
            ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
                                / (2 * BICTCP_BETA_SCALE);
        else
            ca->last_max_cwnd = tp->snd_cwnd;

        ca->loss_cwnd = tp->snd_cwnd;

        /* when snd_cwnd <= 14, same as Reno, for friendliness */
        if (tp->snd_cwnd <= low_window)
            return max(tp->snd_cwnd >> 1U, 2U);
        else
            /* i.e. snd_ssthresh = 0.8 * snd_cwnd: a large value that makes
             * full use of the bandwidth */
            return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
    }

bictcp_recalc_ssthresh does two things: it reassigns last_max_cwnd and returns the new slow start threshold. Note in particular that snd_ssthresh = 0.8 * snd_cwnd, much larger than Reno's snd_ssthresh = 0.5 * snd_cwnd. This is why BIC can use large bandwidths more effectively.

=======================================================================

Computing the delayed-packet ratio

    /* Track delayed acknowledgement ratio using sliding window
     * ratio = (15*ratio + sample) / 16
     *
     * Here the sample is the incoming cnt, and the stored value is
     * ratio = delayed_ack / 16. After the update below,
     *   new ratio = (15*ratio)/16 + cnt/16,
     * because the old ratio is first subtracted from cnt.
     */
    static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt)
    {
        const struct inet_connection_sock *icsk = inet_csk(sk);

        if (icsk->icsk_ca_state == TCP_CA_Open) {
            struct bictcp *ca = inet_csk_ca(sk);

            /* the author is clearly keen to damp the effect of delayed
             * ACKs on snd_cwnd as much as possible */
            cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
            ca->delayed_ack += cnt;
        }
    }

In struct inet_connection_sock, the field __u8 icsk_ca_state holds the congestion control state. In tcp.h:

    enum tcp_ca_state {
        TCP_CA_Open = 0,
    #define TCPF_CA_Open     (1 << TCP_CA_Open)
        TCP_CA_Disorder = 1,
    #define TCPF_CA_Disorder (1 << TCP_CA_Disorder)
        TCP_CA_CWR = 2,
    #define TCPF_CA_CWR      (1 << TCP_CA_CWR)
        TCP_CA_Recovery = 3,
    #define TCPF_CA_Recovery (1 << TCP_CA_Recovery)
        TCP_CA_Loss = 4
    #define TCPF_CA_Loss     (1 << TCP_CA_Loss)
    };

=======================================================================

    static u32 bictcp_undo_cwnd(struct sock *sk)
    {
        const struct tcp_sock *tp = tcp_sk(sk);
        const struct bictcp *ca = inet_csk_ca(sk);

        return max(tp->snd_cwnd, ca->last_max_cwnd);
    }

This function is called when leaving loss handling, while bictcp_state below is called on entering it.

    static void bictcp_state(struct sock *sk, u8 new_state)
    {
        if (new_state == TCP_CA_Loss)
            bictcp_reset(inet_csk_ca(sk));
    }

=======================================================================

The bictcp algorithm structure:

    static struct tcp_congestion_ops bictcp = {
        .init       = bictcp_init,
        .ssthresh   = bictcp_recalc_ssthresh,
        .cong_avoid = bictcp_cong_avoid,
        .set_state  = bictcp_state,
        .undo_cwnd  = bictcp_undo_cwnd,
        .pkts_acked = bictcp_acked,
        .owner      = THIS_MODULE,
        .name       = "bic",
    };

The bictcp registration function:

    static int __init bictcp_register(void)
    {
        /* BIC's private state must fit in icsk_ca_priv: no more than 16 u32s */
        BUILD_BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE);
        return tcp_register_congestion_control(&bictcp);
    }

OK, that wraps up the BIC code analysis. Next, let's look at the circumstances under which these functions are called, and in what order.

=======================================================================

2.1.6 When the BIC functions are called

1. Every ACK the connection receives triggers tcp_ack.

2. tcp_ack calls bictcp_acked to update cnt and delayed_ack (damping the effect of delayed ACKs).

3. tcp_ack calls bictcp_cong_avoid, with two cases:

(1) snd_cwnd below the slow start threshold (slow start phase): call tcp_slow_start.

(2) snd_cwnd above the slow start threshold (congestion avoidance phase): call bictcp_update to refresh the bictcp state, then tcp_cong_avoid_ai.

4. When tcp_ack detects a loss and enters loss handling, it calls bictcp_recalc_ssthresh to update the slow start threshold.

5. When tcp_ack finishes retransmitting the lost packets and leaves loss handling, it calls bictcp_undo_cwnd to update snd_cwnd.

Fast retransmit: the loss detection in tcp_ack, i.e. three duplicate ACKs in a row.

Fast recovery: bictcp_undo_cwnd sets snd_cwnd to max(snd_cwnd, last_max_cwnd), close to its value before the loss.

2.2 A brief look at common congestion control algorithms

1. Cubic

Cubic, as the name suggests, is characterized by a window growth function that is a cubic function.

The protocol modifies the linear window growth function of existing TCP standards to be a cubic function in order to improve the scalability of TCP over fast, long-distance networks. It also achieves more equitable bandwidth allocations among flows with different RTTs by making the window growth independent of RTT, so those flows grow their congestion windows at the same rate. During steady state, Cubic increases the window size aggressively when the window is far from the saturation point, and slowly when it is close to the saturation point.

Cubic not only achieves high throughput on networks with a high bandwidth-delay product (BDP); it also has good RTT fairness. It is thus both a TCP-friendly and a high-speed algorithm.

Cubic is very stable, though in volatile network environments this is a double-edged sword: because it stays longer near the previous saturation point than other variants, it can be sluggish to find the new saturation point if the saturation point has increased far beyond the last one.

Cubic is also very fair, but when RTTs are small this fairness reduces the algorithm's competitiveness for bandwidth. The key feature of Cubic is that its window growth depends only on the real time between two consecutive congestion events; when RTTs are short, since the window growth rate is fixed, its growth rate can be slower than that of standard TCP.

Cubic also drops BIC's max_increment cap: extensive testing showed the feature was no longer needed given Cubic's increased stability. Cubic replaced BIC-TCP as the default algorithm in 2006, after version 2.6.18.
