This analysis is based on kernel 3.10.105.
## softnet_data

Each CPU has its own softnet_data queue for incoming frames. Because every CPU owns a private instance of this structure for handling both ingress and egress traffic, no locking is needed between CPUs. The structure is defined in include/linux/netdevice.h as follows:
```c
struct softnet_data {
    struct Qdisc        *output_queue;
    struct Qdisc        **output_queue_tailp;
    struct list_head    poll_list;          /* napi_structs waiting to be polled */
    struct sk_buff      *completion_queue;  /* skbs waiting to be freed */
    struct sk_buff_head process_queue;      /* backlog packets being processed */

    /* stats */
    unsigned int        processed;
    unsigned int        time_squeeze;
    unsigned int        cpu_collision;
    unsigned int        received_rps;

#ifdef CONFIG_RPS
    struct softnet_data *rps_ipi_list;

    /* these fields may be touched by other CPUs for RPS */
    struct call_single_data csd ____cacheline_aligned_in_smp;
    struct softnet_data *rps_ipi_next;
    unsigned int        cpu;
    unsigned int        input_queue_head;
    unsigned int        input_queue_tail;
#endif
    unsigned int        dropped;
    struct sk_buff_head input_pkt_queue;    /* frames queued by netif_rx()/RPS */
    struct napi_struct  backlog;            /* the per-CPU "backlog device" */
};
```
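Since each CPU only ever touches its own instance (the RPS fields are the documented exception), code that needs this structure simply resolves the per-CPU copy instead of taking a cross-CPU lock. A minimal sketch, assuming a made-up helper name (per_cpu()/__get_cpu_var() are the real per-CPU accessors in 3.10):

```c
/* Hypothetical helper for illustration only: peek at the backlog length of
 * the current CPU's softnet_data. No cross-CPU lock is required because each
 * CPU manipulates only its own copy. */
static unsigned int show_backlog_len(void)
{
    struct softnet_data *sd = &__get_cpu_var(softnet_data);

    return skb_queue_len(&sd->input_pkt_queue);
}
```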
## Initialization

Initialization happens in net_dev_init() in net/core/dev.c:
```c
static int __init net_dev_init(void)
{
    ……
    for_each_possible_cpu(i) {
        struct softnet_data *sd = &per_cpu(softnet_data, i);

        memset(sd, 0, sizeof(*sd));
        skb_queue_head_init(&sd->input_pkt_queue);
        skb_queue_head_init(&sd->process_queue);
        sd->completion_queue = NULL;
        INIT_LIST_HEAD(&sd->poll_list);
        sd->output_queue = NULL;
        sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
        sd->csd.func = rps_trigger_softirq;
        sd->csd.info = sd;
        sd->csd.flags = 0;
        sd->cpu = i;
#endif

        sd->backlog.poll = process_backlog;
        sd->backlog.weight = weight_p;
        sd->backlog.gro_list = NULL;
        sd->backlog.gro_count = 0;
    }
    ……
}
```
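Not included in the excerpt above: later in the same function, net_dev_init() also registers the softirq handlers, which is what lets ____napi_schedule() (shown later) defer the receive work to net_rx_action(). Abridged, from the same kernel version:

```c
    /* net_dev_init(), abridged: hook up the TX/RX softirq handlers */
    open_softirq(NET_TX_SOFTIRQ, net_tx_action);
    open_softirq(NET_RX_SOFTIRQ, net_rx_action);
```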
## Non-NAPI

Take vortex_rx() as an example. vortex_interrupt() is the interrupt handler; when a frame is received it calls vortex_rx().
### vortex_rx

```c
static int vortex_rx(struct net_device *dev)
{
    struct vortex_private *vp = netdev_priv(dev);
    void __iomem *ioaddr = vp->ioaddr;
    int i;
    short rx_status;
    ……
    int pkt_len = rx_status & 0x1fff;
    struct sk_buff *skb;

    skb = netdev_alloc_skb(dev, pkt_len + 5);
    ……
    ……
    skb->protocol = eth_type_trans(skb, dev);
    netif_rx(skb);
    dev->stats.rx_packets++;
    ……
}
```
### netif_rx

The top-half handler for non-NAPI packet reception is netif_rx(). It enqueues the frame onto the current (or RPS-selected) CPU's backlog; if that queue already exceeds netdev_max_backlog, the frame is dropped. The code is as follows:
```c
int netif_rx(struct sk_buff *skb)
{
    int ret;

    if (netpoll_rx(skb))
        return NET_RX_DROP;

    net_timestamp_check(netdev_tstamp_prequeue, skb);

    trace_netif_rx(skb);
#ifdef CONFIG_RPS
    if (static_key_false(&rps_needed)) {
        struct rps_dev_flow voidflow, *rflow = &voidflow;
        int cpu;

        preempt_disable();
        rcu_read_lock();

        cpu = get_rps_cpu(skb->dev, skb, &rflow);
        if (cpu < 0)
            cpu = smp_processor_id();

        ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);

        rcu_read_unlock();
        preempt_enable();
    } else
#endif
    {
        unsigned int qtail;
        ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
        put_cpu();
    }
    return ret;
}
```
```c
static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
                              unsigned int *qtail)
{
    struct softnet_data *sd;
    unsigned long flags;

    sd = &per_cpu(softnet_data, cpu);

    local_irq_save(flags);

    rps_lock(sd);
    if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
        if (skb_queue_len(&sd->input_pkt_queue)) {
enqueue:
            __skb_queue_tail(&sd->input_pkt_queue, skb);
            input_queue_tail_incr_save(sd, qtail);
            rps_unlock(sd);
            local_irq_restore(flags);
            return NET_RX_SUCCESS;
        }

        /* Queue was empty: schedule the backlog napi_struct so the softirq
         * will come and drain input_pkt_queue, then enqueue the frame. */
        if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
            if (!rps_ipi_queued(sd))
                ____napi_schedule(sd, &sd->backlog);
        }
        goto enqueue;
    }

    /* Backlog is over netdev_max_backlog: drop the frame. */
    sd->dropped++;
    rps_unlock(sd);

    local_irq_restore(flags);

    atomic_long_inc(&skb->dev->rx_dropped);
    kfree_skb(skb);
    return NET_RX_DROP;
}
```
In the bottom-half handler net_rx_action(), the call
```c
if (test_bit(NAPI_STATE_SCHED, &n->state)) {
    work = n->poll(n, weight);
    trace_napi_poll(n);
}
```
ends up invoking the CPU's default poll handler, process_backlog().
### process_backlog

```c
static int process_backlog(struct napi_struct *napi, int quota)
{
    int work = 0;
    struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);

#ifdef CONFIG_RPS
    if (sd->rps_ipi_list) {
        local_irq_disable();
        net_rps_action_and_irq_enable(sd);
    }
#endif
    napi->weight = weight_p;
    local_irq_disable();
    while (work < quota) {
        struct sk_buff *skb;
        unsigned int qlen;

        /* Hand every packet on process_queue to the protocol layers. */
        while ((skb = __skb_dequeue(&sd->process_queue))) {
            rcu_read_lock();
            local_irq_enable();
            __netif_receive_skb(skb);
            rcu_read_unlock();
            local_irq_disable();
            input_queue_head_incr(sd);
            if (++work >= quota) {
                local_irq_enable();
                return work;
            }
        }

        /* Refill process_queue from input_pkt_queue in one splice. */
        rps_lock(sd);
        qlen = skb_queue_len(&sd->input_pkt_queue);
        if (qlen)
            skb_queue_splice_tail_init(&sd->input_pkt_queue,
                                       &sd->process_queue);

        if (qlen < quota - work) {
            /* Everything left fits in the remaining quota: take the
             * backlog off the poll_list and clear its state. */
            list_del(&napi->poll_list);
            napi->state = 0;
            quota = work + qlen;
        }
        rps_unlock(sd);
    }
    local_irq_enable();

    return work;
}
```
### Summary

In the non-NAPI model every interrupt pushes a single frame up the stack, so when traffic bursts the interrupt rate climbs and the CPU has less time left for actually processing packets.
## NAPI

NAPI mixes interrupts with polling rather than using a purely interrupt-driven model. If new frames arrive while the kernel has not yet finished processing the previous ones, there is no need for the driver to raise further interrupts: it is simpler to let the kernel keep draining the device's input queue (with that device's interrupts disabled) and to re-enable interrupts only once the queue is empty.
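The driver-side pattern this implies is small and uniform. The sketch below is a generic skeleton rather than code from any real driver (my_adapter, my_intr, my_poll, my_disable_irqs(), my_enable_irqs() and my_rx_ring_drain() are all made-up names); it only illustrates how napi_schedule_prep()/__napi_schedule() in the interrupt handler pair with napi_complete() in the poll callback:

```c
/* Hypothetical driver; only the napi_struct interplay matters here. */
struct my_adapter {
    struct napi_struct napi;
    /* ... registers, rings, locks ... */
};

static irqreturn_t my_intr(int irq, void *data)
{
    struct my_adapter *ad = data;

    if (napi_schedule_prep(&ad->napi)) {   /* not already scheduled? */
        my_disable_irqs(ad);               /* placeholder: mask RX interrupts */
        __napi_schedule(&ad->napi);        /* defer the work to NET_RX_SOFTIRQ */
    }
    return IRQ_HANDLED;
}

static int my_poll(struct napi_struct *napi, int budget)
{
    struct my_adapter *ad = container_of(napi, struct my_adapter, napi);
    int work_done;

    work_done = my_rx_ring_drain(ad, budget);  /* placeholder: up to budget frames */

    if (work_done < budget) {              /* ring drained before budget ran out */
        napi_complete(napi);
        my_enable_irqs(ad);                /* placeholder: unmask RX interrupts */
    }
    return work_done;
}
```

ixgb_intr() and ixgb_clean() shown below are the concrete ixgb instances of this pattern.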
### Related fields in net_device

```c
struct list_head    napi_list;
```
It is initialized in alloc_netdev_mqs():
```c
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
                                    void (*setup)(struct net_device *),
                                    unsigned int txqs, unsigned int rxqs)
{
    ……
    INIT_LIST_HEAD(&dev->napi_list);
    ……
}
```
It is then modified in netif_napi_add():
```c
void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
                    int (*poll)(struct napi_struct *, int), int weight)
{
    INIT_LIST_HEAD(&napi->poll_list);
    napi->gro_count = 0;
    napi->gro_list = NULL;
    napi->skb = NULL;
    napi->poll = poll;
    if (weight > NAPI_POLL_WEIGHT)
        pr_err_once("netif_napi_add() called with weight %d on device %s\n",
                    weight, dev->name);
    napi->weight = weight;
    list_add(&napi->dev_list, &dev->napi_list);
    napi->dev = dev;
#ifdef CONFIG_NETPOLL
    spin_lock_init(&napi->poll_lock);
    napi->poll_owner = -1;
#endif
    set_bit(NAPI_STATE_SCHED, &napi->state);
}
```
netif_napi_add() is normally called from driver code; here we take ixgb as the example:
```c
static int ixgb_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
{
    ……
    netdev = alloc_etherdev(sizeof(struct ixgb_adapter));

    netdev->netdev_ops = &ixgb_netdev_ops;
    ixgb_set_ethtool_ops(netdev);
    netdev->watchdog_timeo = 5 * HZ;

    adapter = netdev_priv(netdev);
    ……
    netif_napi_add(netdev, &adapter->napi, ixgb_clean, 64);
    ……
}
```
### ixgb_netdev_ops

```c
static const struct net_device_ops ixgb_netdev_ops = {
    .ndo_open               = ixgb_open,
    .ndo_stop               = ixgb_close,
    .ndo_start_xmit         = ixgb_xmit_frame,
    .ndo_get_stats          = ixgb_get_stats,
    .ndo_set_rx_mode        = ixgb_set_multi,
    .ndo_validate_addr      = eth_validate_addr,
    .ndo_set_mac_address    = ixgb_set_mac,
    .ndo_change_mtu         = ixgb_change_mtu,
    .ndo_tx_timeout         = ixgb_tx_timeout,
    .ndo_vlan_rx_add_vid    = ixgb_vlan_rx_add_vid,
    .ndo_vlan_rx_kill_vid   = ixgb_vlan_rx_kill_vid,
#ifdef CONFIG_NET_POLL_CONTROLLER
    .ndo_poll_controller    = ixgb_netpoll,
#endif
    .ndo_fix_features       = ixgb_fix_features,
    .ndo_set_features       = ixgb_set_features,
};
```
Here, the driver's open function is ixgb_open().
### ixgb_open

```c
static int ixgb_open(struct net_device *netdev)
{
    struct ixgb_adapter *adapter = netdev_priv(netdev);
    int err;

    err = ixgb_setup_tx_resources(adapter);
    if (err)
        goto err_setup_tx;

    netif_carrier_off(netdev);

    err = ixgb_setup_rx_resources(adapter);
    if (err)
        goto err_setup_rx;

    err = ixgb_up(adapter);
    if (err)
        goto err_up;

    netif_start_queue(netdev);

    return 0;
    ……
}
```
### ixgb_setup_rx_resources

ixgb_setup_rx_resources() allocates the receive-side resources. The main data structures are:
```c
struct ixgb_buffer {
    struct sk_buff *skb;        /* skb backing this descriptor */
    dma_addr_t dma;             /* DMA address the NIC writes into */
    unsigned long time_stamp;
    u16 length;
    u16 next_to_watch;
    u16 mapped_as_page;
};

struct ixgb_desc_ring {
    void *desc;                 /* ring of hardware descriptors (DMA-coherent) */
    dma_addr_t dma;             /* bus address of the ring */
    unsigned int size;          /* ring size in bytes */
    unsigned int count;         /* number of descriptors */
    unsigned int next_to_use;
    unsigned int next_to_clean;
    struct ixgb_buffer *buffer_info;    /* per-descriptor software state */
};

struct ixgb_rx_desc {
    __le64 buff_addr;
    __le16 length;
    __le16 reserved;
    u8 status;
    u8 errors;
    __le16 special;
};
```
```c
int ixgb_setup_rx_resources(struct ixgb_adapter *adapter)
{
    struct ixgb_desc_ring *rxdr = &adapter->rx_ring;
    struct pci_dev *pdev = adapter->pdev;
    int size;

    size = sizeof(struct ixgb_buffer) * rxdr->count;
    rxdr->buffer_info = vzalloc(size);
    if (!rxdr->buffer_info)
        return -ENOMEM;

    rxdr->size = rxdr->count * sizeof(struct ixgb_rx_desc);
    rxdr->size = ALIGN(rxdr->size, 4096);

    rxdr->desc = dma_alloc_coherent(&pdev->dev, rxdr->size, &rxdr->dma,
                                    GFP_KERNEL);
    if (!rxdr->desc) {
        vfree(rxdr->buffer_info);
        return -ENOMEM;
    }
    memset(rxdr->desc, 0, rxdr->size);

    rxdr->next_to_clean = 0;
    rxdr->next_to_use = 0;

    return 0;
}
```
(Figure: memory layout of the descriptor ring and the buffer_info array after this setup.)
The streaming DMA mapping from each skb to the device is set up in ixgb_alloc_rx_buffers() (note the contrast with the descriptor ring itself, which uses a coherent mapping obtained from dma_alloc_coherent() above).
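A simplified sketch of that refill step, paraphrasing what ixgb_alloc_rx_buffers() does for one descriptor (this is not the full function; rx_buffer_len is the driver's configured buffer size):

```c
    /* Paraphrased per-descriptor refill: allocate an skb and hand its data
     * area to the NIC through a streaming DMA mapping. */
    skb = netdev_alloc_skb(netdev, adapter->rx_buffer_len + NET_IP_ALIGN);
    if (!skb)
        break;                      /* retry on the next refill */

    skb_reserve(skb, NET_IP_ALIGN);

    buffer_info->skb = skb;
    buffer_info->length = adapter->rx_buffer_len;
    buffer_info->dma = dma_map_single(&pdev->dev, skb->data,
                                      adapter->rx_buffer_len,
                                      DMA_FROM_DEVICE);

    /* the hardware descriptor receives the bus address the NIC will DMA into */
    rx_desc = IXGB_RX_DESC(*rx_ring, i);
    rx_desc->buff_addr = cpu_to_le64(buffer_info->dma);
```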
### ixgb_setup_tx_resources

ixgb_setup_tx_resources() is similar to ixgb_setup_rx_resources().
### ixgb_up

```c
int ixgb_up(struct ixgb_adapter *adapter)
{
    struct net_device *netdev = adapter->netdev;
    int err, irq_flags = IRQF_SHARED;
    int max_frame = netdev->mtu + ENET_HEADER_SIZE + ENET_FCS_LENGTH;
    struct ixgb_hw *hw = &adapter->hw;
    ……
    err = request_irq(adapter->pdev->irq, ixgb_intr, irq_flags,
                      netdev->name, netdev);
    ……
    clear_bit(__IXGB_DOWN, &adapter->flags);

    napi_enable(&adapter->napi);
    ixgb_irq_enable(adapter);

    netif_wake_queue(netdev);

    mod_timer(&adapter->watchdog_timer, jiffies);

    return 0;
}
```
### ixgb_intr

```c
static irqreturn_t ixgb_intr(int irq, void *data)
{
    struct net_device *netdev = data;
    struct ixgb_adapter *adapter = netdev_priv(netdev);
    struct ixgb_hw *hw = &adapter->hw;
    u32 icr = IXGB_READ_REG(hw, ICR);
    ……
    if (napi_schedule_prep(&adapter->napi)) {
        /* Mask all further interrupts, then defer the real work
         * (ixgb_clean) to the NET_RX_SOFTIRQ bottom half. */
        IXGB_WRITE_REG(&adapter->hw, IMC, ~0);
        __napi_schedule(&adapter->napi);
    }

    return IRQ_HANDLED;
}
```
### __napi_schedule

```c
void __napi_schedule(struct napi_struct *n)
{
    unsigned long flags;

    local_irq_save(flags);
    ____napi_schedule(&__get_cpu_var(softnet_data), n);
    local_irq_restore(flags);
}
```
```c
static inline void ____napi_schedule(struct softnet_data *sd,
                                     struct napi_struct *napi)
{
    list_add_tail(&napi->poll_list, &sd->poll_list);
    __raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
```
### net_rx_action

net_rx_action() is the NET_RX_SOFTIRQ handler. It walks the per-CPU poll_list and calls each napi_struct's poll callback, bounded both by netdev_budget and by a two-jiffy time limit:

```c
static void net_rx_action(struct softirq_action *h)
{
    struct softnet_data *sd = &__get_cpu_var(softnet_data);
    unsigned long time_limit = jiffies + 2;
    int budget = netdev_budget;
    void *have;

    local_irq_disable();

    while (!list_empty(&sd->poll_list)) {
        struct napi_struct *n;
        int work, weight;

        /* Budget or time limit exhausted: give up and reschedule. */
        if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
            goto softnet_break;

        local_irq_enable();

        n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);

        have = netpoll_poll_lock(n);

        weight = n->weight;

        work = 0;
        if (test_bit(NAPI_STATE_SCHED, &n->state)) {
            work = n->poll(n, weight);
            trace_napi_poll(n);
        }

        WARN_ON_ONCE(work > weight);

        budget -= work;

        local_irq_disable();

        if (unlikely(work == weight)) {
            if (unlikely(napi_disable_pending(n))) {
                local_irq_enable();
                napi_complete(n);
                local_irq_disable();
            } else {
                if (n->gro_list) {
                    local_irq_enable();
                    napi_gro_flush(n, HZ >= 1000);
                    local_irq_disable();
                }
                /* Full weight consumed: rotate to the tail and poll again later. */
                list_move_tail(&n->poll_list, &sd->poll_list);
            }
        }

        netpoll_poll_unlock(have);
    }
out:
    net_rps_action_and_irq_enable(sd);

#ifdef CONFIG_NET_DMA
    dma_issue_pending_all();
#endif

    return;

softnet_break:
    sd->time_squeeze++;
    __raise_softirq_irqoff(NET_RX_SOFTIRQ);
    goto out;
}
```
### ixgb_clean

```c
static int ixgb_clean(struct napi_struct *napi, int budget)
{
    struct ixgb_adapter *adapter = container_of(napi, struct ixgb_adapter, napi);
    int work_done = 0;

    ixgb_clean_tx_irq(adapter);
    ixgb_clean_rx_irq(adapter, &work_done, budget);

    /* If the budget was not fully consumed, leave polling mode and
     * re-enable the device's interrupts. */
    if (work_done < budget) {
        napi_complete(napi);
        if (!test_bit(__IXGB_DOWN, &adapter->flags))
            ixgb_irq_enable(adapter);
    }

    return work_done;
}
```
### ixgb_clean_rx_irq

```c
static bool ixgb_clean_rx_irq(struct ixgb_adapter *adapter, int *work_done,
                              int work_to_do)
{
    struct ixgb_desc_ring *rx_ring = &adapter->rx_ring;
    struct net_device *netdev = adapter->netdev;
    struct pci_dev *pdev = adapter->pdev;
    struct ixgb_rx_desc *rx_desc, *next_rxd;
    struct ixgb_buffer *buffer_info, *next_buffer, *next2_buffer;
    u32 length;
    unsigned int i, j;
    int cleaned_count = 0;
    bool cleaned = false;

    i = rx_ring->next_to_clean;
    rx_desc = IXGB_RX_DESC(*rx_ring, i);
    buffer_info = &rx_ring->buffer_info[i];

    while (rx_desc->status & IXGB_RX_DESC_STATUS_DD) {
        struct sk_buff *skb;
        u8 status;

        if (*work_done >= work_to_do)
            break;

        (*work_done)++;
        rmb();
        status = rx_desc->status;
        skb = buffer_info->skb;
        buffer_info->skb = NULL;

        prefetch(skb->data - NET_IP_ALIGN);

        if (++i == rx_ring->count)
            i = 0;
        next_rxd = IXGB_RX_DESC(*rx_ring, i);
        prefetch(next_rxd);

        j = i + 1;
        if (j == rx_ring->count)
            j = 0;
        next2_buffer = &rx_ring->buffer_info[j];
        prefetch(next2_buffer);

        next_buffer = &rx_ring->buffer_info[i];

        cleaned = true;
        cleaned_count++;

        dma_unmap_single(&pdev->dev,
                         buffer_info->dma,
                         buffer_info->length,
                         DMA_FROM_DEVICE);
        buffer_info->dma = 0;

        length = le16_to_cpu(rx_desc->length);
        rx_desc->length = 0;

        if (unlikely(!(status & IXGB_RX_DESC_STATUS_EOP))) {
            pr_debug("Receive packet consumed multiple buffers length<%x>\n",
                     length);
            dev_kfree_skb_irq(skb);
            goto rxdesc_done;
        }
        ……
        ixgb_check_copybreak(netdev, buffer_info, length, &skb);

        skb_put(skb, length);

        ixgb_rx_checksum(adapter, rx_desc, skb);

        skb->protocol = eth_type_trans(skb, netdev);
        if (status & IXGB_RX_DESC_STATUS_VP)
            __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),
                                   le16_to_cpu(rx_desc->special));

        netif_receive_skb(skb);
        ……
    }
```
### Summary

With NAPI, the interrupt handler uses __napi_schedule() to add the device's napi_struct to the per-CPU poll_list and then raises the bottom half. The bottom half repeatedly calls the driver's poll callback, which processes many skbs per invocation instead of the traditional one interrupt per skb; this is where the networking performance gain comes from.