阅读本文最好有内核网络源码基础。
本文源码基于 Linux 3.10。
Netfilter Netfilter是Linux 2.4.x引入的一个子系统,提供一整套的hook函数的管理机制,使得诸如数据包过滤、网络地址转换(NAT)和基于协议类型的连接跟踪成为了可能。
网络层作为 OSI 七层模型(网络关系图可参考 http://www.52im.net/thread-180-1-1.html )的第三层,其代表协议为IP(Internet Protocol)协议,对应的以太网帧类型(EtherType)为 0x0800。协议处理流程大致如下:
而Netfilter的核心就是在整个网络流程的若干位置放置了一些检测点(HOOK),而在每个检测点上登记了一些处理函数进行处理。
NF_HOOK实现 一个普通的系统(无桥、vlan等配置)中,流量在IPv4的处理函数ip_rcv
最后会走到Netfilter架构的第一个钩子点:
1 2 3 4 5 6 7 int ip_rcv (struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { …… return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, dev, NULL , ip_rcv_finish); …… }
在网络层处理协议中,此种形式的钩子点如图所示:
上图即为Netfilter经典的五个处理点。
深入到NF_HOOK
的内部可发现最重要的结构为内核全局变量nf_hooks:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 static inline int NF_HOOK (uint8_t pf, unsigned int hook, struct sk_buff *skb, struct net_device *in, struct net_device *out, int (*okfn)(struct sk_buff *)) { return NF_HOOK_THRESH(pf, hook, skb, in, out, okfn, INT_MIN); } --->>> static inline int NF_HOOK_THRESH (uint8_t pf, unsigned int hook, struct sk_buff *skb, struct net_device *in, struct net_device *out, int (*okfn)(struct sk_buff *), int thresh) { int ret = nf_hook_thresh(pf, hook, skb, in, out, okfn, thresh); if (ret == 1 ) ret = okfn(skb); return ret; } --->>> static inline int nf_hook_thresh (u_int8_t pf, unsigned int hook, struct sk_buff *skb, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct sk_buff *), int thresh) { if (nf_hooks_active(pf, hook)) return nf_hook_slow(pf, hook, skb, indev, outdev, okfn, thresh); return 1 ; }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 int nf_hook_slow (u_int8_t pf, unsigned int hook, struct sk_buff *skb, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct sk_buff *), int hook_thresh) { struct nf_hook_ops *elem ; unsigned int verdict; int ret = 0 ; rcu_read_lock(); elem = list_entry_rcu(&nf_hooks[pf][hook], struct nf_hook_ops, list ); next_hook: verdict = nf_iterate(&nf_hooks[pf][hook], skb, hook, indev, outdev, &elem, okfn, hook_thresh); ………… if (verdict == NF_ACCEPT || verdict == NF_STOP) { ret = 1 ; } else if ((verdict & NF_VERDICT_MASK) == NF_DROP) { kfree_skb(skb); ret = NF_DROP_GETERR(verdict); if (ret == 0 ) ret = -EPERM; } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) { int err = nf_queue(skb, elem, pf, hook, indev, outdev, okfn, verdict >> NF_VERDICT_QBITS); if (err < 0 ) { if (err == -ECANCELED) goto next_hook; if (err == -ESRCH && (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS)) goto next_hook; kfree_skb(skb); } } rcu_read_unlock(); return ret; }
其中函数nf_iterate
的定义如下:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 unsigned int nf_iterate (struct list_head *head, struct sk_buff *skb, unsigned int hook, const struct net_device *indev, const struct net_device *outdev, struct nf_hook_ops **elemp, int (*okfn)(struct sk_buff *), int hook_thresh) { unsigned int verdict; list_for_each_entry_continue_rcu((*elemp), head, list ) { if (hook_thresh > (*elemp)->priority) continue ; repeat: verdict = (*elemp)->hook(hook, skb, indev, outdev, okfn); if (verdict != NF_ACCEPT) { #ifdef CONFIG_NETFILTER_DEBUG if (unlikely((verdict & NF_VERDICT_MASK) > NF_MAX_VERDICT)) { NFDEBUG("Evil return from %p(%u).\n" , (*elemp)->hook, hook); continue ; } #endif if (verdict != NF_REPEAT) return verdict; goto repeat; } } return NF_ACCEPT; }
核心数据结构 Netfilter中有两个比较重要的数据结构: **nf_hooks**
和**nf_hook_ops
**。
本质上,nf_hooks
是一个以协议族(pf)和钩子点(hooknum)为下标的二维链表头数组(struct list_head),并非哈希表。
1 extern struct list_head nf_hooks [NFPROTO_NUMPROTO ][NF_MAX_HOOKS ];
支持的Netfilter Protocol定义如下:
1 2 3 4 5 6 7 8 9 enum { NFPROTO_UNSPEC = 0 , NFPROTO_IPV4 = 2 , NFPROTO_ARP = 3 , NFPROTO_BRIDGE = 7 , NFPROTO_IPV6 = 10 , NFPROTO_DECNET = 12 , NFPROTO_NUMPROTO, };
Protocol中最大Hook数量为:
nf_hook_ops
的相关定义如下
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 typedef unsigned int nf_hookfn (unsigned int hooknum, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) ;struct nf_hook_ops { struct list_head list ; nf_hookfn *hook; struct module *owner ; u_int8_t pf; unsigned int hooknum; int priority; };
网络层优先级的枚举如下:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 enum nf_ip_hook_priorities { NF_IP_PRI_FIRST = INT_MIN, NF_IP_PRI_CONNTRACK_DEFRAG = -400 , NF_IP_PRI_RAW = -300 , NF_IP_PRI_SELINUX_FIRST = -225 , NF_IP_PRI_CONNTRACK = -200 , NF_IP_PRI_MANGLE = -150 , NF_IP_PRI_NAT_DST = -100 , NF_IP_PRI_FILTER = 0 , NF_IP_PRI_SECURITY = 50 , NF_IP_PRI_NAT_SRC = 100 , NF_IP_PRI_SELINUX_LAST = 225 , NF_IP_PRI_CONNTRACK_HELPER = 300 , NF_IP_PRI_CONNTRACK_CONFIRM = INT_MAX, NF_IP_PRI_LAST = INT_MAX, };
定义的hook
函数指针所指向的函数中,其返回值由以下宏定义(注意:这些是宏,并非枚举):
1 2 3 4 5 6 7 8 #define NF_DROP 0 #define NF_ACCEPT 1 #define NF_STOLEN 2 #define NF_QUEUE 3 #define NF_REPEAT 4 #define NF_STOP 5 #define NF_MAX_VERDICT NF_STOP
注册 Netfilter架构利用函数nf_register_hooks
向nf_hooks
注册各模块定义的nf_hook_ops
数组。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 int nf_register_hooks (struct nf_hook_ops *reg, unsigned int n) { unsigned int i; int err = 0 ; for (i = 0 ; i < n; i++) { err = nf_register_hook(®[i]); if (err) goto err; } return err; …… } --->>> int nf_register_hook (struct nf_hook_ops *reg) { struct nf_hook_ops *elem ; int err; err = mutex_lock_interruptible(&nf_hook_mutex); if (err < 0 ) return err; list_for_each_entry(elem, &nf_hooks[reg->pf][reg->hooknum], list ) { if (reg->priority < elem->priority) break ; } list_add_rcu(®->list , elem->list .prev); mutex_unlock(&nf_hook_mutex); #if defined(CONFIG_JUMP_LABEL) static_key_slow_inc(&nf_hooks_needed[reg->pf][reg->hooknum]); #endif return 0 ; }
以bridge
模块举例:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 static struct nf_hook_ops br_nf_ops [] __read_mostly = { { .hook = br_nf_pre_routing, .owner = THIS_MODULE, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_PRE_ROUTING, .priority = NF_BR_PRI_BRNF, }, { .hook = br_nf_local_in, .owner = THIS_MODULE, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_LOCAL_IN, .priority = NF_BR_PRI_BRNF, }, ………… };
由其定义可看出:
模块中一般定义一个nf_hook_ops结构的弹性数组。 本模块也可以注册其他协议族(pf)的钩子。 注册时按 pf-hooknum 定位链表,再按 priority 决定插入位置。 1 2 ret = nf_register_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops));
Iptables Netfilter是内核的一种网络架构,而iptables
是netfilter
的用户态配置程序。
既然是一种配置程序,那么下刷的数据最终是要配置到netfilter
架构——nf_hooks
中生效。因此,按照上面的步骤继续分析。
对于开启了iptables功能的系统
可见iptables
作为模块存于kernel中,而模块ip_tables
作为基础模块由其他五个模块(kernel version不同,可能缺少iptable_security
,模块命名方式:iptable_表名,如 iptable_filter)引用。
其中模块ip_tables
为承接模块,承接user/kernel的信息交互 。而其他五个模块主要提供不同的(iptables 表)类型到nf_hooks
的映射。
模块间的交互 以iptable_filter
为例,分析一下配置下刷到nf_hooks
。
从模块初始化函数分析:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 static int __init iptable_filter_init (void ) { int ret; ret = register_pernet_subsys(&iptable_filter_net_ops); if (ret < 0 ) return ret; filter_ops = xt_hook_link(&packet_filter, iptable_filter_hook); if (IS_ERR(filter_ops)) { ret = PTR_ERR(filter_ops); unregister_pernet_subsys(&iptable_filter_net_ops); } return ret; }
register_pernet_subsys 对网络命名空间的数据注册。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 static struct pernet_operations iptable_filter_net_ops = { .init = iptable_filter_net_init, .exit = iptable_filter_net_exit, }; ret = register_pernet_subsys(&iptable_filter_net_ops); --->>> int register_pernet_subsys (struct pernet_operations *ops) { int error; mutex_lock(&net_mutex); error = register_pernet_operations(first_device, ops); mutex_unlock(&net_mutex); return error; } --->>> static int register_pernet_operations (struct list_head *list , struct pernet_operations *ops) { ………… error = __register_pernet_operations(list , ops); ………… } --->>> static int __register_pernet_operations(struct list_head *list , struct pernet_operations *ops) { struct net *net ; int error; LIST_HEAD(net_exit_list); list_add_tail(&ops->list , list ); if (ops->init || (ops->id && ops->size)) { for_each_net(net) { error = ops_init(ops, net); if (error) goto out_undo; list_add_tail(&net->exit_list, &net_exit_list); } } return 0 ; ………… }
因此,再来看一下iptable_filter
模块的init函数内部流程,不过之前需要熟悉一下其用到的数据。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 #define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \ (1 << NF_INET_FORWARD) | \ (1 << NF_INET_LOCAL_OUT)) static const struct xt_table packet_filter = { .name = "filter" , .valid_hooks = FILTER_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV4, .priority = NF_IP_PRI_FILTER, };
正式开始分析初始化函数
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 static int __net_init iptable_filter_net_init (struct net *net) { struct ipt_replace *repl ; repl = ipt_alloc_initial_table(&packet_filter); if (repl == NULL ) return -ENOMEM; ((struct ipt_standard *)repl->entries)[1 ].target.verdict = forward ? -NF_ACCEPT - 1 : -NF_DROP - 1 ; net->ipv4.iptable_filter = ipt_register_table(net, &packet_filter, repl); kfree(repl); return PTR_RET(net->ipv4.iptable_filter); }
第一步:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 void *ipt_alloc_initial_table (const struct xt_table *info) { return xt_alloc_initial_table(ipt, IPT); } ######################################################## void *ipt_alloc_initial_table(const struct xt_table *info) { unsigned int hook_mask = info->valid_hooks; unsigned int nhooks = hweight32(hook_mask); unsigned int bytes = 0 , hooknum = 0 , i = 0 ; struct { struct ipt_replace repl ; struct ipt_standard entries [nhooks ]; struct ipt_error term ; } *tbl = kzalloc(sizeof (*tbl), GFP_KERNEL); if (tbl == NULL ) return NULL ; strncpy (tbl->repl.name, info->name, sizeof (tbl->repl.name)); tbl->term = (struct ipt_error)IPT_ERROR_INIT; tbl->repl.valid_hooks = hook_mask; tbl->repl.num_entries = nhooks + 1 ; tbl->repl.size = nhooks * sizeof (struct ipt_standard) + sizeof (struct ipt_error); for (; hook_mask != 0 ; hook_mask >>= 1 , ++hooknum) { if (!(hook_mask & 1 )) continue ; tbl->repl.hook_entry[hooknum] = bytes; tbl->repl.underflow[hooknum] = bytes; tbl->entries[i++] = (struct ipt_standard) IPT_STANDARD_INIT(NF_ACCEPT); bytes += sizeof (struct ipt_standard); } return tbl; } --->>> #define IPT_STANDARD_INIT(__verdict) \ { \ .entry = IPT_ENTRY_INIT(sizeof(struct ipt_standard)), \ .target = XT_TARGET_INIT(XT_STANDARD_TARGET, \ sizeof(struct xt_standard_target)), \ .target.verdict = -(__verdict) - 1, \ }
第二步:
1 2 3 4 ((struct ipt_standard *)repl->entries)[1 ].target.verdict = forward ? -NF_ACCEPT - 1 : -NF_DROP - 1 ;
第三步:
可直接跳到 第四步。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 net->ipv4.iptable_filter = ipt_register_table(net, &packet_filter, repl); --->>> struct xt_table *ipt_register_table (struct net *net, const struct xt_table *table, const struct ipt_replace *repl) { int ret; struct xt_table_info *newinfo ; struct xt_table_info bootstrap = {0 }; void *loc_cpu_entry; struct xt_table *new_table ; newinfo = xt_alloc_table_info(repl->size); if (!newinfo) { ret = -ENOMEM; goto out; } loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; memcpy (loc_cpu_entry, repl->entries, repl->size); ret = translate_table(net, newinfo, loc_cpu_entry, repl); if (ret != 0 ) goto out_free; new_table = xt_register_table(net, table, &bootstrap, newinfo); if (IS_ERR(new_table)) { ret = PTR_ERR(new_table); goto out_free; } return new_table; out_free: xt_free_table_info(newinfo); out: return ERR_PTR(ret); }
第四步:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 struct xt_table_info *xt_alloc_table_info (unsigned int size) { struct xt_table_info *newinfo ; int cpu; if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > totalram_pages) return NULL ; newinfo = kzalloc(XT_TABLE_INFO_SZ, GFP_KERNEL); if (!newinfo) return NULL ; newinfo->size = size; for_each_possible_cpu(cpu) { if (size <= PAGE_SIZE) newinfo->entries[cpu] = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu)); else newinfo->entries[cpu] = vmalloc_node(size, cpu_to_node(cpu)); if (newinfo->entries[cpu] == NULL ) { xt_free_table_info(newinfo); return NULL ; } } return newinfo; }
第五步:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 static int translate_table (struct net *net, struct xt_table_info *newinfo, void *entry0, const struct ipt_replace *repl) { struct ipt_entry *iter ; unsigned int i; int ret = 0 ; newinfo->size = repl->size; newinfo->number = repl->num_entries; for (i = 0 ; i < NF_INET_NUMHOOKS; i++) { newinfo->hook_entry[i] = 0xFFFFFFFF ; newinfo->underflow[i] = 0xFFFFFFFF ; } duprintf("translate_table: size %u\n" , newinfo->size); i = 0 ; xt_entry_foreach(iter, entry0, newinfo->size) { ret = check_entry_size_and_hooks(iter, newinfo, entry0, entry0 + repl->size, repl->hook_entry, repl->underflow, repl->valid_hooks); if (ret != 0 ) return ret; ++i; if (strcmp (ipt_get_target(iter)->u.user.name, XT_ERROR_TARGET) == 0 ) ++newinfo->stacksize; } if (i != repl->num_entries) { duprintf("translate_table: %u not %u entries\n" , i, repl->num_entries); return -EINVAL; } for (i = 0 ; i < NF_INET_NUMHOOKS; i++) { if (!(repl->valid_hooks & (1 << i))) continue ; if (newinfo->hook_entry[i] == 0xFFFFFFFF ) { duprintf("Invalid hook entry %u %u\n" , i, repl->hook_entry[i]); return -EINVAL; } if (newinfo->underflow[i] == 0xFFFFFFFF ) { duprintf("Invalid underflow %u %u\n" , i, repl->underflow[i]); return -EINVAL; } } if (!mark_source_chains(newinfo, repl->valid_hooks, entry0)) return -ELOOP; i = 0 ; xt_entry_foreach(iter, entry0, newinfo->size) { ret = find_check_entry(iter, net, repl->name, repl->size); if (ret != 0 ) break ; ++i; } if (ret != 0 ) { xt_entry_foreach(iter, entry0, newinfo->size) { if (i-- == 0 ) break ; cleanup_entry(iter, net); } return ret; } for_each_possible_cpu(i) { if (newinfo->entries[i] && newinfo->entries[i] != entry0) memcpy (newinfo->entries[i], entry0, newinfo->size); } return ret; }
第六步:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 struct xt_table *xt_register_table (struct net *net, const struct xt_table *input_table, struct xt_table_info *bootstrap, struct xt_table_info *newinfo) { int ret; struct xt_table_info *private ; struct xt_table *t , *table ; table = kmemdup(input_table, sizeof (struct xt_table), GFP_KERNEL); if (!table) { ret = -ENOMEM; goto out; } ret = mutex_lock_interruptible(&xt[table->af].mutex); if (ret != 0 ) goto out_free; list_for_each_entry(t, &net->xt.tables[table->af], list ) { if (strcmp (t->name, table->name) == 0 ) { ret = -EEXIST; goto unlock; } } table->private = bootstrap; if (!xt_replace_table(table, 0 , newinfo, &ret)) goto unlock; private = table->private ; pr_debug("table->private->number = %u\n" , private ->number); private ->initial_entries = private ->number; list_add(&table->list , &net->xt.tables[table->af]); mutex_unlock(&xt[table->af].mutex); return table; unlock: mutex_unlock(&xt[table->af].mutex); out_free: kfree(table); out: return ERR_PTR(ret); }
xt_hook_link iptables 和 内核数据结构进行关联
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 filter_ops = xt_hook_link(&packet_filter, iptable_filter_hook); -->> struct nf_hook_ops *xt_hook_link (const struct xt_table *table, nf_hookfn *fn) { unsigned int hook_mask = table->valid_hooks; uint8_t i, num_hooks = hweight32(hook_mask); uint8_t hooknum; struct nf_hook_ops *ops ; int ret; ops = kmalloc(sizeof (*ops) * num_hooks, GFP_KERNEL); if (ops == NULL ) return ERR_PTR(-ENOMEM); for (i = 0 , hooknum = 0 ; i < num_hooks && hook_mask != 0 ; hook_mask >>= 1 , ++hooknum) { if (!(hook_mask & 1 )) continue ; ops[i].hook = fn; ops[i].owner = table->me; ops[i].pf = table->af; ops[i].hooknum = hooknum; ops[i].priority = table->priority; ++i; } ret = nf_register_hooks(ops, num_hooks); if (ret < 0 ) { kfree(ops); return ERR_PTR(ret); } return ops; } -->> int nf_register_hooks (struct nf_hook_ops *reg, unsigned int n) { unsigned int i; int err = 0 ; for (i = 0 ; i < n; i++) { err = nf_register_hook(®[i]); if (err) goto err; } return err; …… } -->> int nf_register_hook (struct nf_hook_ops *reg) { struct nf_hook_ops *elem ; int err; err = mutex_lock_interruptible(&nf_hook_mutex); if (err < 0 ) return err; list_for_each_entry(elem, &nf_hooks[reg->pf][reg->hooknum], list ) { if (reg->priority < elem->priority) break ; } list_add_rcu(®->list , elem->list .prev); mutex_unlock(&nf_hook_mutex); #if defined(CONFIG_JUMP_LABEL) static_key_slow_inc(&nf_hooks_needed[reg->pf][reg->hooknum]); #endif return 0 ; }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 static unsigned int iptable_filter_hook (unsigned int hook, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { const struct net *net ; if (hook == NF_INET_LOCAL_OUT && (skb->len < sizeof (struct iphdr) || ip_hdrlen(skb) < sizeof (struct iphdr))) return NF_ACCEPT; net = dev_net((in != NULL ) ? in : out); return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_filter); }
模块内的交互 所有关于netfilter的user-kernel交互,注册都在ip_tables模块。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 ret = nf_register_sockopt(&ipt_sockopts); -->> static int do_ipt_set_ctl (struct sock *sk, int cmd, void __user *user, unsigned int len) { …… switch (cmd) { case IPT_SO_SET_REPLACE: ret = do_replace(sock_net(sk), user, len); break ; ………… } return ret; } -->> static int do_replace (struct net *net, const void __user *user, unsigned int len) { int ret; struct ipt_replace tmp ; struct xt_table_info *newinfo ; void *loc_cpu_entry; struct ipt_entry *iter ; if (copy_from_user(&tmp, user, sizeof (tmp)) != 0 ) return -EFAULT; if (tmp.num_counters >= INT_MAX / sizeof (struct xt_counters)) return -ENOMEM; if (tmp.num_counters == 0 ) return -EINVAL; tmp.name[sizeof (tmp.name)-1 ] = 0 ; newinfo = xt_alloc_table_info(tmp.size); if (!newinfo) return -ENOMEM; loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; if (copy_from_user(loc_cpu_entry, user + sizeof (tmp), tmp.size) != 0 ) { ret = -EFAULT; goto free_newinfo; } ret = translate_table(net, newinfo, loc_cpu_entry, &tmp); if (ret != 0 ) goto free_newinfo; duprintf("Translated table\n" ); ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, tmp.num_counters, tmp.counters); ………… } -->> static int __do_replace(struct net *net, const char *name, unsigned int valid_hooks, struct xt_table_info *newinfo, unsigned int num_counters, void __user *counters_ptr) { int ret; struct xt_table *t ; struct xt_table_info *oldinfo ; struct xt_counters *counters ; void *loc_cpu_old_entry; struct ipt_entry *iter ; ret = 0 ; counters = vzalloc(num_counters * sizeof (struct xt_counters)); if (!counters) { ret = -ENOMEM; goto out; } t = try_then_request_module(xt_find_table_lock(net, AF_INET, name), "iptable_%s" , name); if (IS_ERR_OR_NULL(t)) { ret = t ? 
PTR_ERR(t) : -ENOENT; goto free_newinfo_counters_untrans; } if (valid_hooks != t->valid_hooks) { duprintf("Valid hook crap: %08X vs %08X\n" , valid_hooks, t->valid_hooks); ret = -EINVAL; goto put_module; } oldinfo = xt_replace_table(t, num_counters, newinfo, &ret); if (!oldinfo) goto put_module; ………… }
可以简单理解为:
内核每个命名空间注册时:nf_hooks 与 net->ipv4.iptable_filter 做关联,iptable_filter 与 xt.tables 做关联 用户态进行更新时:更新相应的xt.tables
优秀资料 iptables详解(1):iptables概念
Linux 防火墙在内核中的实现
netfilter/iptables 简介