《Linux-netfilter源码分析教程》pdf电子书免费下载

下载方式一：

百度网盘下载地址：https://pan.baidu.com/s/1l7aKJnst3Zu3mqhDQpet6g

百度网盘密码：1111

下载方式二：

http://ziliaoshare.cn/Download/ad_123563_do_Linux-netfilterYMFXJC.zip

作者：empty

出版社：empty

《Linux-netfilter源码分析教程》介绍

Linux netfilter源码分析

内容基本上来自两篇文章:

《Netfilter源码分析》—（独孤九贱http://www.skynet.org.cn/index.php）

《Linux Netfilter实现机制和扩展技术》——（杨沙洲国防科技大学计算机学院）

一、 IP报文的接收到hook函数的调用

1.1 ip_input.c ip_rcv()函数

以接收到的报文为例，类似的还有ip_forward(ip_forward.c)和ip_output(ip_output.c)

/*
 * ip_rcv - receive path entry for an IPv4 packet.
 *
 * Checks the IP header (minimum length, version, checksum, total length),
 * trims the buffer to the length announced in the header and hands the
 * packet to the NF_IP_PRE_ROUTING hook with ip_rcv_finish as continuation.
 *
 * Returns the value of NF_HOOK() on the success path and NET_RX_DROP when
 * the packet is rejected.
 */
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
    struct iphdr *iph;  /* IP header of the received packet */
    u32 len;

    /* The packet is not addressed to us. */
    if (skb->pkt_type == PACKET_OTHERHOST)
        goto drop;

    IP_INC_STATS_BH(IPSTATS_MIB_INRECEIVES);  /* one more packet received */

    /*
     * If the buffer is shared, skb_share_check() hands back a private copy
     * (per the original note, the copy is no longer tied to the socket).
     * A NULL result is counted as a discard; the skb is not freed here.
     */
    if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) {
        IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
        goto out;
    }

    /* Make sure at least a minimal IP header is present. */
    if (!pskb_may_pull(skb, sizeof(struct iphdr)))
        goto inhdr_error;

    iph = skb->nh.iph;  /* locate the IP header */

    /*
     * ihl counts 4-byte words, so 5 means the minimal 20-byte header.
     * Reject short headers and anything that is not IPv4.
     */
    if (iph->ihl < 5 || iph->version != 4)
        goto inhdr_error;

    /* Now require the full header, options included. */
    if (!pskb_may_pull(skb, iph->ihl*4))
        goto inhdr_error;

    /* Verify the header checksum. */
    if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
        goto inhdr_error;

    len = ntohs(iph->tot_len);

    /*
     * The buffer must hold at least tot_len bytes, and tot_len can never
     * be smaller than the header itself.
     */
    if (skb->len < len || len < (iph->ihl*4))
        goto inhdr_error;

    /* Trim the buffer down to the length stated in the IP header. */
    if (pskb_trim_rcsum(skb, len)) {
        IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
        goto drop;
    }

    /* Run the PRE_ROUTING hooks; ip_rcv_finish is the okfn callback. */
    return NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, dev, NULL,
                   ip_rcv_finish);

inhdr_error:
    IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
drop:
    kfree_skb(skb);  /* discard the packet */
out:
    return NET_RX_DROP;
}

1.2 include/linux/netfilter.h NF_HOOK宏

/*
 * NF_HOOK / NF_HOOK_THRESH - run the netfilter hooks registered for
 * (pf, hook) and then the continuation okfn.
 *
 * With CONFIG_NETFILTER_DEBUG every packet goes through nf_hook_slow().
 * Otherwise an empty hook list short-circuits straight to okfn(skb).
 * Each multi-line definition needs a trailing backslash on every line but
 * the last; without them only the first line belongs to the macro.
 */
#ifdef CONFIG_NETFILTER_DEBUG
#define NF_HOOK(pf, hook, skb, indev, outdev, okfn)                           \
    nf_hook_slow((pf), (hook), (skb), (indev), (outdev), (okfn), INT_MIN)
#define NF_HOOK_THRESH nf_hook_slow
#else
#define NF_HOOK(pf, hook, skb, indev, outdev, okfn)                           \
(list_empty(&nf_hooks[(pf)][(hook)])                                          \
 ? (okfn)(skb)                                                                \
 : nf_hook_slow((pf), (hook), (skb), (indev), (outdev), (okfn), INT_MIN))
#define NF_HOOK_THRESH(pf, hook, skb, indev, outdev, okfn, thresh)            \
(list_empty(&nf_hooks[(pf)][(hook)])                                          \
 ? (okfn)(skb)                                                                \
 : nf_hook_slow((pf), (hook), (skb), (indev), (outdev), (okfn), (thresh)))
#endif

/* 如果nf_hooks[pf][hook]所指向的链表为空（即该钩子上没有挂处理函数，例如上文ip_rcv中的nf_hooks[PF_INET][NF_IP_PRE_ROUTING]），则直接调用okfn；否则调用net/core/netfilter.c::nf_hook_slow()转入Netfilter的处理。 */

1.3 net/core/netfilter.c nf_hook_slow()函数

/*
 * nf_hook_slow - walk the hook list for (pf, hook) and act on the verdict.
 *
 * Returns 1 when the verdict is NF_ACCEPT or NF_STOP (per the original note,
 * 1 tells the caller to go on and invoke okfn), -EPERM when the packet was
 * dropped (the skb is freed here), and 0 otherwise.
 */
int nf_hook_slow(int pf, unsigned int hook, struct sk_buff **pskb,
                 struct net_device *indev,
                 struct net_device *outdev,
                 int (*okfn)(struct sk_buff *),
                 int hook_thresh)
{
    struct list_head *elem;
    unsigned int verdict;
    int ret = 0;

    rcu_read_lock();

    /* Start from the head of the matching hook list. */
    elem = &nf_hooks[pf][hook];

next_hook:
    /* Call the hook functions; elem tracks how far the walk has got. */
    verdict = nf_iterate(&nf_hooks[pf][hook], pskb, hook, indev,
                         outdev, &elem, okfn, hook_thresh);

    /* Act on the verdict. */
    if (verdict == NF_ACCEPT || verdict == NF_STOP) {
        ret = 1;
        goto unlock;
    } else if (verdict == NF_DROP) {
        kfree_skb(*pskb);  /* dropping the packet means freeing the skb */
        ret = -EPERM;
    } else if (verdict == NF_QUEUE) {
        NFDEBUG("nf_hook: Verdict = QUEUE.\n");
        /* A zero return from nf_queue() resumes the walk after elem. */
        if (!nf_queue(*pskb, elem, pf, hook, indev, outdev, okfn))
            goto next_hook;
    }

unlock:
    rcu_read_unlock();
    return ret;
}

1.4 net/core/netfilter.c nf_iterate()函数

static unsigned int nf_iterate(struct list_head *head,

struct sk_buff **skb,

int hook,

const struct net_device *indev,

const struct net_device *outdev,

struct list_head **i,

int (*okfn)(struct sk_buff *),

int hook_thresh)

{

* The caller must not block between calls to this

* function because of risk of continuing from deleted element.

/* 依次调用指定hook点下的所有nf_hook_ops->(*hook)函数，这些nf_hook_ops里有filter表注册的，有mangle表注册的，等等。

list_for_each_continue_rcu函数是一个for循环的宏，当调用结点中的hook函数后，根据返回值进行相应处理。如果hook函数的返回值是NF_QUEUE,NF_STOLEN,NF_DROP时，函数返回该值；如果返回值是NF_REPEAT时，则跳到前一个结点继续处理；如果是其他值，由下一个结点继续处理。如果整条链表处理完毕，返回值不是上面四个值，则返回NF_ACCEPT。*/

list_for_each_continue_rcu(*i, head) {

struct nf_hook_ops *elem = (struct nf_hook_ops *)*i;

if (hook_thresh > elem->priority)

continue;

switch (elem->hook(hook, skb, indev, outdev, okfn)) {

case NF_QUEUE:

return NF_QUEUE;

case NF_STOLEN:

return NF_STOLEN;

case NF_DROP:

return NF_DROP;

case NF_REPEAT:

*i = (*i)->prev;

break;

}

return NF_ACCEPT;

}

二、ipt_table数据结构和表的初始化

2.1 include/linux/netfilter_ipv4/ip_tables.h struct ipt_table 表结构

/* Describes one iptables table (e.g. filter, nat). */
struct ipt_table
{
    struct list_head list;
    /* Links the table into the list of tables. */

    char name[IPT_TABLE_MAXNAMELEN];
    /* Table name such as "filter" or "nat". To support automatic module
       loading, the module containing the table should be named
       iptable_'name'.o. */

    struct ipt_replace *table;
    /* Table template; initially initial_table.repl. */

    unsigned int valid_hooks;
    /* Bit vector of the hooks this table affects. */

    rwlock_t lock;
    /* Read/write lock; initially unlocked. */

    struct ipt_table_info *private;
    /* The table's actual data area (struct ipt_table_info). */

    struct module *me;
    /* Per the original note: whether the table is defined in a module. */
};

2.2 struct ipt_table_info是实际描述表的数据结构 ip_tables.c

/* The data structure that actually describes a table's rule set. */
struct ipt_table_info
{
    unsigned int size;
    /* Size of the table. */

    unsigned int number;
    /* Number of rules in the table. */

    unsigned int initial_entries;
    /* Initial number of rules; used for module counting. */

    unsigned int hook_entry[NF_IP_NUMHOOKS];
    /* Per hook: offset of that hook's first rule, relative to entries below. */

    unsigned int underflow[NF_IP_NUMHOOKS];
    /* Per hook: upper-bound offset matching hook_entry. Per the original
       note, both are 0 while no rules have been entered. */

    char entries[0] ____cacheline_aligned;
    /* Start of the rule storage. */
};

2.3 include/linux/netfilter_ipv4 规则用struct ipt_entry结构表示，包含匹配用的IP头部分、一个Target和0个或多个Match。由于Match数不定，所以一条规则实际的占用空间是可变的。结构定义如下

/*
 * One rule. The fixed header below is followed by zero or more matches and
 * one target, so the space a rule occupies is variable.
 */
struct ipt_entry
{
    struct ipt_ip ip;
    /* IP header fields the packet is matched against. */

    unsigned int nfcache;
    /* Bit vector of the packet parts this rule cares about; per the
       original note, not used yet. */

    u_int16_t target_offset;
    /* Offset of the target area. The target normally follows the matches,
       which start at the end of ipt_entry. Initialised to
       sizeof(struct ipt_entry), i.e. assuming no matches. */

    u_int16_t next_offset;
    /* Offset from this rule to the next one, i.e. the total space this rule
       uses. Initialised to sizeof(struct ipt_entry) +
       sizeof(struct ipt_target), i.e. no matches. */

    unsigned int comefrom;
    /* Return point of the rule; records the hook that invoked it and can be
       used to check the rule's validity. */

    struct ipt_counters counters;
    /* Packet and byte counts handled by this rule. */

    unsigned char elems[0];
    /* Start of the matches (if any) and the target. */
};

2.4 iptables的初始化init(void) ，以filter表为例 iptable_filter.c

/*
 * Module init for the filter table: validates the `forward` parameter,
 * registers the table and then its three hooks. On failure, everything
 * registered so far is unwound in reverse order through the cleanup labels.
 *
 * Returns 0 on success or a negative error code.
 */
static int __init init(void)
{
    int ret;

    if (forward < 0 || forward > NF_MAX_VERDICT) {
        printk("iptables forward must be 0 or 1\n");
        return -EINVAL;
    }

    /* Entry 1 is the FORWARD hook */
    initial_table.entries[1].target.verdict = -forward - 1;

    /* Register table */
    ret = ipt_register_table(&packet_filter);
    if (ret < 0)
        return ret;

    /* Register hooks (three of them) */
    ret = nf_register_hook(&ipt_ops[0]);
    if (ret < 0)
        goto cleanup_table;

    ret = nf_register_hook(&ipt_ops[1]);
    if (ret < 0)
        goto cleanup_hook0;

    ret = nf_register_hook(&ipt_ops[2]);
    if (ret < 0)
        goto cleanup_hook1;

    return ret;

cleanup_hook1:
    nf_unregister_hook(&ipt_ops[1]);
cleanup_hook0:
    nf_unregister_hook(&ipt_ops[0]);
cleanup_table:
    ipt_unregister_table(&packet_filter);

    return ret;
}

/* ipt_register_table函数的参数packet_filter包含了待注册表的各个参数 */

/*
 * The filter table as handed to ipt_register_table(). The name must be a
 * string literal because ipt_table.name is a char array.
 */
static struct ipt_table packet_filter = {
    .name        = "filter",
    .table       = &initial_table.repl,
    .valid_hooks = FILTER_VALID_HOOKS,
    .lock        = RW_LOCK_UNLOCKED,
    .me          = THIS_MODULE
};

/* 上面的&initial_table.repl是一个ipt_replace结构，也就是ipt_table-〉*table的初始值。

下面是ipt_replace结构的定义，它和ipt_table_info很相似，基本上就是用来初始化ipt_table中的ipt_table_info *private的，这个结构不同于ipt_table_info之处在于，它还要保存表的旧的规则信息 */

/*
 * Template used to initialise or replace a table's rule set. It resembles
 * ipt_table_info; per the original note it additionally carries the
 * information needed to hand back the old rules' counters.
 */
struct ipt_replace
{
    char name[IPT_TABLE_MAXNAMELEN]; /* table name */

    unsigned int valid_hooks; /* hooks affected */

    unsigned int num_entries; /* number of entries */

    unsigned int size; /* total size of the entries */

    unsigned int hook_entry[NF_IP_NUMHOOKS]; /* offsets of each hook's first rule */

    unsigned int underflow[NF_IP_NUMHOOKS]; /* upper-bound offsets of the rules */

    unsigned int num_counters; /* number of rules (counters) */

    struct ipt_counters __user *counters;

    struct ipt_entry entries[0]; /* start of the rules */
};

/* 下面是initial_table.repl的初始化 */

/*
 * Initial contents of the filter table: one standard (ACCEPT) rule for each
 * of LOCAL_IN, FORWARD and LOCAL_OUT, followed by the terminating ERROR
 * rule. hook_entry and underflow both point at the single rule of each hook.
 *
 * The ipt_entry initialisers are positional and follow the field order of
 * struct ipt_entry: ip, nfcache, target_offset, next_offset, comefrom,
 * counters, elems. The string members (table name, interface names, target
 * names) must be string literals.
 */
static struct
{
    struct ipt_replace repl;
    struct ipt_standard entries[3];
    struct ipt_error term;
} initial_table __initdata
= { { "filter", FILTER_VALID_HOOKS, 4,
      sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error),
      { [NF_IP_LOCAL_IN] = 0,
        [NF_IP_FORWARD] = sizeof(struct ipt_standard),
        [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 },
      { [NF_IP_LOCAL_IN] = 0,
        [NF_IP_FORWARD] = sizeof(struct ipt_standard),
        [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 },
      0, NULL, { } },
    {
        /* LOCAL_IN */
        { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
            0,
            sizeof(struct ipt_entry),
            sizeof(struct ipt_standard),
            0, { 0, 0 }, { } },
          { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
            -NF_ACCEPT - 1 } },
        /* FORWARD */
        { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
            0,
            sizeof(struct ipt_entry),
            sizeof(struct ipt_standard),
            0, { 0, 0 }, { } },
          { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
            -NF_ACCEPT - 1 } },
        /* LOCAL_OUT */
        { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
            0,
            sizeof(struct ipt_entry),
            sizeof(struct ipt_standard),
            0, { 0, 0 }, { } },
          { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
            -NF_ACCEPT - 1 } }
    },
    /* ERROR */
    { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
        0,
        sizeof(struct ipt_entry),
        sizeof(struct ipt_error),
        0, { 0, 0 }, { } },
      { { { { IPT_ALIGN(sizeof(struct ipt_error_target)), IPT_ERROR_TARGET } },
          { } },
        "ERROR"
      }
    }
};

三、ipt_table表的注册

init（）函数初始化时调用了ipt_register_table函数进行表的注册

3.1 ip_tables.c 表的注册 ipt_register_table

/*
 * ipt_register_table - register a table with ip_tables.
 *
 * Allocates per-CPU rule storage, copies the template rules in, validates
 * them with translate_table(), and then, under ipt_mutex, installs the new
 * info through replace_table() and links the table into ipt_tables.
 *
 * Returns 0 on success, -ENOMEM if the allocation fails, -EEXIST if a table
 * of the same name is already registered, or the error reported by
 * translate_table(), down_interruptible() or replace_table().
 */
int ipt_register_table(struct ipt_table *table)
{
    int ret;
    struct ipt_table_info *newinfo;
    static struct ipt_table_info bootstrap
        = { 0, 0, 0, { 0 }, { 0 }, { } };

    /* Bump the module use count so the module cannot be removed while the
       table is registered; MOD_DEC_USE_COUNT undoes it on every error path. */
    MOD_INC_USE_COUNT;

    /* Allocate rule space for every CPU. */
    newinfo = vmalloc(sizeof(struct ipt_table_info)
                      + SMP_ALIGN(table->table->size) * smp_num_cpus);
    if (!newinfo) {
        ret = -ENOMEM;
        MOD_DEC_USE_COUNT;
        return ret;
    }

    /* Copy the template rules into the first CPU's area. */
    memcpy(newinfo->entries, table->table->entries, table->table->size);

    /* Bounds-check every rule, fill in hook_entry/underflow in newinfo and
       replicate the rules to the other CPUs' areas. */
    ret = translate_table(table->name, table->valid_hooks,
                          newinfo, table->table->size,
                          table->table->num_entries,
                          table->table->hook_entry,
                          table->table->underflow);
    if (ret != 0) {
        vfree(newinfo);
        MOD_DEC_USE_COUNT;
        return ret;
    }

    ret = down_interruptible(&ipt_mutex);
    if (ret != 0) {
        vfree(newinfo);
        MOD_DEC_USE_COUNT;
        return ret;
    }

    /* A table of this name already exists: free and drop the use count. */
    /* Don't autoload: we'd eat our tail... */
    if (list_named_find(&ipt_tables, table->name)) {
        ret = -EEXIST;
        goto free_unlock;
    }

    /* Install the new info. Pointing private at the all-zero bootstrap
       first simplifies replace_table code. */
    table->private = &bootstrap;
    if (!replace_table(table, 0, newinfo, &ret))
        goto free_unlock;

    duprintf("table->private->number = %u\n",
             table->private->number);

    /* save number of initial entries */
    table->private->initial_entries = table->private->number;

    table->lock = RW_LOCK_UNLOCKED;

    /* Add the table to the list of tables. */
    list_prepend(&ipt_tables, table);

unlock:
    up(&ipt_mutex);
    return ret;

free_unlock:
    vfree(newinfo);
    MOD_DEC_USE_COUNT;
    goto unlock;
}

3.2 ip_tables.c translate_table()函数

/* 函数:translate_table()

* 参数：

* name:表名称；

* valid_hooks：当前表所影响的hook

* newinfo：包含当前表的所有信息的结构

* size：表的大小

* number：表中的规则数

* hook_entries：记录所影响的HOOK的规则入口相对于下面的entries变量的偏移量

* underflows：与hook_entry相对应的规则表上限偏移量

* 作用：

* translate_table函数将newinfo表示的table的各个规则进行边界检查，然后对于newinfo所指的ipt_talbe_info结构中的hook_entries和underflows赋予正确的值，最后将表项向其他cpu拷贝

* 返回值：

* int ret==0表示成功返回

static int

translate_table(const char *name,

unsigned int valid_hooks,

struct ipt_table_info *newinfo,

unsigned int size,

unsigned int number,

const unsigned int *hook_entries,

const unsigned int *underflows)

{

unsigned int i;

int ret;

newinfo->size = size;

newinfo->number = number;

/* 初始化所有Hooks为不可能的值. */

for (i = 0; i NF_IP_NUMHOOKS; i++) {

newinfo->hook_entry[i] = 0xFFFFFFFF;

newinfo->underflow[i] = 0xFFFFFFFF;

}

duprintf( translate_table: size %u n , newinfo->size);

i = 0;

/* 遍历所有规则，检查所有偏量，检查的工作都是由IPT_ENTRY_ITERATE这个宏来完成，并且它的最后一个参数i，返回表的所有规则数. */

ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,

check_entry_size_and_hooks,

newinfo,

newinfo->entries,

newinfo->entries + size,

hook_entries, underflows, &i);

if (ret != 0)

return ret;

/*实际计算得到的规则数与指定的不符*/

if (i != number) {

duprintf( translate_table: %u not %u entries n ,

i, number);

return -EINVAL;

}

/* 因为函数一开始将HOOK的偏移地址全部初始成了不可能的值，而在上一个宏的遍历中设置了hook_entries和underflows的值，这里对它们进行检查 */

for (i = 0; i NF_IP_NUMHOOKS; i++) {

/* 只检查当前表所影响的hook */

if (!(valid_hooks & (1 i)))

continue;

if (newinfo->hook_entry[i] == 0xFFFFFFFF) {

duprintf( Invalid hook entry %u %u n ,

i, hook_entries[i]);

return -EINVAL;

}

if (newinfo->underflow[i] == 0xFFFFFFFF) {

duprintf( Invalid underflow %u %u n ,

i, underflows[i]);

return -EINVAL;

}

/*确保新的table中不存在规则环*/

if (!mark_source_chains(newinfo, valid_hooks))

return -ELOOP;

/* 对tables中的规则项进行完整性检查，保证每一个规则项在形式上是合法的*/

i = 0;

ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,

check_entry, name, size, &i);

/*检查失败，释放空间，返回*/

if (ret != 0) {

IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,

cleanup_entry, &i);

return ret;

}

/* 为每个CPU复制一个完整的table项*/

for (i = 1; i smp_num_cpus; i++) {

memcpy(newinfo->entries + SMP_ALIGN(newinfo->size)*i,

newinfo->entries,

SMP_ALIGN(newinfo->size));

}

return ret;

}

3.3 IPT_ENTRY_ITERATE宏 ip_tables.h

用来遍历每一个规则，然后调用其第三个参数（函数指针）进行处理，前两个参数分别表示规则的起始位置和规则总大小，后面的参数则视情况而定。

/*
 * IPT_ENTRY_ITERATE - call fn(entry, args...) for each rule in a rule set.
 *
 * entries is the start of the rules and size their total size in bytes;
 * the walk advances by each entry's next_offset. Iteration stops at the
 * first non-zero return from fn, and that value is the value of the macro
 * (0 if every call returned 0). Uses GCC statement expressions and named
 * variadic macro arguments.
 */
#define IPT_ENTRY_ITERATE(entries, size, fn, args...)                \
({                                                                    \
    unsigned int __i;                                                 \
    int __ret = 0;                                                    \
    struct ipt_entry *__entry;                                        \
                                                                      \
    for (__i = 0; __i < (size); __i += __entry->next_offset) {        \
        __entry = (void *)(entries) + __i;                            \
                                                                      \
        __ret = fn(__entry , ## args);                                \
        if (__ret != 0)                                               \
            break;                                                    \
    }                                                                 \
    __ret;                                                            \
})

/* translate_table中出现了三次，分别是 */

IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,

check_entry_size_and_hooks,

newinfo,

newinfo->entries,

newinfo->entries + size,

hook_entries, underflows, &i);

IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,

check_entry, name, size, &i);

IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,

cleanup_entry, &i);

即是在遍历到每条entry时分别调用

check_entry_size_and_hooks，check_entry, cleanup_entry,三个函数

check_entry有大用处，后面解释

3.4 list_named_find（）函数 listhelp.h

在注册函数中，调用

list_named_find(&ipt_tables, table->name)

来检查当前表是否已被注册过了。可见，第一个参数为链表首部，第二个参数为当前表名。

其原型如下：

/* Find the list element whose name equals `name`; see LIST_FIND for the
   result. The continuation backslash keeps the expansion inside the macro. */
#define list_named_find(head, name)                     \
LIST_FIND(head, __list_cmp_name, void *, name)

/*
 * LIST_FIND - walk the circular list starting after head and evaluate to
 * the first element for which cmpfn(element, args...) is non-zero, cast to
 * `type`. Evaluates to NULL when the walk gets back to head without a hit.
 */
#define LIST_FIND(head, cmpfn, type, args...)           \
({                                                      \
    const struct list_head *__i = (head);               \
                                                        \
    ASSERT_READ_LOCK(head);                             \
    do {                                                \
        __i = __i->next;                                \
        if (__i == (head)) {                            \
            __i = NULL;                                 \
            break;                                      \
        }                                               \
    } while (!cmpfn((const type)__i , ## args));        \
    (type)__i;                                          \
})

前面提过，表是一个双向链表，在宏当中，以while进行循环，以__i = __i->next;

进行遍历，然后调用比较函数进行比较，传递过来的比较函数是__list_cmp_name。

比较函数很简单：

/*
 * Compare `name` with the string stored immediately after the
 * struct list_head at the start of element `i`.
 * Returns 1 when the two strings are equal, 0 otherwise.
 */
static inline int __list_cmp_name(const void *i, const char *name)
{
    const char *entry_name = (const char *)i + sizeof(struct list_head);

    return !strcmp(name, entry_name);
}

3.5 replace_table（）函数 ip_tables.c

表中以struct ipt_table_info *private;表示实际数据区。但是在初始化赋值的时候，被设为NULL，而表的初始变量都以模版的形式，放在struct ipt_replace *table;中。

注册函数一开始，就声明了：struct ipt_table_info *newinfo;

然后对其分配了空间，将模块中的初值拷贝了进来。所以replace_table要做的工作，主要就是把newinfo中的值传递给table结构中的private成员。

/*
 * replace_table - swap a table's private info for newinfo.
 *
 * Under the table's write lock, checks that num_counters equals the rule
 * count of the current info. On a mismatch, stores -EAGAIN in *error and
 * returns NULL. Otherwise installs newinfo, carries initial_entries over
 * from the old info and returns the old info.
 *
 * The return type is spelled out because the function returns a pointer.
 */
static struct ipt_table_info *
replace_table(struct ipt_table *table,
              unsigned int num_counters,
              struct ipt_table_info *newinfo,
              int *error)
{
    struct ipt_table_info *oldinfo;

    write_lock_bh(&table->lock);

    /* Check inside the lock: is the old number correct? During
       registration, ipt_register_table() has set table->private to the
       all-zero bootstrap info and passes 0 here, so the check succeeds. */
    if (num_counters != table->private->number) {
        duprintf("num_counters != table->private->number (%u/%u)\n",
                 num_counters, table->private->number);
        write_unlock_bh(&table->lock);
        *error = -EAGAIN;
        return NULL;
    }

    oldinfo = table->private;
    table->private = newinfo;
    newinfo->initial_entries = oldinfo->initial_entries;
    write_unlock_bh(&table->lock);

    return oldinfo;
}

3.6 list_prepend（）函数 listhelp.h

当所有的初始化工作结束，就调用list_prepend来构建链表了。

/*
 * Insert element `new` at the front of the list headed by `head`.
 * ASSERT_WRITE_LOCK is invoked on the head first (the original note calls
 * this "setting write exclusion"; its definition is not shown here).
 */
static inline void
list_prepend(struct list_head *head, void *new)
{
    struct list_head *node = new;

    ASSERT_WRITE_LOCK(head);
    list_add(node, head);   /* link the node right after the head */
}

list_add就是一个构建双向链表的过程：

/* Insert `new` directly after `head`, i.e. between head and head->next. */
static __inline__ void list_add(struct list_head *new, struct list_head *head)
{
    __list_add(new, head, head->next);
}

/*
 * Link `new` between the two adjacent nodes `prev` and `next` of a doubly
 * linked list by updating the four affected pointers.
 */
static __inline__ void __list_add(struct list_head * new,
                                  struct list_head * prev,
                                  struct list_head * next)
{
    next->prev = new;
    new->next = next;
    new->prev = prev;
    prev->next = new;
}

四、nf_hook_ops 钩子的注册

在filter表的初始化函数static int __init init(void)中，除了调用ipt_register_table函数注册一个table外，还调用nf_register_hook函数注册了3个hook

4.1 nf_hook_ops数据结构 netfilter.h

/* One registered netfilter hook function and where it is attached. */
struct nf_hook_ops
{
    struct list_head list; // list linkage

    /* User fills in from here down. */

    nf_hookfn *hook; // pointer to the hook function

    struct module *owner;

    int pf; // protocol family; PF_INET for IPv4

    int hooknum; // hook type (which hook point)

    /* Hooks are ordered in ascending priority. */

    int priority; // priority
};

list成员用于维护Netfilter hook的列表。

hook成员是一个指向nf_hookfn类型的函数的指针，该函数是这个hook被调用时执行的函数。nf_hookfn同样在linux/netfilter.h中定义。

pf这个成员用于指定协议族。有效的协议族在linux/socket.h中列出，但对于IPv4我们使用协议族PF_INET。

hooknum这个成员用于指定安装的这个函数对应的具体的hook类型:

NF_IP_PRE_ROUTING 在完整性校验之后，选路确定之前

NF_IP_LOCAL_IN 在选路确定之后，且数据包的目的是本地主机

NF_IP_FORWARD 目的地是其它主机的数据包

NF_IP_LOCAL_OUT 来自本机进程的数据包在其离开本地主机的过程中

NF_IP_POST_ROUTING 在数据包离开本地主机“上线”之前

再看看它的初始化，仍以filter表为例

static struct nf_hook_ops ipt_ops[]

= { { { NULL, NULL }, ipt_hook, PF_INET, NF_IP_LOCAL_IN, NF_IP_PRI_FILTER },

{ { NULL, NULL }, ipt_hook, PF_INET, NF_IP_FORWARD, NF_IP_PRI_FILTER },

{ { NULL, NULL }, ipt_local_out_hook, PF_INET, NF_IP_LOCAL_OUT,

NF_IP_PRI_FILTER }

};

4.2 int nf_register_hook函数 netfilter.c

注册实际上就是在一个nf_hook_ops链表中再插入一个nf_hook_ops结构

/*
 * nf_register_hook - insert reg into the hook list for (reg->pf,
 * reg->hooknum), keeping the list sorted by ascending priority.
 *
 * The walk stops at the first node whose priority is greater than reg's and
 * reg is linked in just before it; if no such node exists, the walk ends at
 * the list head and reg is appended at the tail. Always returns 0.
 */
int nf_register_hook(struct nf_hook_ops *reg)
{
    struct list_head *i;

    spin_lock_bh(&nf_hook_lock);
    list_for_each(i, &nf_hooks[reg->pf][reg->hooknum]) {
        if (reg->priority < ((struct nf_hook_ops *)i)->priority)
            break;
    }
    list_add_rcu(&reg->list, i->prev);
    spin_unlock_bh(&nf_hook_lock);

    synchronize_net();
    return 0;
}

list_for_each 遍历当前待注册的钩子的协议pf及Hook类型所对应的链表，其首地址是&nf_hooks[reg->pf][reg->hooknum]，如果当前待注册钩子的优先级小于所遍历到的节点的优先级，则找到了待插入的位置，也就是说，链表按优先级的升序排列。

list_add_rcu把当前节点插入到查找到的合适位置，这样，完成后，所有pf协议下的hooknum类型的钩子，都被注册到以&nf_hooks[reg->pf][reg->hooknum]为首的链表当中了。

4.3 ipt_hook钩子函数 iptable_raw.c

注册nf_hook_ops，也就向内核注册了一个钩子函数，这些函数有ipt_hook，ipt_local_hook，ipt_route_hook，ipt_local_out_hook等。

前面在nf_iterate()里调用的钩子函数就是它了

下面是ipt_hook函数的定义：

/*
 * Hook function registered for the filter table: hands the packet to
 * ipt_do_table() together with &packet_filter and returns its verdict.
 * Which table is passed depends on the table that registered the
 * nf_hook_ops (the original note mentions &packet_raw as another case).
 * okfn, the default continuation, is not used here.
 */
static unsigned int
ipt_hook(unsigned int hook,
         struct sk_buff **pskb,
         const struct net_device *in,
         const struct net_device *out,
         int (*okfn)(struct sk_buff *))
{
    unsigned int verdict;

    verdict = ipt_do_table(pskb, hook, in, out, &packet_filter, NULL);
    return verdict;
}

实际上是直接调用ipt_do_table(ip_tables.c)函数

接下来就是根据table里面的entry来处理数据包了

一个table就是一组防火墙规则的集合

而一个entry就是一条规则，每个entry由一系列的matches和一个target组成

一旦数据包匹配了该某个entry的所有matches，就用target来处理它

Match又分为两部份，一部份为一些基本的元素，如来源/目的地址，进/出网口，协议等，对应了struct ipt_ip，我们常常将其称为标准的match，另一部份match则以插件的形式存在，是动态可选择，也允许第三方开发的，常常称为扩展的match，如字符串匹配，p2p匹配等。同样，规则的target也是可扩展的。这样，一条规则占用的空间，可以分为：struct ipt_ip+n*match+target，（n表示扩展match的个数，这里的match指的是可扩展的match部份；如前所述，每条规则只有一个target）。

五、 ipt_do_table()函数，数据包的过滤

5.1 ipt_entry 相关结构 ip_tables.h

ipt_entry结构前面有过了，再看一遍

/*
 * One rule: this fixed header followed by zero or more matches and one
 * target.
 */
struct ipt_entry
{
    struct ipt_ip ip;
    /* IP header fields the packet is matched against. */

    unsigned int nfcache;
    /* Bit vector of the packet parts this rule cares about; per the
       original note, not used yet. */

    u_int16_t target_offset;
    /* Offset of the target area. The target normally follows the matches,
       which start at the end of ipt_entry. Initialised to
       sizeof(struct ipt_entry), i.e. assuming no matches. */

    u_int16_t next_offset;
    /* Offset from this rule to the next one, i.e. the total space this rule
       uses. Initialised to sizeof(struct ipt_entry) +
       sizeof(struct ipt_target), i.e. no matches. */

    unsigned int comefrom;
    /* Bit vector recording the hook that invoked this rule; can be used to
       check the rule's validity. */

    struct ipt_counters counters;
    /* Packet and byte counts handled by this rule. */

    unsigned char elems[0];
    /* Start of the matches (if any) and the target. */
};

ipt_ip结构 ip_tables.h

/* The "standard match" part of a rule: basic IP-level criteria. */
struct ipt_ip {
    struct in_addr src, dst; /* source / destination address */

    struct in_addr smsk, dmsk; /* masks for the source / destination address */

    char iniface[IFNAMSIZ], outiface[IFNAMSIZ]; /* input / output interface names */

    unsigned char iniface_mask[IFNAMSIZ], outiface_mask[IFNAMSIZ];

    u_int16_t proto; /* protocol, 0 = ANY */

    u_int8_t flags; /* flags field */

    u_int8_t invflags; /* inversion flags */
};

5.2 ipt_do_table函数 ip_tables.c

unsigned int

ipt_do_table(struct sk_buff **pskb,

unsigned int hook,

const struct net_device *in,

const struct net_device *out,

struct ipt_table *table,

void *userdata)

{

static const char nulldevname[IFNAMSIZ]

__attribute__((aligned(sizeof(long))));

u_int16_t offset;

struct iphdr *ip;

u_int16_t datalen;

int hotdrop = 0;

/* Initializing verdict to NF_DROP keeps gcc happy. */

unsigned int verdict = NF_DROP;

const char *indev, *outdev;

void *table_base;

struct ipt_entry *e, *back;

/* Initialization */

ip = (*pskb)->nh.iph; /* 获取IP头 */

datalen = (*pskb)->len - ip->ihl * 4; /*指向数据区*/

indev = in ? in->name : nulldevname; /*取得输入设备名*/

outdev = out ? out->name : nulldevname; /*取得输出设备名*/

offset = ntohs(ip->frag_off) & IP_OFFSET; /*设置分片包的偏移*/

read_lock_bh(&table->lock); /*设置互斥锁*/

IP_NF_ASSERT(table->valid_hooks & (1 hook));

/*检验HOOK，debug用的*/

/*获取当前表的当前CPU的规则入口*/

table_base = (void *)table->private->entries

+ TABLE_OFFSET(table->private, smp_processor_id());

/*获得当前表的当前Hook的规则的起始偏移量*/

e = get_entry(table_base, table->private->hook_entry[hook]);

/*获得当前表的当前Hook的规则的上限偏移量*/

/* For return from builtin chain */

back = get_entry(table_base, table->private->underflow[hook]);

/* do …… while（！hotdrop）

进行规则的匹配 */

do {

IP_NF_ASSERT(e);

IP_NF_ASSERT(back);

(*pskb)->nfcache |= e->nfcache;

匹配IP包，成功则继续匹配下去，否则跳到下一个规则

ip_packet_match匹配标准match, 也就是ip报文中的一些基本的元素，如来源/目的地址，进/出网口，协议等，因为要匹配的内容是固定的，所以具体的函数实现也是固定的。

而IPT_MATCH_ITERATE （应该猜到实际是调用第二个参数do_match函数）匹配扩展的match，如字符串匹配，p2p匹配等，因为要匹配的内容不确定，所以函数的实现也是不一样的，所以do_match的实现就和具体的match模块有关了。

这里的&e->ip就是上面的ipt_ip结构

if (ip_packet_match(ip, indev, outdev, &e->ip, offset)) {

struct ipt_entry_target *t;

if (IPT_MATCH_ITERATE(e, do_match,

*pskb, in, out,

offset, &hotdrop) != 0)

goto no_match; /*不匹配则跳到 no_match，往下一个规则*/

/* 匹配则继续执行 */

/* 这个宏用来分别处理字节计数器和分组计数器这两个计数器 */

ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1);

/*获取规则的target的偏移地址*/

t = ipt_get_target(e);

IP_NF_ASSERT(t->u.kernel.target);

/* 下面开始匹备target */

/* Standard target? */

if (!t->u.kernel.target->target) {

int v;

v = ((struct ipt_standard_target *)t)->verdict;

if (v 0) {

/* Pop from stack? */

if (v != IPT_RETURN) {

verdict = (unsigned)(-v) - 1;

break;

}

e = back;

back = get_entry(table_base,

back->comefrom);

continue;

}

if (table_base + v

!= (void *)e + e->next_offset) {

/* Save old back ptr in next entry */

struct ipt_entry *next

= (void *)e + e->next_offset;

next->comefrom

= (void *)back - table_base;

/* set back pointer to next entry */

back = next;

}

e = get_entry(table_base, v);

} else {

verdict = t->u.kernel.target->target(pskb,

in, out,

hook,

t->data,

userdata);

/* Target might have changed stuff. */

ip = (*pskb)->nh.iph;

datalen = (*pskb)->len - ip->ihl * 4;

if (verdict == IPT_CONTINUE)

e = (void *)e + e->next_offset;

else

/* Verdict */

break;

}

} else {

no_match:

e = (void *)e + e->next_offset; /* 匹配失败，跳到下一个规则 */

}

} while (!hotdrop);

read_unlock_bh(&table->lock);

#ifdef DEBUG_ALLOW_ALL

return NF_ACCEPT;

#else

if (hotdrop)

return NF_DROP;

else return verdict;

#endif

}

5.3 标准的match ip_packet_match函数 ip_tables.c

static inline int

ip_packet_match(const struct iphdr *ip,

const char *indev,

const char *outdev,

const struct ipt_ip *ipinfo,

int isfrag)

{

size_t i;

unsigned long ret;

/*定义一个宏，当bool和invflg的是一真一假的情况时，返回真。注意这里使用两个“！”的目的是使得这样计算后的值域只取0和1两个值*/

#define FWINV(bool,invflg) ((bool) ^ !!(ipinfo->invflags & invflg))

/*处理源和目标ip地址，这个if语句的意义是：到达分组的源ip地址经过掩码处理后与规则中的ip不匹配并且规则中没有包含对ip地址的取反，或者规则中包含了对匹配地址的取反，但到达分组的源ip与规则中的ip地址匹配，if的第一部分返回真，同样道理处理到达分组的目的ip地址。这两部分任意部分为真时，源或者目标地址不匹配。*/

if (FWINV((ip->saddr&ipinfo->smsk.s_addr) != ipinfo->src.s_addr,

IPT_INV_SRCIP)

|| FWINV((ip->daddr&ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr,

IPT_INV_DSTIP)) {

dprintf( Source or dest mismatch. n );

dprintf( SRC: %u.%u.%u.%u. Mask: %u.%u.%u.%u. Target: %u.%u.%u.%u.%s n ,

NIPQUAD(ip->saddr),

NIPQUAD(ipinfo->smsk.s_addr),

NIPQUAD(ipinfo->src.s_addr),

ipinfo->invflags & IPT_INV_SRCIP ? (INV) : );

dprintf( DST: %u.%u.%u.%u Mask: %u.%u.%u.%u Target: %u.%u.%u.%u.%s n ,

NIPQUAD(ip->daddr),

NIPQUAD(ipinfo->dmsk.s_addr),

NIPQUAD(ipinfo->dst.s_addr),

ipinfo->invflags & IPT_INV_DSTIP ? (INV) : );

return 0;

}

/*接着处理输入和输出的接口，for语句处理接口是否与规则中的接口匹配，不匹配时，ret返回非零，离开for语句后，处理接口的取反问题：当接口不匹配并且接口不取反，或者接口匹配，但是接口取反，说明接口不匹配。*/

/* Look for ifname matches; this should unroll nicely. */

/*输入接口*/

for (i = 0, ret = 0; i IFNAMSIZ/sizeof(unsigned long); i++) {

ret |= (((const unsigned long *)indev)[i]

^ ((const unsigned long *)ipinfo->iniface)[i])

& ((const unsigned long *)ipinfo->iniface_mask)[i];

}

if (FWINV(ret != 0, IPT_INV_VIA_IN)) {

dprintf( VIA in mismatch (%s vs %s).%s n ,

indev, ipinfo->iniface,

ipinfo->invflags&IPT_INV_VIA_IN ? (INV) : );

return 0;

}

/*输出接口*/

for (i = 0, ret = 0; i IFNAMSIZ/sizeof(unsigned long); i++) {

ret |= (((const unsigned long *)outdev)[i]

^ ((const unsigned long *)ipinfo->outiface)[i])

& ((const unsigned long *)ipinfo->outiface_mask)[i];

}

if (FWINV(ret != 0, IPT_INV_VIA_OUT)) {

dprintf( VIA out mismatch (%s vs %s).%s n ,

outdev, ipinfo->outiface,

ipinfo->invflags&IPT_INV_VIA_OUT ? (INV) : );

return 0;

}

/* 检查协议是否匹配 */

/* Check specific protocol */

if (ipinfo->proto

&& FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) {

dprintf( Packet protocol %hi does not match %hi.%s n ,

ip->protocol, ipinfo->proto,

ipinfo->invflags&IPT_INV_PROTO ? (INV) : );

return 0;

}

/*处理分片包的匹配情况*/

/* If we have a fragment rule but the packet is not a fragment

* then we return zero */

if (FWINV((ipinfo->flags&IPT_F_FRAG) && !isfrag, IPT_INV_FRAG)) {

dprintf( Fragment rule but not fragment.%s n ,

ipinfo->invflags & IPT_INV_FRAG ? (INV) : );

return 0;

}

return 1; /* 以上所有都匹配则返回1 */

}

六、扩展的match

6.1 do_match函数 ip_tables.c

do_match通过IPT_MATCH_ITERATE宏来调用,

IPT_MATCH_ITERATE是在ipt_do_table函数中调用的宏

IPT_MATCH_ITERATE(e, do_match,

*pskb, in, out,

offset, &hotdrop)

定义如下：

/*
 * IPT_MATCH_ITERATE - call fn(match, args...) for each match of rule e.
 *
 * The matches start right after the fixed struct ipt_entry header and end
 * at e->target_offset; the walk advances by each match's u.match_size.
 * Iteration stops at the first non-zero return from fn, and that value is
 * the value of the macro (0 if every call returned 0).
 */
#define IPT_MATCH_ITERATE(e, fn, args...)       \
({                                              \
    unsigned int __i;                           \
    int __ret = 0;                              \
    struct ipt_entry_match *__match;            \
                                                \
    for (__i = sizeof(struct ipt_entry);        \
         __i < (e)->target_offset;              \
         __i += __match->u.match_size) {        \
        __match = (void *)(e) + __i;            \
                                                \
        __ret = fn(__match , ## args);          \
        if (__ret != 0)                         \
            break;                              \
    }                                           \
    __ret;                                      \
})

下面就是do_match函数：

/*
 * Run one extension match by calling the match function registered in
 * m->u.kernel.match.
 * Returns 1 when the packet does NOT match (which stops the
 * IPT_MATCH_ITERATE walk) and 0 when it matches.
 */
static inline
int do_match(struct ipt_entry_match *m,
             const struct sk_buff *skb,
             const struct net_device *in,
             const struct net_device *out,
             int offset,
             const void *hdr,
             u_int16_t datalen,
             int *hotdrop)
{
    int matched = m->u.kernel.match->match(skb, in, out, m->data,
                                           offset, hdr, datalen, hotdrop);

    /* Stop iteration if it doesn't match */
    return matched ? 0 : 1;
}

实际上就是调用了m->u.kernel.match->match，也就是具体match模块所注册的匹配函数，这个后面再解释

这里还出现了一个ipt_entry_match结构，它用来把match的内核态与用户态关连起来

6.2 ipt_xxx.c文件

我们在编译内核的netfilter选项时，有ah、esp、length……等一大堆的匹配选项，他们既可以是模块的形式注册，又可以是直接编译进内核，所以，他们应该是以单独的文件形式，以:

module_init(init);

module_exit(cleanup);

这样形式存在的，我们在源码目录下边，可以看到Ipt_ah.c、Ipt_esp.c、Ipt_length.c等许多文件，这些就是我们所要关心的了，另一方面，基本的TCP/UDP 的端口匹配，ICMP类型匹配不在此之列，所以，应该有初始化的地方，

我们注意到Ip_tables.c的init中，有如下语句：

/* Noone else will be downing sem now, so we won't sleep */

down(&ipt_mutex);

list_append(&ipt_target, &ipt_standard_target);

list_append(&ipt_target, &ipt_error_target);

list_append(&ipt_match, &tcp_matchstruct);

list_append(&ipt_match, &udp_matchstruct);

list_append(&ipt_match, &icmp_matchstruct);

up(&ipt_mutex);

可以看到，这里注册了standard_target、error_target两个target和tcp_matchstruct等三个match。这两个地方，就是涉及到match在内核中的注册了，以Ipt_*.c为例，它们都是以下结构：

#include XXX

MODULE_AUTHOR（）

MODULE_DESCRIPTION（）

MODULE_LICENSE（）

static int match（） /* ipt_match中的匹配函数 */

{

}

static int checkentry（） /* 检查entry有效性 */

{

}

static struct ipt_match XXX_match = { { NULL, NULL }, XXX , &match,

&checkent

《Linux-netfilter源码分析教程》目录

empty

《Linux-netfilter源码分析教程》pdf电子书免费下载

下载方式一：

百度网盘下载地址：https://pan.baidu.com/s/1l7aKJnst3Zu3mqhDQpet6g

百度网盘密码：1111

下载方式二：

http://ziliaoshare.cn/Download/ad_123563_do_Linux-netfilterYMFXJC.zip

《Linux-netfilter源码分析教程》介绍

《Linux-netfilter源码分析教程》目录

计算机

python

AI人工智能

javascript

计算机网络/服务器

数据库技术

计算机F

考试教辅

考研考博

英语四六级

沪ICP备18046276号-5

上海秋旦网络科技中心：上海市奉贤区金大公路8218号1幢联系电话：15618918379

《Linux-netfilter源码分析教程》pdf电子书免费下载

下载方式一：

百度网盘下载地址：https://pan.baidu.com/s/1l7aKJnst3Zu3mqhDQpet6g

百度网盘密码：1111

下载方式二：

《Linux-netfilter源码分析教程》介绍

《Linux-netfilter源码分析教程》目录

上海秋旦网络科技中心：上海市奉贤区金大公路8218号1幢 联系电话：15618918379

上海秋旦网络科技中心：上海市奉贤区金大公路8218号1幢联系电话：15618918379