Exploring the Internals of Linux Fault Injection

How Linux Fault Injection Works

Using the fail_make_request fault type as an example, this article digs into the low-level mechanism behind Linux fault injection.

![image-20240326154347021](/imgs/Linux Fault Injection原理/image-20240326154347021.png)

![image-20240326154429947](/imgs/Linux Fault Injection原理/image-20240326154429947.png)

![image-20240326154444358](/imgs/Linux Fault Injection原理/image-20240326154444358.png)

![image-20240326154504406](/imgs/Linux Fault Injection原理/image-20240326154504406.png)

1. Core data structure

struct fault_attr in include/linux/fault-inject.h

```c
#ifdef CONFIG_FAULT_INJECTION
struct fault_attr {
	// probability (in percent) that a fault is injected
	unsigned long probability;
	// interval between injected faults (note: the unit is executions of the
	// injection point, not time -- clear from the source later)
	unsigned long interval;
	// atomic so that concurrent injection points update times and space safely
	atomic_t times;
	atomic_t space;
	// how verbose the kernel log output is when a fault fires
	unsigned long verbose;
	// task filter: skip tasks that have not enabled make-it-fail and code
	// running in interrupt context
	bool task_filter;
	// depth of the call stack examined by the stacktrace filter
	unsigned long stacktrace_depth;
	// virtual-address ranges for the stacktrace filter
	unsigned long require_start;
	unsigned long require_end;
	unsigned long reject_start;
	unsigned long reject_end;

	// number of times the injection point has been executed
	unsigned long count;
	// rate limiting of the log output
	struct ratelimit_state ratelimit_state;
	// dentry of the per-fault-type debugfs directory, e.g. fail_make_request
	struct dentry *dname;
};

// default initializer for struct fault_attr
#define FAULT_ATTR_INITIALIZER {				\
		.interval = 1,					\
		.times = ATOMIC_INIT(1),			\
		.require_end = ULONG_MAX,			\
		.stacktrace_depth = 32,				\
		.ratelimit_state = RATELIMIT_STATE_INIT_DISABLED,	\
		.verbose = 2,					\
		.dname = NULL,					\
	}
// define a struct fault_attr with the default initializer
#define DECLARE_FAULT_ATTR(name) struct fault_attr name = FAULT_ATTR_INITIALIZER

// fill in the fault_attr fields from a kernel boot parameter
int setup_fault_attr(struct fault_attr *attr, char *str);
// the final decision: should this call site actually fail?
bool should_fail(struct fault_attr *attr, ssize_t size);

#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS

struct dentry *fault_create_debugfs_attr(const char *name,
			struct dentry *parent, struct fault_attr *attr);

#else /* CONFIG_FAULT_INJECTION_DEBUG_FS */

static inline struct dentry *fault_create_debugfs_attr(const char *name,
			struct dentry *parent, struct fault_attr *attr)
{
	return ERR_PTR(-ENODEV);
}
#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
#endif /* CONFIG_FAULT_INJECTION */
```

This structure is the core of the fault-injection implementation. Most of its fields will look familiar: they map one-to-one onto the configuration files exposed in debugfs. The last three fields are used internally: count records how many times the injection point has been executed, ratelimit_state throttles the log output, and dname holds the dentry of the debugfs directory named after the fault type (e.g. fail_make_request).

Two questions to keep in mind: why are times and space of type atomic_t, and how does ratelimit_state actually throttle the log output? Both are answered by the code that follows.
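
As a quick illustration of the defaults, here is roughly what DECLARE_FAULT_ATTR(my_fail) expands to for a hypothetical fault type my_fail (the name is invented for this example; fields not listed in FAULT_ATTR_INITIALIZER start out as zero):

```c
// Hypothetical example: what DECLARE_FAULT_ATTR(my_fail) boils down to.
static struct fault_attr my_fail = {
	.interval         = 1,                              // check on every execution
	.times            = ATOMIC_INIT(1),                 // budget of a single fault
	.require_end      = ULONG_MAX,                      // stacktrace filter wide open
	.stacktrace_depth = 32,                             // MAX_STACK_TRACE_DEPTH
	.ratelimit_state  = RATELIMIT_STATE_INIT_DISABLED,  // no log rate limiting
	.verbose          = 2,                              // print message + stack dump
	.dname            = NULL,                           // no debugfs directory yet
};
```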

2. Module initialization

block/blk-core.c

```c
#ifdef CONFIG_FAIL_MAKE_REQUEST

// the core fault-injection descriptor for this fault type
static DECLARE_FAULT_ATTR(fail_make_request);

static int __init setup_fail_make_request(char *str)
{
	// further initialize fail_make_request from the boot parameter
	return setup_fault_attr(&fail_make_request, str);
}
// register the handler for the "fail_make_request=" boot parameter
__setup("fail_make_request=", setup_fail_make_request);

// fault fires only if the per-device switch is on AND should_fail() agrees
static bool should_fail_request(struct hd_struct *part, unsigned int bytes)
{
	return part->make_it_fail && should_fail(&fail_make_request, bytes);
}

#else /* CONFIG_FAIL_MAKE_REQUEST */

// CONFIG_FAIL_MAKE_REQUEST not set: never inject, always return false
static inline bool should_fail_request(struct hd_struct *part,
				       unsigned int bytes)
{
	return false;
}

#endif /* CONFIG_FAIL_MAKE_REQUEST */
```

The code statically defines a struct fault_attr instance named fail_make_request that describes this fault type; DECLARE_FAULT_ATTR is the macro we saw above. The __setup macro registers setup_fail_make_request as the handler for a "fail_make_request=..." kernel boot parameter during early init; it simply forwards to the generic setup_fault_attr, which fills in the fields of the fail_make_request variable. For example, booting with fail_make_request=100,10,0,-1 would set interval=100, probability=10, space=0 and times=-1 (unlimited faults).

```c
int setup_fault_attr(struct fault_attr *attr, char *str)
{
	unsigned long probability;
	unsigned long interval;
	int times;
	int space;

	// parse the boot parameter
	/* "<interval>,<probability>,<space>,<times>" */
	if (sscanf(str, "%lu,%lu,%d,%d",
			&interval, &probability, &space, &times) < 4) {
		printk(KERN_WARNING
			"FAULT_INJECTION: failed to parse arguments\n");
		return 0;
	}

	attr->probability = probability;
	attr->interval = interval;
	atomic_set(&attr->times, times);
	atomic_set(&attr->space, space);

	return 1;
}
EXPORT_SYMBOL_GPL(setup_fault_attr);
```

Besides defining the core fail_make_request descriptor, initialization also sets up the debugfs configuration interface: a directory named fail_make_request is created under debugfs, holding the attribute files:

```c
#ifdef CONFIG_FAIL_MAKE_REQUEST

// create the debugfs configuration files
static int __init fail_make_request_debugfs(void)
{
	struct dentry *dir = fault_create_debugfs_attr("fail_make_request",
						NULL, &fail_make_request);

	return PTR_ERR_OR_ZERO(dir);
}

late_initcall(fail_make_request_debugfs);

#endif /* CONFIG_FAIL_MAKE_REQUEST */
```

Let's look at how fault_create_debugfs_attr creates the fault directory and its configuration files:

```c
#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS

struct dentry *fault_create_debugfs_attr(const char *name,
			struct dentry *parent, struct fault_attr *attr)
{
	// file mode: a regular file, readable and writable by its owner
	umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
	struct dentry *dir;

	// create the per-fault-type directory under debugfs, e.g. fail_make_request
	dir = debugfs_create_dir(name, parent);
	if (IS_ERR(dir))
		return dir;

	// create the attribute files inside that directory
	debugfs_create_ul("probability", mode, dir, &attr->probability);
	debugfs_create_ul("interval", mode, dir, &attr->interval);
	debugfs_create_atomic_t("times", mode, dir, &attr->times);
	debugfs_create_atomic_t("space", mode, dir, &attr->space);
	debugfs_create_ul("verbose", mode, dir, &attr->verbose);
	debugfs_create_u32("verbose_ratelimit_interval_ms", mode, dir,
			   &attr->ratelimit_state.interval);
	debugfs_create_u32("verbose_ratelimit_burst", mode, dir,
			   &attr->ratelimit_state.burst);
	debugfs_create_bool("task-filter", mode, dir, &attr->task_filter);

#ifdef CONFIG_FAULT_INJECTION_STACKTRACE_FILTER
	debugfs_create_stacktrace_depth("stacktrace-depth", mode, dir,
					&attr->stacktrace_depth);
	debugfs_create_ul("require-start", mode, dir, &attr->require_start);
	debugfs_create_ul("require-end", mode, dir, &attr->require_end);
	debugfs_create_ul("reject-start", mode, dir, &attr->reject_start);
	debugfs_create_ul("reject-end", mode, dir, &attr->reject_end);
#endif /* CONFIG_FAULT_INJECTION_STACKTRACE_FILTER */

	// remember the directory dentry in the fault descriptor
	attr->dname = dget(dir);
	return dir;
}
EXPORT_SYMBOL_GPL(fault_create_debugfs_attr);

#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
```

Since the parent passed in is NULL, the fail_make_request directory is created at the root of debugfs. The familiar attribute files -- probability, interval, times and so on -- are then created inside it, and finally the directory's dentry is saved in attr->dname.

Note: in practice we configure fault injection by writing to the files under /sys/kernel/debug/fail_make_request/ (assuming debugfs is mounted at /sys/kernel/debug). Through the file operations registered for each file, debugfs ties every file to the corresponding fault_attr field, so a write indirectly modifies the in-kernel fail_make_request variable.
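
A minimal userspace sketch of this configuration path (the chosen values are only an example, and the mount point /sys/kernel/debug is an assumption about the local setup):

```c
// Sketch: configure fail_make_request so that every eligible request fails.
#include <stdio.h>
#include <stdlib.h>

// write a single value into one attribute file under the fault directory
static void set_attr(const char *attr, const char *val)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/kernel/debug/fail_make_request/%s", attr);
	f = fopen(path, "w");
	if (!f) {
		perror(path);
		exit(1);
	}
	fprintf(f, "%s\n", val);
	fclose(f);
}

int main(void)
{
	set_attr("probability", "100");   // fail 100% of eligible requests
	set_attr("interval", "1");        // check on every request
	set_attr("times", "-1");          // unlimited number of faults
	set_attr("verbose", "1");         // log the fault, skip the stack dump
	return 0;
}
```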

Take debugfs_create_ul, which creates the probability file, as an example:

```c
#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS

static int debugfs_ul_set(void *data, u64 val)
{
	*(unsigned long *)data = val;
	return 0;
}

static int debugfs_ul_get(void *data, u64 *val)
{
	*val = *(unsigned long *)data;
	return 0;
}
// file operations for files created by debugfs_create_ul, e.g. probability
DEFINE_SIMPLE_ATTRIBUTE(fops_ul, debugfs_ul_get, debugfs_ul_set, "%llu\n");

static void debugfs_create_ul(const char *name, umode_t mode,
			      struct dentry *parent, unsigned long *value)
{
	debugfs_create_file(name, mode, parent, value, &fops_ul);
}

#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
```

The stacktrace-depth file is slightly special. Although the corresponding fault_attr field is also an unsigned long, it is capped at MAX_STACK_TRACE_DEPTH (32), so the file is created with a dedicated helper, debugfs_create_stacktrace_depth:

```c
#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS

#ifdef CONFIG_FAULT_INJECTION_STACKTRACE_FILTER

static int debugfs_stacktrace_depth_set(void *data, u64 val)
{
	// clamp stacktrace_depth to MAX_STACK_TRACE_DEPTH (32)
	*(unsigned long *)data =
		min_t(unsigned long, val, MAX_STACK_TRACE_DEPTH);

	return 0;
}

// file operations for the stacktrace-depth file created by
// debugfs_create_stacktrace_depth
DEFINE_SIMPLE_ATTRIBUTE(fops_stacktrace_depth, debugfs_ul_get,
			debugfs_stacktrace_depth_set, "%llu\n");

static void debugfs_create_stacktrace_depth(const char *name, umode_t mode,
					    struct dentry *parent,
					    unsigned long *value)
{
	debugfs_create_file(name, mode, parent, value, &fops_stacktrace_depth);
}

#endif /* CONFIG_FAULT_INJECTION_STACKTRACE_FILTER */

#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
```

==As you can see, the only difference between fops_stacktrace_depth and fops_ul is the set method: the former clamps the value to the maximum.==

3. Block device configuration interface

block/partition-generic.c

```c
#ifdef CONFIG_FAIL_MAKE_REQUEST

static struct device_attribute dev_attr_fail =
	__ATTR(make-it-fail, 0644, part_fail_show, part_fail_store);

ssize_t part_fail_show(struct device *dev,
		       struct device_attribute *attr, char *buf)
{
	struct hd_struct *p = dev_to_part(dev);

	// report the current make_it_fail value: 0 or 1
	return sprintf(buf, "%d\n", p->make_it_fail);
}

ssize_t part_fail_store(struct device *dev,
			struct device_attribute *attr,
			const char *buf, size_t count)
{
	struct hd_struct *p = dev_to_part(dev);
	int i;

	// any non-zero write is stored as 1
	if (count > 0 && sscanf(buf, "%d", &i) > 0)
		p->make_it_fail = (i == 0) ? 0 : 1;

	return count;
}

#endif
```

This is the sysfs side of the interface: when a user writes a non-zero value to /sys/block/sda/make-it-fail (or the corresponding file of any other disk or partition), the make_it_fail field of that device's struct hd_struct is set to 1 and the per-device switch is turned on.
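
A matching userspace sketch for the sysfs side ("sdb" is just a placeholder device name; part_fail_store stores any non-zero write as 1):

```c
// Sketch: flip the per-device make-it-fail switch for a placeholder device.
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/block/sdb/make-it-fail", "w");

	if (!f) {
		perror("make-it-fail");
		return 1;
	}
	fprintf(f, "1\n");   // non-zero -> switch on
	fclose(f);
	return 0;
}
```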

4. The I/O fault path

![image-20230815144612998-1711438503753-1-1711438507401-3](/imgs/Linux Fault Injection原理/image-20230815144612998-1711438503753-1-1711438507401-3.png)

block/blk-core.c

```c
static noinline int should_fail_bio(struct bio *bio)
{
	if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size))
		return -EIO;
	return 0;
}
ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO);

// fault fires only if the per-device switch is on AND should_fail() agrees
static bool should_fail_request(struct hd_struct *part, unsigned int bytes)
{
	return part->make_it_fail && should_fail(&fail_make_request, bytes);
}
```

On the bio submission path, should_fail_bio calls should_fail_request to decide whether to inject a fault. should_fail_request first checks the per-device make_it_fail switch; if it is on, it calls should_fail with the configured fail_make_request descriptor to decide whether the fault actually triggers. If it does, the bio is failed with -EIO.
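
With the debugfs attributes and the per-device switch set as in the earlier sketches, a direct read against the device should come back with EIO. The snippet below is only an illustration; /dev/sdb is a placeholder and should only ever point at a scratch device:

```c
// Sketch: observe the injected -EIO from userspace via a direct read.
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	void *buf;
	int fd = open("/dev/sdb", O_RDONLY | O_DIRECT);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (posix_memalign(&buf, 4096, 4096)) {
		perror("posix_memalign");
		return 1;
	}
	// the bio submitted for this read passes through should_fail_bio();
	// when the injection triggers, the read fails with EIO
	if (read(fd, buf, 4096) < 0)
		printf("read failed as expected: %s\n", strerror(errno));
	else
		printf("read succeeded (no fault injected this time)\n");

	free(buf);
	close(fd);
	return 0;
}
```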

The final injection decision is made in should_fail:

```c
bool should_fail(struct fault_attr *attr, ssize_t size)
{
	// #define in_task() (!(in_nmi() | in_hardirq() | in_serving_softirq()))
	if (in_task()) {
		unsigned int fail_nth = READ_ONCE(current->fail_nth);

		// per-task fail-nth: count down and fail exactly on the Nth call
		if (fail_nth) {
			if (!WRITE_ONCE(current->fail_nth, fail_nth - 1))
				goto fail;
			return false;
		}
	}

	/* No need to check any other properties if the probability is 0 */
	if (attr->probability == 0)
		return false;

	// with task filtering enabled, skip tasks that do not qualify
	if (attr->task_filter && !fail_task(attr, current))
		return false;

	// the budget of remaining faults (times) is exhausted: do not inject
	if (atomic_read(&attr->times) == 0)
		return false;

	// while the space budget still covers this request, consume it and skip
	if (atomic_read(&attr->space) > size) {
		atomic_sub(size, &attr->space);
		return false;
	}

	// with interval > 1, only every interval-th execution may inject
	if (attr->interval > 1) {
		// count how many times this injection point has run
		attr->count++;
		if (attr->count % attr->interval)
			return false;
	}

	// roll the dice: inject with probability attr->probability percent
	if (attr->probability <= prandom_u32() % 100)
		return false;

	if (!fail_stacktrace(attr))
		return false;

fail:
	// all checks passed: the fault is injected, log it
	fail_dump(attr);

	// decrement the remaining fault budget (-1 means unlimited)
	if (atomic_read(&attr->times) != -1)
		atomic_dec_not_zero(&attr->times);

	return true;
}
EXPORT_SYMBOL_GPL(should_fail);
```
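
The fail_nth branch at the top deserves a note: current->fail_nth is set from userspace through the per-task fail-nth file in procfs and forces the N-th fault-injection-capable call of that task to fail, regardless of probability or interval. A rough sketch of how that is typically driven (the exact path and semantics are taken from the kernel's fault-injection documentation; treat the details as an assumption here):

```c
// Sketch: arm the per-task fail-nth counter that feeds current->fail_nth.
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	char path[64];
	int fd;

	snprintf(path, sizeof(path), "/proc/self/task/%ld/fail-nth",
		 (long)syscall(SYS_gettid));
	fd = open(path, O_WRONLY);
	if (fd < 0) {
		perror(path);
		return 1;
	}
	write(fd, "1", 1);   // fail the very next injection point hit by this task
	close(fd);

	/* ... perform the operation under test here ... */

	return 0;
}
```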

The task filter:

```c
static bool fail_task(struct fault_attr *attr, struct task_struct *task)
{
	// only tasks that are not in interrupt context (hard IRQ, soft IRQ, NMI)
	// and have make_it_fail enabled are eligible for fault injection
	return in_task() && task->make_it_fail;
}
```

The stacktrace filter:

```c
#ifdef CONFIG_FAULT_INJECTION_STACKTRACE_FILTER
// only meaningful when CONFIG_FAULT_INJECTION_STACKTRACE_FILTER is set
static bool fail_stacktrace(struct fault_attr *attr)
{
	int depth = attr->stacktrace_depth;
	unsigned long entries[MAX_STACK_TRACE_DEPTH];
	int n, nr_entries;
	bool found = (attr->require_start == 0 && attr->require_end == ULONG_MAX);

	if (depth == 0)
		return found;

	// capture the current call stack, skipping the first entry
	// (stack_trace_save itself)
	nr_entries = stack_trace_save(entries, depth, 1);
	for (n = 0; n < nr_entries; n++) {
		// any caller inside [reject_start, reject_end) vetoes injection;
		// the reject range has the higher priority
		if (attr->reject_start <= entries[n] &&
				entries[n] < attr->reject_end)
			return false;
		// a caller inside [require_start, require_end) allows injection
		if (attr->require_start <= entries[n] &&
				entries[n] < attr->require_end)
			found = true;
	}
	return found;
}

#else

// without CONFIG_FAULT_INJECTION_STACKTRACE_FILTER the stacktrace filter is a
// no-op and always passes
static inline bool fail_stacktrace(struct fault_attr *attr)
{
	return true;
}

#endif /* CONFIG_FAULT_INJECTION_STACKTRACE_FILTER */
```

Logging the injected fault:

```c
static void fail_dump(struct fault_attr *attr)
{
	if (attr->verbose > 0 && __ratelimit(&attr->ratelimit_state)) {
		// print the fault-injection message
		printk(KERN_NOTICE "FAULT_INJECTION: forcing a failure.\n"
		       "name %pd, interval %lu, probability %lu, "
		       "space %d, times %d\n", attr->dname,
		       attr->interval, attr->probability,
		       atomic_read(&attr->space),
		       atomic_read(&attr->times));
		// with verbose > 1, also dump the call stack
		if (attr->verbose > 1)
			dump_stack();
	}
}
```
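
Plugging hypothetical values into the format string above, the message that ends up in dmesg looks roughly like this (the values are made up; with verbose > 1 it is followed by a dump_stack() backtrace):

```
FAULT_INJECTION: forcing a failure.
name fail_make_request, interval 1, probability 100, space 0, times -1
```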

5. Rate limiting the log output

include/linux/ratelimit.h

```c
// the default rate limit used by many kernel subsystems: 10 messages per 5 s
// (see the ratelimit.c code below for the exact semantics)
#define DEFAULT_RATELIMIT_INTERVAL	(5 * HZ)
#define DEFAULT_RATELIMIT_BURST		10

/* issue num suppressed message on exit */
// report the number of dropped messages on release; see ratelimit_state_exit
#define RATELIMIT_MSG_ON_RELEASE	BIT(0)

struct ratelimit_state {
	raw_spinlock_t	lock;		/* protect the state */

	// interval == 0 means no rate limiting at all; see __ratelimit
	int		interval;
	int		burst;
	int		printed;
	int		missed;
	unsigned long	begin;
	unsigned long	flags;
};

#define RATELIMIT_STATE_INIT_FLAGS(name, interval_init, burst_init, flags_init) { \
		.lock		= __RAW_SPIN_LOCK_UNLOCKED(name.lock),	\
		.interval	= interval_init,			\
		.burst		= burst_init,				\
		.flags		= flags_init,				\
	}
#define RATELIMIT_STATE_INIT(name, interval_init, burst_init) \
	RATELIMIT_STATE_INIT_FLAGS(name, interval_init, burst_init, 0)

#define RATELIMIT_STATE_INIT_DISABLED \
	RATELIMIT_STATE_INIT(ratelimit_state, 0, DEFAULT_RATELIMIT_BURST)
```

lib/ratelimit.c

```c
/*
 * __ratelimit - rate limiting
 * @rs: ratelimit_state data
 * @func: name of calling function
 *
 * This enforces a rate limit: not more than @rs->burst callbacks
 * in every @rs->interval
 *
 * RETURNS:
 * 0 means callbacks will be suppressed.
 * 1 means go ahead and do it.
 */
int ___ratelimit(struct ratelimit_state *rs, const char *func)
{
	/* Paired with WRITE_ONCE() in .proc_handler().
	 * Changing two values seperately could be inconsistent
	 * and some message could be lost. (See: net_ratelimit_state).
	 */
	int interval = READ_ONCE(rs->interval);
	int burst = READ_ONCE(rs->burst);
	unsigned long flags;
	int ret;

	// interval == 0: no rate limiting
	if (!interval)
		return 1;

	/*
	 * If we contend on this state's lock then almost
	 * by definition we are too busy to print a message,
	 * in addition to the one that will be printed by
	 * the entity that is holding the lock already:
	 */
	// lock contention itself is a sign that we are printing too often
	if (!raw_spin_trylock_irqsave(&rs->lock, flags))
		return 0;

	// record the start of the current window
	if (!rs->begin)
		rs->begin = jiffies;

	// the current window has expired
	if (time_is_before_jiffies(rs->begin + interval)) {
		if (rs->missed) {
			if (!(rs->flags & RATELIMIT_MSG_ON_RELEASE)) {
				printk_deferred(KERN_WARNING
						"%s: %d callbacks suppressed\n",
						func, rs->missed);
				rs->missed = 0;
			}
		}
		// start a new window
		rs->begin   = jiffies;
		// reset the number of messages printed in this window
		rs->printed = 0;
	}
	// still below the per-window limit: allow the message
	if (burst && burst > rs->printed) {
		// count the printed message
		rs->printed++;
		ret = 1;
	} else {
		// count the dropped message
		rs->missed++;
		ret = 0;
	}
	// release the lock
	raw_spin_unlock_irqrestore(&rs->lock, flags);

	return ret;
}
EXPORT_SYMBOL(___ratelimit);
```
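
As a self-contained illustration of the same pattern fail_dump() uses, here is a hypothetical demo module (not from the kernel tree) that throttles its own messages to the default 10 per 5 seconds:

```c
// Hypothetical demo module: rate-limit a burst of printks with __ratelimit().
#include <linux/init.h>
#include <linux/module.h>
#include <linux/printk.h>
#include <linux/ratelimit.h>

// allow at most DEFAULT_RATELIMIT_BURST (10) messages per
// DEFAULT_RATELIMIT_INTERVAL (5 seconds)
static DEFINE_RATELIMIT_STATE(demo_rs, DEFAULT_RATELIMIT_INTERVAL,
			      DEFAULT_RATELIMIT_BURST);

static int __init ratelimit_demo_init(void)
{
	int i;

	// only the first 10 of these reach the kernel log; the rest are counted
	// in demo_rs.missed and reported as "callbacks suppressed" the next time
	// __ratelimit() is called after the 5 s window has elapsed
	for (i = 0; i < 100; i++)
		if (__ratelimit(&demo_rs))
			pr_notice("ratelimit demo: message %d\n", i);

	return 0;
}

static void __exit ratelimit_demo_exit(void)
{
}

module_init(ratelimit_demo_init);
module_exit(ratelimit_demo_exit);
MODULE_LICENSE("GPL");
```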

6. Summary and reflections

Linux Fault Injection is a kernel capability that lets kernel developers test the robustness of their code by injecting faults. The overall idea is to leave extension points with a harmless default implementation, such as should_fail_request, which simply returns false when the corresponding config option is not set, and to compile in the real injection logic through kernel build options. To make it easy to adjust and re-run experiments, the fault attributes are exposed to userspace through debugfs, where each file's operations are bound to a field of the in-kernel fault_attr structure. The available knobs cover the injection probability, interval, maximum number of faults, task filtering, a size (space) budget, stacktrace address ranges, and the verbosity and rate limiting of the log output.
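
To tie the pieces together, here is a minimal sketch, with a made-up subsystem name my_subsys, of how a kernel subsystem could wire in its own fault type using the API walked through above:

```c
// Hypothetical sketch only: "my_subsys" and its boot parameter are invented.
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/fault-inject.h>
#include <linux/init.h>

#ifdef CONFIG_FAULT_INJECTION
// core descriptor, initialized with the defaults from FAULT_ATTR_INITIALIZER
static DECLARE_FAULT_ATTR(fail_my_subsys);

// optional boot parameter: "fail_my_subsys=<interval>,<probability>,<space>,<times>"
static int __init setup_fail_my_subsys(char *str)
{
	return setup_fault_attr(&fail_my_subsys, str);
}
__setup("fail_my_subsys=", setup_fail_my_subsys);

// expose /sys/kernel/debug/fail_my_subsys/* once debugfs is up
static int __init fail_my_subsys_debugfs(void)
{
	return PTR_ERR_OR_ZERO(fault_create_debugfs_attr("fail_my_subsys",
							 NULL, &fail_my_subsys));
}
late_initcall(fail_my_subsys_debugfs);
#endif /* CONFIG_FAULT_INJECTION */

// the injection point: callers treat a true return from should_fail() as -EIO
static int my_subsys_do_io(size_t bytes)
{
#ifdef CONFIG_FAULT_INJECTION
	if (should_fail(&fail_my_subsys, bytes))
		return -EIO;
#endif
	/* ... real work ... */
	return 0;
}
```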