跳过正文

libbpf skeleton 用户空间程序分析

·15745 字
Ebpf
目录

三大步骤:

  1. hello_verifier_bpf__open
  2. hello_verifier_bpf__load
  3. hello_verifier_bpf__attach
// https://github.com/lizrice/learning-ebpf/blob/e0fbafd4b88fa6a44eb9e4679e695b8f83b5a052/chapter6/hello-verifier.c#L29

/*
 * Entry point of the hello-verifier user-space loader.
 *
 * Opens the skeleton with a caller-provided kernel log buffer, loads it,
 * prints the verifier log, seeds the my_config map, attaches the programs
 * and then polls the perf buffer until polling is interrupted or fails.
 *
 * Returns 0 when polling ends with -EINTR, 1 on a setup failure, or the
 * negated (i.e. positive) error code reported by perf_buffer__poll().
 *
 * NOTE: the "// ..." markers stand for code omitted by the article.
 */
int main()
{
	struct hello_verifier_bpf *skel;
	int err;
	struct perf_buffer *pb = NULL;

	libbpf_set_print(libbpf_print_fn); // function libbpf uses to print its own log output

	// The kernel writes load/verify output into a separate log buffer that
	// can hold logs of every level (by default only warn/error are printed).
	// opts can also carry btf_custom_path to point at a custom BTF/vmlinux
	// raw data file.
	// The buffer is zero-initialised so that the print loop below stops
	// immediately if nothing was ever written into it.
	char log_buf[64 * 1024] = {0};
	LIBBPF_OPTS(bpf_object_open_opts, opts,
		    .kernel_log_buf = log_buf,
		    .kernel_log_size = sizeof(log_buf),
		    .kernel_log_level = 1,
		);

// ...
	skel = hello_verifier_bpf__open_opts(&opts);
	if (!skel) {
		printf("Failed to open BPF object\n");
		return 1;
	}

	err = hello_verifier_bpf__load(skel);
	// Print the verifier log collected in log_buf. Two consecutive NUL
	// bytes mark the end of the log; the last slot of the buffer is treated
	// as an end marker too, so log_buf[i + 1] is never read out of bounds.
	for (size_t i = 0; i < sizeof(log_buf); i++) {
		if (log_buf[i] == 0 &&
		    (i + 1 == sizeof(log_buf) || log_buf[i + 1] == 0)) {
			break;
		}
		printf("%c", log_buf[i]);
	}
// ...

	// Maps can be modified before attaching.
	// Configure a message to use only if the UID for the event is 501
	uint32_t key = 501;
	struct msg_t msg;
	const char *m = "hello Liz";
	// Clear the whole value first: the complete struct is copied into the
	// map, and a bounded strncpy() does not write the terminating NUL itself.
	memset(&msg, 0, sizeof(msg));
	strncpy((char *)&msg.message, m, sizeof(msg.message) - 1);
	bpf_map__update_elem(skel->maps.my_config, &key, sizeof(key), &msg, sizeof(msg), 0);

// ...
	// Attach the program to the event
	err = hello_verifier_bpf__attach(skel);
	if (err) {
		fprintf(stderr, "Failed to attach BPF skeleton: %d\n", err);
		hello_verifier_bpf__destroy(skel);
		return 1;
	}
// ...

	// Create a perf buffer on top of the "output" map
	pb = perf_buffer__new(bpf_map__fd(skel->maps.output), 8, handle_event, lost_event, NULL, NULL);
	if (!pb) {
		err = -1;
		fprintf(stderr, "Failed to create perf buffer\n");
		hello_verifier_bpf__destroy(skel);
		return 1;
	}

	while (true) {
		err = perf_buffer__poll(pb, 100 /* timeout, ms */);
		// Ctrl-C gives -EINTR
		if (err == -EINTR) {
			err = 0;
			break;
		}
		if (err < 0) {
			printf("Error polling perf buffer: %d\n", err);
			break;
		}
	}

	perf_buffer__free(pb);
	hello_verifier_bpf__destroy(skel);
	return -err;
}

1 hello_verifier_bpf__open()
#

解析内嵌的字节码,返回一个 struct hello_verifier_bpf,其中关键点:

  1. struct bpf_object_open_opts: open 参数, 如设置自定义 btf path, verify log level 和 buff;
  2. struct bpf_object;
// https://github.com/libbpf/libbpf/blob/a6d7530cb7dff87ac1e64a540e63b67ddde2e0f9/src/libbpf.h#L1203
struct bpf_map_skeleton {
	const char *name;
	struct bpf_map **map;
	void **mmaped;
};

struct bpf_prog_skeleton {
	const char *name;
	struct bpf_program **prog;
	struct bpf_link **link;
};

/* Generic description of a skeleton, consumed by the bpf_object__*_skeleton()
 * functions: the embedded object bytes plus tables of its maps and programs.
 */
struct bpf_object_skeleton {
	size_t sz; /* size of this struct, for forward/backward compatibility */

	/* skeleton name; used as the default object name when opening */
	const char *name;
	/* embedded BPF object bytes and their length */
	const void *data;
	size_t data_sz;

	/* where to store the struct bpf_object * created at open time */
	struct bpf_object **obj;

	/* number of entries in maps[] */
	int map_cnt;
	int map_skel_sz; /* sizeof(struct bpf_map_skeleton) */
	struct bpf_map_skeleton *maps;

	/* number of entries in progs[] */
	int prog_cnt;
	int prog_skel_sz; /* sizeof(struct bpf_prog_skeleton) */
	struct bpf_prog_skeleton *progs;
};

// https://github.com/lizrice/learning-ebpf/blob/e0fbafd4b88fa6a44eb9e4679e695b8f83b5a052/chapter6/hello-verifier.skel.h#L11
/* Skeleton type generated for hello-verifier: typed handles for the object,
 * its maps, programs and links, plus a view of the global data section.
 */
struct hello_verifier_bpf {
	/* generic skeleton description, set by hello_verifier_bpf__create_skeleton() */
	struct bpf_object_skeleton *skeleton;
	/* the opened BPF object */
	struct bpf_object *obj;
	/* one handle per map in the object */
	struct {
		struct bpf_map *my_config;
		struct bpf_map *output;
		struct bpf_map *data;
		struct bpf_map *rodata;
	} maps;
	/* one handle per BPF program */
	struct {
		struct bpf_program *kprobe_exec;
		struct bpf_program *xdp_hello;
	} progs;
	/* one link slot per BPF program */
	struct {
		struct bpf_link *kprobe_exec;
		struct bpf_link *xdp_hello;
	} links;
	/* registered as the mmaped target of the "hello_ve.data" map */
	struct hello_verifier_bpf__data {
		int c;
		char message[12];
	} *data;
/* NOTE: the article's excerpt ends here; the rest of the struct is not shown. */

// https://github.com/lizrice/learning-ebpf/blob/e0fbafd4b88fa6a44eb9e4679e695b8f83b5a052/chapter6/hello-verifier.skel.h#L85
/* Open the skeleton without any custom open options (opts == NULL). */
static inline struct hello_verifier_bpf *
hello_verifier_bpf__open(void)
{
	struct hello_verifier_bpf *skel = hello_verifier_bpf__open_opts(NULL);

	return skel;
}

// https://github.com/lizrice/learning-ebpf/blob/e0fbafd4b88fa6a44eb9e4679e695b8f83b5a052/chapter6/hello-verifier.skel.h#L58
/*
 * Open the skeleton with caller-supplied open options.
 *
 * NOTE: abridged excerpt; the "..." lines stand for code omitted by the
 * article (including the err_out label targeted by the goto).
 */
static inline struct hello_verifier_bpf *
hello_verifier_bpf__open_opts(const struct bpf_object_open_opts *opts)
{
	struct hello_verifier_bpf *obj;
	int err;
...
	err = hello_verifier_bpf__create_skeleton(obj); // sets obj->skeleton
	if (err)
		goto err_out;

	/* Open the BPF object described by the skeleton, passing opts through. */
	err = bpf_object__open_skeleton(obj->skeleton, opts);
...
}

// https://github.com/lizrice/learning-ebpf/blob/e0fbafd4b88fa6a44eb9e4679e695b8f83b5a052/chapter6/hello-verifier.skel.h#L129
/*
 * Build the generic skeleton description for hello-verifier and store it in
 * obj->skeleton.
 *
 * The description and its maps[]/progs[] tables are heap-allocated here;
 * nothing may be written through `s` before that allocation has succeeded.
 *
 * Returns 0 on success or -ENOMEM if an allocation fails, in which case
 * everything allocated so far is released and obj->skeleton is left untouched.
 */
static inline int
hello_verifier_bpf__create_skeleton(struct hello_verifier_bpf *obj)
{
	struct bpf_object_skeleton *s;
	int err;

	/* calloc() zeroes the struct, so unset pointers are NULL on the error path */
	s = (struct bpf_object_skeleton *)calloc(1, sizeof(*s));
	if (!s) {
		err = -ENOMEM;
		goto err;
	}

	/* skeleton name and the slot that will receive the opened object */
	s->sz = sizeof(*s);
	s->name = "hello_verifier_bpf";
	s->obj = &obj->obj;

	/* maps */
	s->map_cnt = 4;
	s->map_skel_sz = sizeof(*s->maps);
	s->maps = (struct bpf_map_skeleton *)calloc(s->map_cnt, s->map_skel_sz);
	if (!s->maps) {
		err = -ENOMEM;
		goto err;
	}

	s->maps[0].name = "my_config";
	s->maps[0].map = &obj->maps.my_config;

	s->maps[1].name = "output";
	s->maps[1].map = &obj->maps.output;

	s->maps[2].name = "hello_ve.data";
	s->maps[2].map = &obj->maps.data;
	s->maps[2].mmaped = (void **)&obj->data;

	s->maps[3].name = "hello_ve.rodata";
	s->maps[3].map = &obj->maps.rodata;

	/* programs */
	s->prog_cnt = 2;
	s->prog_skel_sz = sizeof(*s->progs);
	s->progs = (struct bpf_prog_skeleton *)calloc(s->prog_cnt, s->prog_skel_sz);
	if (!s->progs) {
		err = -ENOMEM;
		goto err;
	}

	/* every program carries a link slot, filled in later by the attach step */
	s->progs[0].name = "kprobe_exec";
	s->progs[0].prog = &obj->progs.kprobe_exec;
	s->progs[0].link = &obj->links.kprobe_exec;

	s->progs[1].name = "xdp_hello";
	s->progs[1].prog = &obj->progs.xdp_hello;
	s->progs[1].link = &obj->links.xdp_hello;

	/* point s->data at the embedded object bytes */
	s->data = (void *)hello_verifier_bpf__elf_bytes(&s->data_sz);

	obj->skeleton = s;
	return 0;

err:
	if (s) {
		free(s->maps);
		free(s->progs);
	}
	free(s);
	return err;
}

// https://github.com/libbpf/libbpf/blob/a6d7530cb7dff87ac1e64a540e63b67ddde2e0f9/src/libbpf.c#L12202
/*
 * Open the BPF object embedded in skeleton `s` and fill in the skeleton's
 * map and program handles.
 *
 * `opts` may be NULL. Returns 0 on success or the result of libbpf_err()
 * for the failing step.
 */
int bpf_object__open_skeleton(struct bpf_object_skeleton *s,
			      const struct bpf_object_open_opts *opts)
{
	DECLARE_LIBBPF_OPTS(bpf_object_open_opts, skel_opts,
		.object_name = s->name,
	);
	struct bpf_object *opened;
	int rc;

	/* Start from the caller's options when given, but fall back to the
	 * skeleton name unless the caller explicitly chose an object name.
	 * Overriding the name is discouraged for skeletons: global data maps
	 * carry the object name as a prefix of their own name, and bpftool
	 * assumes that name stays the same when it generates the skeleton.
	 */
	if (opts != NULL) {
		memcpy(&skel_opts, opts, sizeof(*opts));
		if (skel_opts.object_name == NULL)
			skel_opts.object_name = s->name;
	}

	/* s->data holds the embedded object bytes; build a bpf_object from them */
	opened = bpf_object__open_mem(s->data, s->data_sz, &skel_opts);
	rc = libbpf_get_error(opened);
	if (rc != 0) {
		pr_warn("failed to initialize skeleton BPF object '%s': %d\n",
			s->name, rc);
		return libbpf_err(rc);
	}
	*s->obj = opened;

	rc = populate_skeleton_maps(opened, s->maps, s->map_cnt);
	if (rc != 0) {
		pr_warn("failed to populate skeleton maps for '%s': %d\n", s->name, rc);
		return libbpf_err(rc);
	}

	rc = populate_skeleton_progs(opened, s->progs, s->prog_cnt);
	if (rc != 0) {
		pr_warn("failed to populate skeleton progs for '%s': %d\n", s->name, rc);
		return libbpf_err(rc);
	}

	return 0;
}

// 使用 字节码 obj buff 来创建一个 bpf_object 对象
/*
 * Create a bpf_object from an in-memory object buffer.
 * Rejects a NULL or empty buffer with -EINVAL (via libbpf_err_ptr()).
 */
struct bpf_object *
bpf_object__open_mem(const void *obj_buf, size_t obj_buf_sz,
		     const struct bpf_object_open_opts *opts)
{
	struct bpf_object *created;

	if (obj_buf == NULL || obj_buf_sz == 0)
		return libbpf_err_ptr(-EINVAL);

	/* bpf_object_open() does the actual construction of the object */
	created = bpf_object_open(NULL, obj_buf, obj_buf_sz, opts);
	return libbpf_ptr(created);
}


// https://github.com/libbpf/libbpf/blob/a6d7530cb7dff87ac1e64a540e63b67ddde2e0f9/src/libbpf.c#L7132
/*
 * Build a struct bpf_object from a path or an in-memory object buffer.
 *
 * NOTE: abridged excerpt; the elision markers stand for code omitted by the
 * article (including object allocation, option parsing and the `out` label).
 */
static struct bpf_object *bpf_object_open(const char *path, const void *obj_buf, size_t obj_buf_sz,
					  const struct bpf_object_open_opts *opts)
{
	const char *obj_name, *kconfig, *btf_tmp_path;
	struct bpf_object *obj;
	char tmp_name[64];
	int err;
	char *log_buf;
	size_t log_size;
	__u32 log_level;

// ... (code omitted in this excerpt)
	/* remember the log buffer settings on the object */
	obj->log_buf = log_buf;
	obj->log_size = log_size;
	obj->log_level = log_level;

	/* optional custom BTF path: reject over-long paths, keep a private copy */
	btf_tmp_path = OPTS_GET(opts, btf_custom_path, NULL);
	if (btf_tmp_path) {
		if (strlen(btf_tmp_path) >= PATH_MAX) {
			err = -ENAMETOOLONG;
			goto out;
		}
		obj->btf_custom_path = strdup(btf_tmp_path);
		if (!obj->btf_custom_path) {
			err = -ENOMEM;
			goto out;
		}
	}
// ... (code omitted in this excerpt)
	/* Each step runs only if all previous steps returned 0 (err ? : next). */
	err = bpf_object__elf_init(obj);
	err = err ? : bpf_object__check_endianness(obj);
	err = err ? : bpf_object__elf_collect(obj);
	err = err ? : bpf_object__collect_externs(obj);
	err = err ? : bpf_object__finalize_btf(obj);
	err = err ? : bpf_object__init_maps(obj, opts);
	/* bpf_object_init_progs() sets each program's type and
	 * expected_attach_type based on its SEC() name.
	 */
	err = err ? : bpf_object_init_progs(obj, opts);
	err = err ? : bpf_object__collect_relos(obj);
。。。
}


// https://github.com/libbpf/libbpf/blob/a6d7530cb7dff87ac1e64a540e63b67ddde2e0f9/src/libbpf.c#L7099
/*
 * For every program in `obj`, look up the section definition matching its
 * ELF section name and derive the program type and expected attach type
 * from it. Programs with an unrecognised section name are left as they are.
 *
 * Returns 0 on success, or the negative value returned by a section
 * definition's prog_setup_fn callback.
 */
static int bpf_object_init_progs(struct bpf_object *obj, const struct bpf_object_open_opts *opts)
{
	struct bpf_program *prog;
	int rc;

	bpf_object__for_each_program(prog, obj) {
		/* key step: the section name selects the section definition */
		const struct bpf_sec_def *def = find_sec_def(prog->sec_name);

		prog->sec_def = def;
		if (def == NULL) {
			/* couldn't guess, but user might manually specify */
			pr_debug("prog '%s': unrecognized ELF section name '%s'\n",
				prog->name, prog->sec_name);
			continue;
		}

		/* copy program type and expected attach type from the definition */
		prog->type = def->prog_type;
		prog->expected_attach_type = def->expected_attach_type;

		/* a definition may carry a callback that adjusts the program's
		 * properties once the program has been initialized
		 */
		if (def->prog_setup_fn == NULL)
			continue;

		rc = def->prog_setup_fn(prog, def->cookie);
		if (rc < 0) {
			pr_warn("prog '%s': failed to initialize: %d\n",
				prog->name, rc);
			return rc;
		}
	}

	return 0;
}

// https://github.com/libbpf/libbpf/blob/a6d7530cb7dff87ac1e64a540e63b67ddde2e0f9/src/libbpf.c#L8676
// 根据 sec_name 查找 section_def 数组,返回对应的定义。
/*
 * Return the section definition matching `sec_name`.
 *
 * Custom definitions are searched first, then the section_defs table; if
 * neither matches, the custom fallback definition is returned when one is
 * set, otherwise NULL.
 */
static const struct bpf_sec_def *find_sec_def(const char *sec_name)
{
	const int custom_cnt = custom_sec_def_cnt;
	const int builtin_cnt = ARRAY_SIZE(section_defs);
	int idx;

	for (idx = 0; idx < custom_cnt; idx++) {
		if (sec_def_matches(&custom_sec_defs[idx], sec_name))
			return &custom_sec_defs[idx];
	}

	/* then the section_defs table */
	for (idx = 0; idx < builtin_cnt; idx++) {
		if (sec_def_matches(&section_defs[idx], sec_name))
			return &section_defs[idx];
	}

	return has_custom_fallback_def ? &custom_fallback_def : NULL;
}

2 hello_verifier_bpf__load()
#

根据传入的 struct hello_verifier_bpf 调用 bpf_object_load()

  1. bump_rlimit_memlock
  2. err = err ? : bpf_object__load_vmlinux_btf(obj, false);
  3. err = err ? : bpf_object__resolve_externs(obj, obj->kconfig);
  4. err = err ? : bpf_object__sanitize_and_load_btf(obj);
  5. err = err ? : bpf_object__sanitize_maps(obj);
  6. err = err ? : bpf_object__init_kern_struct_ops_maps(obj);
  7. err = err ? : bpf_object__create_maps(obj); // 创建 Maps 和 pinned path
  8. err = err ? : bpf_object__relocate(obj, obj->btf_custom_path ? : target_btf_path);
  9. err = err ? : bpf_object__load_progs(obj, extra_log_level);
  10. err = err ? : bpf_object_init_prog_arrays(obj);
  11. err = err ? : bpf_object_prepare_struct_ops(obj);
// https://github.com/lizrice/learning-ebpf/blob/e0fbafd4b88fa6a44eb9e4679e695b8f83b5a052/chapter6/hello-verifier.skel.h#L91
/* Load the skeleton by handing its generic description to libbpf. */
static inline int
hello_verifier_bpf__load(struct hello_verifier_bpf *obj)
{
	struct bpf_object_skeleton *skel = obj->skeleton;

	return bpf_object__load_skeleton(skel);
}


// https://github.com/libbpf/libbpf/blob/a6d7530cb7dff87ac1e64a540e63b67ddde2e0f9/src/libbpf.c#L12317

/*
 * Load the skeleton's BPF object, then re-mmap() every map that registered
 * an `mmaped` pointer so the pointer refers to the map-backed memory.
 *
 * Returns 0 on success or the result of libbpf_err() for the failing step.
 */
int bpf_object__load_skeleton(struct bpf_object_skeleton *s)
{
	int idx, rc;

	/* key step: load the whole bpf_object */
	rc = bpf_object__load(*s->obj);
	if (rc) {
		pr_warn("failed to load BPF skeleton '%s': %d\n", s->name, rc);
		return libbpf_err(rc);
	}

	for (idx = 0; idx < s->map_cnt; idx++) {
		struct bpf_map *map = *s->maps[idx].map;
		size_t mmap_sz = bpf_map_mmap_sz(map);
		int prot, map_fd = bpf_map__fd(map);
		void **mmaped = s->maps[idx].mmaped;

		/* this map did not register an mmaped pointer */
		if (mmaped == NULL)
			continue;

		/* the map cannot be mmap()-ed: clear the pointer and move on */
		if ((map->def.map_flags & BPF_F_MMAPABLE) == 0) {
			*mmaped = NULL;
			continue;
		}

		prot = (map->def.map_flags & BPF_F_RDONLY_PROG)
			? PROT_READ
			: (PROT_READ | PROT_WRITE);

		/* Replace the anonymous mmap()-ed "map initialization image"
		 * with a mapping backed by the BPF map, at the very same
		 * address. The kernel repoints the process' page table to a
		 * different piece of kernel memory, but the address seen by
		 * userspace (and its contents, identical at this point) does
		 * not change. bpf_object__close() releases this mapping as
		 * part of normal clean up, so the skeleton's own clean up
		 * does not have to deal with it.
		 */
		*mmaped = mmap(map->mmaped, mmap_sz, prot,
				MAP_SHARED | MAP_FIXED, map_fd, 0);
		if (*mmaped != MAP_FAILED)
			continue;

		rc = -errno;
		*mmaped = NULL;
		pr_warn("failed to re-mmap() map '%s': %d\n",
			 bpf_map__name(map), rc);
		return libbpf_err(rc);
	}

	return 0;
}

// https://github.com/libbpf/libbpf/blob/f7eb43b90f4c8882edf6354f8585094f8f3aade0/src/libbpf.c#L7911
/* Public load entry point: no extra log level, no explicit target BTF path. */
int bpf_object__load(struct bpf_object *obj)
{
	const int extra_log_level = 0;
	const char *target_btf_path = NULL;

	return bpf_object_load(obj, extra_log_level, target_btf_path);
}

// https://github.com/libbpf/libbpf/blob/f7eb43b90f4c8882edf6354f8585094f8f3aade0/src/libbpf.c#L7842
/*
 * Load an opened bpf_object: BTF, maps, relocations and programs.
 * Rejects a NULL object and an object that has already been loaded.
 *
 * NOTE: abridged excerpt; the trailing "..." stands for code omitted by the
 * article.
 */
static int bpf_object_load(struct bpf_object *obj, int extra_log_level, const char *target_btf_path)
{
	int err, i;

	if (!obj)
		return libbpf_err(-EINVAL);

	if (obj->loaded) {
		pr_warn("object '%s': load can't be attempted twice\n", obj->name);
		return libbpf_err(-EINVAL);
	}

	if (obj->gen_loader)
		bpf_gen__init(obj->gen_loader, extra_log_level, obj->nr_programs, obj->nr_maps);

	/* First check that a trivial program can be loaded at all (tries
	 * BPF_PROG_TYPE_SOCKET_FILTER, then BPF_PROG_TYPE_TRACEPOINT).
	 * Each later step runs only if all previous ones returned 0.
	 */
	err = bpf_object__probe_loading(obj);
	err = err ? : bpf_object__load_vmlinux_btf(obj, false);
	err = err ? : bpf_object__resolve_externs(obj, obj->kconfig);
	err = err ? : bpf_object__sanitize_and_load_btf(obj);
	err = err ? : bpf_object__sanitize_maps(obj);
	err = err ? : bpf_object__init_kern_struct_ops_maps(obj);
	/* create the maps */
	err = err ? : bpf_object__create_maps(obj);
	err = err ? : bpf_object__relocate(obj, obj->btf_custom_path ? : target_btf_path);
	/* load all programs */
	err = err ? : bpf_object__load_progs(obj, extra_log_level);
	err = err ? : bpf_object_init_prog_arrays(obj);
	err = err ? : bpf_object_prepare_struct_ops(obj);
...
}

// https://github.com/libbpf/libbpf/blob/f7eb43b90f4c8882edf6354f8585094f8f3aade0/src/libbpf.c#L4485
/*
 * Sanity-check that the kernel can load a trivial two-instruction BPF
 * program at all. Skipped entirely when a gen_loader is in use.
 *
 * Returns 0 on success or the negated errno of the failed load attempt.
 */
static int
bpf_object__probe_loading(struct bpf_object *obj)
{
	struct bpf_insn insns[] = {
		BPF_MOV64_IMM(BPF_REG_0, 0),
		BPF_EXIT_INSN(),
	};
	int insn_cnt = ARRAY_SIZE(insns);
	char errmsg[STRERR_BUFSIZE];
	char *reason;
	int rc;

	if (obj->gen_loader)
		return 0;

	/* a failure to raise the limit is only reported, not fatal */
	rc = bump_rlimit_memlock();
	if (rc)
		pr_warn("Failed to bump RLIMIT_MEMLOCK (err = %d), you might need to do it explicitly!\n", rc);

	/* make sure basic loading works: socket filter first, tracepoint as fallback */
	rc = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", insns, insn_cnt, NULL);
	if (rc < 0)
		rc = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, NULL, "GPL", insns, insn_cnt, NULL);

	if (rc >= 0) {
		/* the probe program itself is not needed, only the fact that it loaded */
		close(rc);
		return 0;
	}

	rc = errno;
	reason = libbpf_strerror_r(rc, errmsg, sizeof(errmsg));
	pr_warn("Error in %s():%s(%d). Couldn't load trivial BPF "
		"program. Make sure your kernel supports BPF "
		"(CONFIG_BPF_SYSCALL=y) and/or that RLIMIT_MEMLOCK is "
		"set to big enough value.\n", __func__, reason, rc);
	return -rc;
}

// https://github.com/libbpf/libbpf/blob/f7eb43b90f4c8882edf6354f8585094f8f3aade0/src/bpf.c#L250
/*
 * Fill a union bpf_attr from the arguments and `opts`, then ask the kernel
 * to load the program. Returns the program fd when the load succeeds.
 *
 * NOTE: abridged excerpt; the "..." lines stand for code omitted by the
 * article (including option validation and the failure path).
 */
int bpf_prog_load(enum bpf_prog_type prog_type,
		  const char *prog_name, const char *license,
		  const struct bpf_insn *insns, size_t insn_cnt,
		  struct bpf_prog_load_opts *opts)
{
	const size_t attr_sz = offsetofend(union bpf_attr, log_true_size);
	void *finfo = NULL, *linfo = NULL;
	const char *func_info, *line_info;
	__u32 log_size, log_level, attach_prog_fd, attach_btf_obj_fd;
	__u32 func_info_rec_size, line_info_rec_size;
	int fd, attempts;
	union bpf_attr attr;
	char *log_buf;

	bump_rlimit_memlock();
...
	/* program type and the optional settings taken from opts (0 when unset) */
	attr.prog_type = prog_type;
	attr.expected_attach_type = OPTS_GET(opts, expected_attach_type, 0);

	attr.prog_btf_fd = OPTS_GET(opts, prog_btf_fd, 0);
	attr.prog_flags = OPTS_GET(opts, prog_flags, 0);
	attr.prog_ifindex = OPTS_GET(opts, prog_ifindex, 0);
	attr.kern_version = OPTS_GET(opts, kern_version, 0);

...
	/* pass the log buffer to the kernel only when a log level is set */
	if (log_level) {
		attr.log_buf = ptr_to_u64(log_buf);
		attr.log_size = log_size;
		attr.log_level = log_level;
	}

	/* key step: issue the bpf() system call that loads the program */
	fd = sys_bpf_prog_load(&attr, attr_sz, attempts);
	OPTS_SET(opts, log_true_size, attr.log_true_size);
	if (fd >= 0)
		return fd;
...
}


// https://github.com/libbpf/libbpf/blob/f7eb43b90f4c8882edf6354f8585094f8f3aade0/src/libbpf.c#L6815
/*
 * Load a single program of `obj` and store the resulting fd in *prog_fd.
 *
 * NOTE: abridged excerpt; the elision markers stand for code omitted by the
 * article (including load_attr setup, the retry logic and the `out` label).
 */
static int bpf_object_load_prog(struct bpf_object *obj, struct bpf_program *prog,
				struct bpf_insn *insns, int insns_cnt,
				const char *license, __u32 kern_version, int *prog_fd)
{
	LIBBPF_OPTS(bpf_prog_load_opts, load_attr);
	const char *prog_name = NULL;
	char *cp, errmsg[STRERR_BUFSIZE];
	size_t log_buf_size = 0;
	char *log_buf = NULL, *tmp;
	int btf_fd, ret, err;
	bool own_log_buf = true;
	__u32 log_level = prog->log_level;

	if (prog->type == BPF_PROG_TYPE_UNSPEC) {
		/*
		 * The program type must be set.  Most likely we couldn't find a proper
		 * section definition at load time, and thus we didn't infer the type.
		 */
		pr_warn("prog '%s': missing BPF prog type, check ELF section name '%s'\n",
			prog->name, prog->sec_name);
		return -EINVAL;
	}

	/* nothing to load without instructions */
	if (!insns || !insns_cnt)
		return -EINVAL;

。。。
	/* adjust load_attr if sec_def provides custom preload callback */
	if (prog->sec_def && prog->sec_def->prog_prepare_load_fn) {
		err = prog->sec_def->prog_prepare_load_fn(prog, &load_attr, prog->sec_def->cookie);
		if (err < 0) {
			pr_warn("prog '%s': failed to prepare load attributes: %d\n",
				prog->name, err);
			return err;
		}
		/* re-read the instructions from prog after the callback ran */
		insns = prog->insns;
		insns_cnt = prog->insns_cnt;
	}
。。。
retry_load:
。。。
	/* key step: load this program through bpf_prog_load() */
	ret = bpf_prog_load(prog->type, prog_name, license, insns, insns_cnt, &load_attr);
	if (ret >= 0) {
		if (log_level && own_log_buf) {
			pr_debug("prog '%s': -- BEGIN PROG LOAD LOG --\n%s-- END PROG LOAD LOG --\n",
				 prog->name, log_buf);
		}

		/* bind every rodata map of the object to the freshly loaded program */
		if (obj->has_rodata && kernel_supports(obj, FEAT_PROG_BIND_MAP)) {
			struct bpf_map *map;
			int i;

			for (i = 0; i < obj->nr_maps; i++) {
				map = &prog->obj->maps[i];
				if (map->libbpf_type != LIBBPF_MAP_RODATA)
					continue;

				if (bpf_prog_bind_map(ret, bpf_map__fd(map), NULL)) {
					cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg));
					pr_warn("prog '%s': failed to bind map '%s': %s\n",
						prog->name, map->real_name, cp);
					/* Don't fail hard if can't bind rodata. */
				}
			}
		}

		/* success: hand the program fd back to the caller */
		*prog_fd = ret;
		ret = 0;
		goto out;
	}
 。。。
}

其中 bpf_object__create_maps() 最终都是调用 ebpf 的 BPF_MAP_CREATE 来创建 Maps 的。

// https://github.com/libbpf/libbpf/blob/f7eb43b90f4c8882edf6354f8585094f8f3aade0/src/bpf.c#L165
/*
 * Create a BPF map by filling a union bpf_attr and issuing BPF_MAP_CREATE.
 *
 * `opts` may be NULL, in which case every optional attribute is 0.
 * Returns the result of libbpf_err_errno() applied to the fd from
 * sys_bpf_fd(), or the result of libbpf_err(-EINVAL) if `opts` is invalid.
 */
int bpf_map_create(enum bpf_map_type map_type,
		   const char *map_name,
		   __u32 key_size,
		   __u32 value_size,
		   __u32 max_entries,
		   const struct bpf_map_create_opts *opts)
{
	const size_t attr_len = offsetofend(union bpf_attr, map_extra);
	union bpf_attr attr;
	int map_fd;

	bump_rlimit_memlock();

	memset(&attr, 0, attr_len);

	if (!OPTS_VALID(opts, bpf_map_create_opts))
		return libbpf_err(-EINVAL);

	/* mandatory attributes, taken straight from the arguments */
	attr.map_type = map_type;
	attr.key_size = key_size;
	attr.value_size = value_size;
	attr.max_entries = max_entries;
	if (map_name && kernel_supports(NULL, FEAT_PROG_NAME))
		libbpf_strlcpy(attr.map_name, map_name, sizeof(attr.map_name));

	/* BTF related attributes */
	attr.btf_fd = OPTS_GET(opts, btf_fd, 0);
	attr.btf_key_type_id = OPTS_GET(opts, btf_key_type_id, 0);
	attr.btf_value_type_id = OPTS_GET(opts, btf_value_type_id, 0);
	attr.btf_vmlinux_value_type_id = OPTS_GET(opts, btf_vmlinux_value_type_id, 0);

	/* remaining optional attributes */
	attr.inner_map_fd = OPTS_GET(opts, inner_map_fd, 0);
	attr.map_flags = OPTS_GET(opts, map_flags, 0);
	attr.map_extra = OPTS_GET(opts, map_extra, 0);
	attr.numa_node = OPTS_GET(opts, numa_node, 0);
	attr.map_ifindex = OPTS_GET(opts, map_ifindex, 0);

	map_fd = sys_bpf_fd(BPF_MAP_CREATE, &attr, attr_len);
	return libbpf_err_errno(map_fd);
}

3 hello_verifier_bpf__attach()
#

以下类型不需要 auto attach,也不需要指定 handler_id:

  • perf_event

  • socket/sk_reuseport;

  • tc/classifier/action;

  • xdp/cgroup 等;

  • 需要 auto Attach,指定了对应的 handler_id;

    • k[ret]probe/u[ret]probe/k[ret]syscall/usdt:attach_kprobe/uprobe
    • tracepoint+/raw_tracepoint: attach_tp/attach_raw_tp
    • tp_btf/fentry/fexit/fmod_ret/freplace: attach_trace, 依赖 BTF 信息。

kprobe/uprobe 支持的 4 种 Attach 模式, 对于 4.19.91 及以后的内核都是 PROBE_ATTACH_MODE_DEFAULT 模式.

// https://github.com/libbpf/libbpf/blob/f7eb43b90f4c8882edf6354f8585094f8f3aade0/src/libbpf.h#L486
/**
 * enum probe_attach_mode - the mode to attach kprobe/uprobe
 *
 * force libbpf to attach kprobe/uprobe in specific mode, -ENOTSUP will
 * be returned if it is not supported by the kernel.
 *
 * PROBE_ATTACH_MODE_DEFAULT is 0, so an options struct that leaves the
 * mode unset selects it.
 */
enum probe_attach_mode {
	/* attach probe in latest supported mode by kernel */
	PROBE_ATTACH_MODE_DEFAULT = 0,
	/* attach probe in legacy mode, using debugfs/tracefs */
	PROBE_ATTACH_MODE_LEGACY,
	/* create perf event with perf_event_open() syscall */
	PROBE_ATTACH_MODE_PERF,
	/* attach probe with BPF link */
	PROBE_ATTACH_MODE_LINK,
};

// https://github.com/lizrice/learning-ebpf/blob/e0fbafd4b88fa6a44eb9e4679e695b8f83b5a052/chapter6/hello-verifier.skel.h#L115
/* Attach the skeleton's programs by handing its generic description to libbpf. */
static inline int
hello_verifier_bpf__attach(struct hello_verifier_bpf *obj)
{
	struct bpf_object_skeleton *skel = obj->skeleton;

	return bpf_object__attach_skeleton(skel);
}


// https://github.com/libbpf/libbpf/blob/f7eb43b90f4c8882edf6354f8585094f8f3aade0/src/libbpf.c#L12854
/*
 * Auto-attach every program of the skeleton whose section definition
 * provides an attach callback.
 *
 * A program is skipped when autoload or autoattach is off, when it has no
 * section definition or attach callback, or when its link is already set.
 * Returns 0 on success or the result of libbpf_err() for the first
 * callback that reports an error.
 */
int bpf_object__attach_skeleton(struct bpf_object_skeleton *s)
{
	int idx, rc;

	for (idx = 0; idx < s->prog_cnt; idx++) {
		struct bpf_program *prog = *s->progs[idx].prog;
		struct bpf_link **link = s->progs[idx].link;

		if (!(prog->autoload && prog->autoattach))
			continue;

		/* auto-attaching not supported for this program */
		if (prog->sec_def == NULL || prog->sec_def->prog_attach_fn == NULL)
			continue;

		/* if user already set the link manually, don't attempt auto-attach */
		if (*link != NULL)
			continue;

		/* perform the attach; the callback stores the new link in *link */
		rc = prog->sec_def->prog_attach_fn(prog, prog->sec_def->cookie, link);
		if (rc == 0) {
			/* Some SEC() definitions support auto-attach only in
			 * some cases (e.g., when the definition completely
			 * specifies the target). SEC("uprobe") is one such
			 * case: with a target binary and function name the
			 * program can be auto-attached, without them it is
			 * simply skipped and must not make the skeleton's
			 * attach fail. attach_fn signals this by returning 0
			 * (no error) and setting link to NULL.
			 */
			continue;
		}

		pr_warn("prog '%s': failed to auto-attach: %d\n",
			bpf_program__name(prog), rc);
		return libbpf_err(rc);
	}

	return 0;
}

各种 attach_XX 函数:

  • 这些函数在 libbpf.c 的 static const struct bpf_sec_def section_defs[] 中被配置,如:

    SEC_DEF("kprobe+", KPROBE, 0, SEC_NONE, attach_kprobe),
    

    表示,在 SEC Name 前缀匹配 kprobe+ 时,使用 attach_kprobe 进行 auto attach。

// https://github.com/libbpf/libbpf/blob/f7eb43b90f4c8882edf6354f8585094f8f3aade0/src/libbpf.c#L8646
/* Forward declarations of the attach callbacks; they all share one signature
 * (program, cookie, out-parameter for the resulting link).
 */
/* probes */
static int attach_kprobe(const struct bpf_program *prog, long cookie, struct bpf_link **link);
static int attach_uprobe(const struct bpf_program *prog, long cookie, struct bpf_link **link);
static int attach_ksyscall(const struct bpf_program *prog, long cookie, struct bpf_link **link);
static int attach_usdt(const struct bpf_program *prog, long cookie, struct bpf_link **link);
// tracepoint
static int attach_tp(const struct bpf_program *prog, long cookie, struct bpf_link **link);
static int attach_raw_tp(const struct bpf_program *prog, long cookie, struct bpf_link **link);
// tracing
static int attach_trace(const struct bpf_program *prog, long cookie, struct bpf_link **link);
static int attach_kprobe_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link);
static int attach_lsm(const struct bpf_program *prog, long cookie, struct bpf_link **link);
static int attach_iter(const struct bpf_program *prog, long cookie, struct bpf_link **link);

注意:auto attach 是 libbpf skeleton 封装的能力,调用链为:用户态程序调用 hello_verifier_bpf__attach()(skeleton 生成的) -> bpf_object__attach_skeleton() -> prog->sec_def->prog_attach_fn()(即各种 attach_XX 函数)。

实现方式:

  1. probe 类:k[ret]probe/ksyscall/k[ret]probe.multi/u[ret]probe/usdt

    1. pfd = perf_event_open_probe(false, retprobe, func_name, offset, -1, 0)
      • 第一个参数(uprobe): false 为 kprobe,true 为 uprobe;
      • 最终调用系统调用 perf_event_open() 创建一个 perf event buffer;
    2. link = bpf_program__attach_perf_event_opts(prog, pfd, &pe_opts);
  2. tracepoint 类:tp/tracepoint

    1. pfd = perf_event_open_tracepoint(tp_category, tp_name);
      • 最终调用系统调用 perf_event_open() 创建一个 perf event buffer;
    2. link = bpf_program__attach_perf_event_opts(prog, pfd, &pe_opts);

    probe 和 tracepoint 相似的地方:

    1. 都先 open 一个 perf event buffer 获得 pfd,前者为 perf_event_open_probe,后者为 perf_event_open_tracepoint;
    2. 都使用 bpf_program__attach_perf_event_opts(prog, pfd, &pe_opts); 将 prog 和 pfd 绑定,该函数内部的原理:
      • 如果kernel 支持 PERF_LINK 则使用 bpf BPF_LINK_CREATE CMD 来创建一个 bpf_link (attr 中 link_create.perf_event.bpf_cookie 不为空)
      • 否则使用 ioctl(pfd, PERF_EVENT_IOC_SET_BPF, prog_fd) 和 ioctl(pfd, PERF_EVENT_IOC_ENABLE, 0) 来设置和开启前面 open 的 perf event tracepoint。
  3. raw_tracepoint 类:raw_tp/raw_tracepoint

    1. pfd = bpf_raw_tracepoint_open(tp_name, prog_fd);
      • 内部调用 bpf 系统调用 BPF_RAW_TRACEPOINT_OPEN 命令来创建一个 pfd;
    2. link->fd = pfd;

    raw_tracepoint 和 tracepoint 相比:不需要先 open 一个 perf event buffer,也不需要再将它与 Prog 绑定,而是直接使用 bpf 系统调用的 BPF_RAW_TRACEPOINT_OPEN 来创建一个 pfd。

  4. trace 类:tp_btf/fentry/fexit/fmod_ret/freplace: attach_trace

    1. pfd = bpf_link_create(prog_fd, 0, bpf_program__expected_attach_type(prog), &link_opts);
      • 内部调用 bpf 系统调用 BPF_LINK_CREATE 来创建 Link: fd = sys_bpf_fd(BPF_LINK_CREATE, &attr, attr_sz);
      • 对于不支持 bpf BPF_LINK_CREATE CMD 的老内核(内核头文件: include/uapi/linux/bpf.h),如 4.19.91 fallback 到和 raw_tracepoint 的实现机制,即使用 bpf CMD BPF_RAW_TRACEPOINT_OPEN 来创建一个 Program fd;
    2. link->fd = pfd;

    trace 类的特点:

    1. 和 raw_tracepoint 类似,不需要 open 一个 perf event buffer, 但是依赖 BTF 信息;

4 attach_kprobe
#

attach_kprobe 函数定义:

  1. 检查 SEC() name 是否是 kprobe/kretprobe 开头;

  2. 从 SEC Name 中提取要 kprobe 或 kretprobe 的 function name

  3. 创建一个 perf event probe

    pfd = perf_event_open_probe(false /* uprobe */, retprobe, func_name, offset, -1 /* pid */, 0 /* ref_ctr_off */)
    
  4. 将 Program 和上面的 perf event 绑定,返回一个 struct perf_link:

    link = bpf_program__attach_perf_event_opts(prog, pfd, &pe_opts);
    
// https://github.com/libbpf/libbpf/blob/f7eb43b90f4c8882edf6354f8585094f8f3aade0/src/libbpf.h#L486
/* Options accepted by bpf_program__attach_kprobe_opts(). */
struct bpf_kprobe_opts {
	/* size of this struct, for forward/backward compatibility */
	size_t sz;
	/* custom user-provided value fetchable through bpf_get_attach_cookie() */
	__u64 bpf_cookie;
	/* function's offset to install kprobe to */
	size_t offset;
	/* kprobe is return probe */
	bool retprobe;
	/* kprobe attach mode */
	enum probe_attach_mode attach_mode;
	/* unnamed zero-width bit-field */
	size_t :0;
};

// https://github.com/libbpf/libbpf/blob/f7eb43b90f4c8882edf6354f8585094f8f3aade0/src/libbpf.c#L10641
/*
 * Attach callback for kprobe/kretprobe sections.
 *
 * Parses "<func>[+<offset>]" out of the section name that follows the
 * "kprobe/" or "kretprobe/" prefix and attaches the program to that
 * function. A bare SEC("kprobe") / SEC("kretprobe") is not auto-attached:
 * *link stays NULL and 0 is returned.
 *
 * Returns 0 on success, -EINVAL for an unparsable name or an offset given
 * with a kretprobe, or the error extracted from the attach result.
 */
static int attach_kprobe(const struct bpf_program *prog, long cookie, struct bpf_link **link)
{
	/* opts starts out with attach_mode unset, i.e. PROBE_ATTACH_MODE_DEFAULT (0) */
	DECLARE_LIBBPF_OPTS(bpf_kprobe_opts, opts);
	const char *target;
	unsigned long off = 0;
	char *sym;
	int matched;

	*link = NULL;

	/* no auto-attach for SEC("kprobe") and SEC("kretprobe") */
	if (!strcmp(prog->sec_name, "kprobe") || !strcmp(prog->sec_name, "kretprobe"))
		return 0;

	/* key step: the text after the prefix names the function to probe */
	opts.retprobe = str_has_pfx(prog->sec_name, "kretprobe/");
	target = prog->sec_name +
		 (opts.retprobe ? sizeof("kretprobe/") : sizeof("kprobe/")) - 1;

	/* %m makes sscanf() allocate `sym`; it is freed on every path below */
	matched = sscanf(target, "%m[a-zA-Z0-9_.]+%li", &sym, &off);
	if (matched < 1) {
		pr_warn("kprobe name is invalid: %s\n", target);
		return -EINVAL;
	}
	if (opts.retprobe && off != 0) {
		free(sym);
		pr_warn("kretprobes do not support offset specification\n");
		return -EINVAL;
	}

	opts.offset = off;

	/* key step: attach to the parsed function name */
	*link = bpf_program__attach_kprobe_opts(prog, sym, &opts);
	free(sym);
	return libbpf_get_error(*link);
}

// https://github.com/libbpf/libbpf/blob/f7eb43b90f4c8882edf6354f8585094f8f3aade0/src/libbpf.c#L10373
/*
 * Attach `prog` as a kprobe/kretprobe on `func_name`, honouring `opts`.
 *
 * NOTE: abridged excerpt; the "..." lines stand for code omitted by the
 * article (including the legacy path and the err_clean_legacy label).
 */
struct bpf_link *
bpf_program__attach_kprobe_opts(const struct bpf_program *prog,
				const char *func_name,
				const struct bpf_kprobe_opts *opts)
{
	/* declares pe_opts, an object of type bpf_perf_event_opts */
	DECLARE_LIBBPF_OPTS(bpf_perf_event_opts, pe_opts);
	enum probe_attach_mode attach_mode;
	char errmsg[STRERR_BUFSIZE];
	char *legacy_probe = NULL;
	struct bpf_link *link;
	size_t offset;
	bool retprobe, legacy;
	int pfd, err;

	if (!OPTS_VALID(opts, bpf_kprobe_opts))
		return libbpf_err_ptr(-EINVAL);

	/* read the settings from opts; attach mode defaults to PROBE_ATTACH_MODE_DEFAULT */
	attach_mode = OPTS_GET(opts, attach_mode, PROBE_ATTACH_MODE_DEFAULT);
	retprobe = OPTS_GET(opts, retprobe, false);
	offset = OPTS_GET(opts, offset, 0);
	pe_opts.bpf_cookie = OPTS_GET(opts, bpf_cookie, 0);

	/* Legacy mode is used when determine_kprobe_perf_type() is negative.
	 * Article author's note: on 4.19.91 / 5.15.0 kernels it returns 6,
	 * i.e. non-legacy mode.
	 */
	legacy = determine_kprobe_perf_type() < 0;
	switch (attach_mode) {
	case PROBE_ATTACH_MODE_LEGACY:
		legacy = true;
		pe_opts.force_ioctl_attach = true;
		break;
	case PROBE_ATTACH_MODE_PERF:
		if (legacy)
			return libbpf_err_ptr(-ENOTSUP);
		pe_opts.force_ioctl_attach = true;
		break;
	case PROBE_ATTACH_MODE_LINK:
		if (legacy || !kernel_supports(prog->obj, FEAT_PERF_LINK))
			return libbpf_err_ptr(-ENOTSUP);
		break;
	case PROBE_ATTACH_MODE_DEFAULT:
		break;
	default:
		return libbpf_err_ptr(-EINVAL);
	}

	if (!legacy) {
		/* Non-legacy path (per the article, used from kernel 4.19 on):
		 * open a perf event probe on func_name.
		 */
		pfd = perf_event_open_probe(false /* uprobe */, retprobe,
					    func_name, offset,
					    -1 /* pid */, 0 /* ref_ctr_off */);
	}
...
	/* bind the program to the perf event; the result is the link */
	link = bpf_program__attach_perf_event_opts(prog, pfd, &pe_opts);
	err = libbpf_get_error(link);
	if (err) {
		close(pfd);
		pr_warn("prog '%s': failed to attach to %s '%s+0x%zx': %s\n",
			prog->name, retprobe ? "kretprobe" : "kprobe",
			func_name, offset,
			libbpf_strerror_r(err, errmsg, sizeof(errmsg)));
		goto err_clean_legacy;
	}
...
	return link;
...
}

内核可以 kprobe 的函数名称参考: /proc/kallsyms

PROGRAM TYPE: BPF_PROG_TYPE_KPROBE

SEC Name 格式 : SEC(“kprobe/XX”) 或 SEC(“kretprobe/XX”)

Event Handler func 参数类型 : struct pt_regs *

示例:

/*
 * kprobe handler for do_sys_open: copies up to sizeof(file_name) bytes from
 * the address in the probed function's second argument and prints them as a
 * string to the trace pipe.
 * Presumably that argument is the path name being opened (TODO confirm
 * against the kernel version in use).
 */
SEC("kprobe/do_sys_open")
int kprobe__do_sys_open(struct pt_regs *ctx)
{
	char file_name[256];
	char fmt[] = "file %s\n";

	/* PT_REGS_PARM2() yields an integer register value; cast it to a pointer */
	bpf_probe_read(file_name, sizeof(file_name), (const void *)PT_REGS_PARM2(ctx));
	/* bpf_probe_read() copies a fixed number of bytes and does not
	 * terminate the data, so force a NUL before printing with %s
	 */
	file_name[sizeof(file_name) - 1] = '\0';
	bpf_trace_printk(fmt, sizeof(fmt), file_name);
	return 0;
}

bpf/bpf_tracing.h 提供的 CO-RE 宏函数,可以直接写内核函数的参数列表。

  • BPF_KPROBE(name, args…):
    • SEC(“kprobe/XX”)
  • BPF_KRETPROBE(name, args…):
    • SEC(“kretprobe/XX”)
/* Signature of the kernel function being probed, shown for reference. */
int do_execve(
	struct filename *filename,
	const char __user *const __user *__argv,
	const char __user *const __user *__envp);

// do_execve is the name of a kernel function (not of a system call).
// BPF_KPROBE lets the handler list the probed function's arguments directly.
SEC("kprobe/do_execve")
int BPF_KPROBE(kprobe_do_execve, struct filename *filename) {
	/* the handler is declared to return int, so it must return a value */
	return 0;
}

// BPF_KRETPROBE: the handler lists the probed function's return value as its argument.
SEC("kretprobe/do_unlinkat")
int BPF_KRETPROBE(do_unlinkat_exit, long ret) {
	return 0;
}

5 attach_ksyscall
#

attach_ksyscall 函数定义:

// https://github.com/libbpf/libbpf/blob/f7eb43b90f4c8882edf6354f8585094f8f3aade0/src/libbpf.c#L10678
/*
 * Auto-attach handler for SEC("ksyscall/<name>") and SEC("kretsyscall/<name>").
 *
 * prog:   program whose section name selects the syscall to attach to
 * cookie: unused here
 * link:   receives the created link, or NULL when nothing was attached
 *
 * Returns 0 on success (including the "nothing to auto-attach" case) and
 * -errno when bpf_program__attach_ksyscall() returns NULL.
 */
static int attach_ksyscall(const struct bpf_program *prog, long cookie, struct bpf_link **link)
{
	LIBBPF_OPTS(bpf_ksyscall_opts, opts);
	const char *sec = prog->sec_name;
	size_t pfx_len;

	*link = NULL;

	/* A bare SEC("ksyscall") / SEC("kretsyscall") names no syscall: skip auto-attach. */
	if (!strcmp(sec, "ksyscall") || !strcmp(sec, "kretsyscall"))
		return 0;

	/* The syscall name is whatever follows the section prefix. */
	opts.retprobe = str_has_pfx(sec, "kretsyscall/");
	pfx_len = opts.retprobe ? sizeof("kretsyscall/") - 1 : sizeof("ksyscall/") - 1;

	*link = bpf_program__attach_ksyscall(prog, sec + pfx_len, &opts);
	if (!*link)
		return -errno;
	return 0;
}

PROGRAM TYPE: BPF_PROG_TYPE_KPROBE

SEC Name 格式: SEC(“ksyscall/XX”) 或 SEC(“kretsyscall/XX”)

Event Handler func 参数类型 : struct pt_regs *

由于从内核 4.17 开始,默认开启了 CONFIG_ARCH_HAS_SYSCALL_WRAPPER, 所以系统调用的名称都是类似于 __x64_sys_execveat 的格式,其中 __x64_sys 是体系结构相关的前缀。

为了避免写类似于 SEC(“kprobe/__x64_sys_execveat”) 的代码,libbpf 提供了特殊的 ksyscall 前缀,只需要使用syscall name 即可,在 attach 时自动添加 ARCH 前缀,如 __x64_sys_execveat

#zgrep CONFIG_ARCH_HAS_SYSCALL_WRAPPER  /proc/config.gz
CONFIG_ARCH_HAS_SYSCALL_WRAPPER=y

root@lima-learning-ebpf:/github.com/lizrice/learning-ebpf/chapter7# cat /sys/kernel/tracing/available_filter_functions  |grep _sys_bpf
__sys_bpf
bpf_sys_bpf
__ia32_sys_bpf
__x64_sys_bpf

# 需要手动先 mount 下 tracefs
mount -t tracefs nodev /sys/kernel/tracing

#grep _sys_bpf /sys/kernel/tracing/available_filter_functions
__ia32_sys_bpf
__x64_sys_bpf

#zgrep CONFIG_ARCH_HAS_SYSCALL_WRAPPER  /proc/config.gz
CONFIG_ARCH_HAS_SYSCALL_WRAPPER=y

⚠️:这个前缀是 libbpf 专用的,在 attach 阶段处理。cilium 没有这个前缀,需要换成 ‘kprobe/sys_XX’.

bpf/bpf_tracing.h 提供的 CO-RE 函数宏 BPF_KPROBE_SYSCALL 或 BPF_KSYSCALL(name, args…):

  • 只适用于系统调用函数,自动处理 CONFIG_ARCH_HAS_SYSCALL_WRAPPER;
  • libbpf:SEC(“ksyscall/execve”), libbpf 在 attach 时,自动添加 ARCH 前缀,如 __x64_sys_execve
  • cilium: SEC(“kprobe/sys_execve”),格式必须为 sys_XX,cilium 在 attach 时自动按需添加 ARCH 前缀,如 __x64_sys_execve

6 attach_kprobe_multi
#

attach_kprobe_multi 实现定义:

// https://github.com/libbpf/libbpf/blob/f7eb43b90f4c8882edf6354f8585094f8f3aade0/src/libbpf.c#L10699
/*
 * Auto-attach handler for SEC("kprobe.multi/<pattern>") and
 * SEC("kretprobe.multi/<pattern>").
 *
 * prog:   program whose section name carries the function-name pattern
 * cookie: unused here
 * link:   receives the created link, or NULL when nothing was attached
 *
 * Returns 0 on success (including the "nothing to auto-attach" case),
 * -EINVAL when the pattern cannot be parsed, or the error reported by
 * libbpf_get_error() for the attach call.
 */
static int attach_kprobe_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link)
{
	LIBBPF_OPTS(bpf_kprobe_multi_opts, opts);
	const char *spec;
	char *pattern = NULL;
	int n;

	*link = NULL;

	/* no auto-attach for SEC("kprobe.multi") and SEC("kretprobe.multi") */
	if (strcmp(prog->sec_name, "kprobe.multi") == 0 ||
	    strcmp(prog->sec_name, "kretprobe.multi") == 0)
		return 0;

	opts.retprobe = str_has_pfx(prog->sec_name, "kretprobe.multi/");
	if (opts.retprobe)
		spec = prog->sec_name + sizeof("kretprobe.multi/") - 1;
	else
		spec = prog->sec_name + sizeof("kprobe.multi/") - 1;

	/* %m makes sscanf() allocate the matched pattern; it is freed below. */
	n = sscanf(spec, "%m[a-zA-Z0-9_.*?]", &pattern);
	if (n < 1) {
		/*
		 * Nothing was matched, so 'pattern' was never assigned by sscanf():
		 * report the raw spec text instead of an unset pointer.
		 */
		pr_warn("kprobe multi pattern is invalid: %s\n", spec);
		return -EINVAL;
	}

	*link = bpf_program__attach_kprobe_multi_opts(prog, pattern, &opts);
	free(pattern);
	return libbpf_get_error(*link);
}

PROGRAM TYPE: BPF_PROG_TYPE_KPROBE

SEC Name 格式: SEC(“kprobe.multi/xx”) 或 SEC(“kretprobe.multi/xx”)

Event Handler func 参数类型 : struct pt_regs *

7 attach_uprobe
#

attach_uprobe 函数定义:

// https://github.com/libbpf/libbpf/blob/f7eb43b90f4c8882edf6354f8585094f8f3aade0/src/libbpf.c#L11285
/* Format of u[ret]probe section definition supporting auto-attach:
 * u[ret]probe/binary:function[+offset]
 *
 * binary can be an absolute/relative path or a filename; the latter is resolved to a
 * full binary path via bpf_program__attach_uprobe_opts.
 *
 * Specifying uprobe+ ensures we carry out strict matching; either "uprobe" must be
 * specified (and auto-attach is not possible) or the above format is specified for
 * auto-attach.
 */
/*
 * Auto-attach handler for u[ret]probe sections. The section name is parsed as
 * "<probe_type>/<binary>:<function>[+offset]"; how many pieces were parsed
 * decides what happens.
 *
 * prog:   program whose section name is parsed
 * cookie: unused here
 * link:   receives the created link, or NULL when nothing was attached
 *
 * Returns 0 for a bare SEC("u[ret]probe") or a successful attach, -EINVAL for
 * a malformed section, or the error reported by libbpf_get_error().
 */
static int attach_uprobe(const struct bpf_program *prog, long cookie, struct bpf_link **link)
{
	DECLARE_LIBBPF_OPTS(bpf_uprobe_opts, opts);
	char *probe_type = NULL, *binary_path = NULL, *func_name = NULL;
	long offset = 0;
	int ret = -EINVAL;
	int matched;

	*link = NULL;

	matched = sscanf(prog->sec_name, "%m[^/]/%m[^:]:%m[a-zA-Z0-9_.]+%li",
			 &probe_type, &binary_path, &func_name, &offset);

	if (matched == 1) {
		/* handle SEC("u[ret]probe") - format is valid, but auto-attach is impossible. */
		ret = 0;
	} else if (matched == 2) {
		pr_warn("prog '%s': section '%s' missing ':function[+offset]' specification\n",
			prog->name, prog->sec_name);
	} else if (matched == 3 || matched == 4) {
		opts.retprobe = !strcmp(probe_type, "uretprobe") ||
				!strcmp(probe_type, "uretprobe.s");
		if (opts.retprobe && offset != 0) {
			pr_warn("prog '%s': uretprobes do not support offset specification\n",
				prog->name);
		} else {
			opts.func_name = func_name;
			*link = bpf_program__attach_uprobe_opts(prog, -1, binary_path,
								offset, &opts);
			ret = libbpf_get_error(*link);
		}
	} else {
		pr_warn("prog '%s': invalid format of section definition '%s'\n", prog->name,
			prog->sec_name);
	}

	/* The %m conversions allocate; release whatever sscanf() handed back. */
	free(probe_type);
	free(binary_path);
	free(func_name);

	return ret;
}

PROGRAM TYPE: BPF_PROG_TYPE_KPROBE

SEC name 格式: u[ret]probe/binary:function[+offset]

  • binary can be an absolute/relative path or a filename

Event Handler func 参数类型 : struct pt_regs *

cilium 使用 Executable 对象的 Uprobe/Uretprobe() 方法来执行 attach。

示例:

SEC("uprobe/lkm_seeker")
int uprobe_lkm_seeker(struct pt_regs *ctx) {
}

查看 binary 可以 uprobe 的函数列表:

  • 使用 objdump 工具;
  • 使用 bpftrace -l 命令;
root@lima-ebpf-dev:/Users/zhangjun# objdump -tT /usr/sbin/sshd |grep Base
0000000000000000      DF *UND*	0000000000000000  Base        deflateInit_
0000000000000000      DF *UND*	0000000000000000  Base        request_init
0000000000000000      DF *UND*	0000000000000000  Base        audit_log_acct_message
0000000000000000      DF *UND*	0000000000000000  Base        hosts_access
0000000000000000      DF *UND*	0000000000000000  Base        audit_open
0000000000000000      DF *UND*	0000000000000000  Base        deflate
0000000000000000      DF *UND*	0000000000000000  Base        deflateEnd
0000000000000000      DF *UND*	0000000000000000  Base        sock_host
0000000000000000      DF *UND*	0000000000000000  Base        inflate
0000000000000000      DF *UND*	0000000000000000  Base        inflateInit_
0000000000000000      DF *UND*	0000000000000000  Base        inflateEnd
0000000000000000      DF *UND*	0000000000000000  Base        error_message
0000000000000000      DF *UND*	0000000000000000  Base        refuse
0000000000000000  w   D  *UND*	0000000000000000  Base        _ITM_deregisterTMCloneTable
0000000000000000  w   D  *UND*	0000000000000000  Base        __gmon_start__
0000000000000000  w   D  *UND*	0000000000000000  Base        _ITM_registerTMCloneTable
0000000000072d50 g    DF .text	0000000000000182  Base        closefrom
00000000000e0ba4 g    DO .bss	0000000000000004  Base        allow_severity
00000000000e0ba0 g    DO .bss	0000000000000004  Base        deny_severity
0000000000073610 g    DF .text	0000000000000007  Base        setlogin

root@lima-ebpf-dev:/Users/zhangjun#  objdump -tT /usr/bin/bash|grep readline
0000000000155f18 g    DO .bss	0000000000000008  Base        rl_readline_state
00000000000d52f0 g    DF .text	0000000000000392  Base        readline_internal_char
00000000000d42d0 g    DF .text	0000000000000260  Base        readline_internal_setup
0000000000097e40 g    DF .text	00000000000000dd  Base        posix_readline_initialize
00000000000d5690 g    DF .text	00000000000000c9  Base        readline
0000000000155f20 g    DO .bss	0000000000000004  Base        bash_readline_initialized
0000000000154528 g    DO .data	0000000000000008  Base        rl_readline_name
00000000001555e8 g    DO .data	0000000000000004  Base        rl_readline_version
0000000000095630 g    DF .text	000000000000001d  Base        initialize_readline
0000000000155ca0 g    DO .bss	0000000000000004  Base        current_readline_line_index
0000000000155bf0 g    DO .bss	0000000000000008  Base        current_readline_prompt
000000000009f2b0 g    DF .text	0000000000000051  Base        pcomp_set_readline_variables
0000000000155ca8 g    DO .bss	0000000000000008  Base        current_readline_line
00000000000d4530 g    DF .text	00000000000002ea  Base        readline_internal_teardown
00000000001555ec g    DO .data	0000000000000004  Base        rl_gnu_readline_p

root@lima-ebpf-dev:/Users/zhangjun# bpftrace -l "uprobe:/usr/sbin/sshd:*"
uprobe:/usr/sbin/sshd:closefrom
uprobe:/usr/sbin/sshd:setlogin

8 attach_usdt
#

attach_usdt 函数定义: 最终还是调用 uprobe:

pfd = perf_event_open_probe(true /* uprobe */, retprobe, binary_path, func_offset, pid, ref_ctr_off);
// https://github.com/libbpf/libbpf/blob/f7eb43b90f4c8882edf6354f8585094f8f3aade0/src/libbpf.c#L11393
/*
 * Auto-attach handler for SEC("usdt/<path>:<provider>:<name>").
 *
 * prog:   program whose section name is parsed
 * cookie: unused here
 * link:   receives the created link; set to NULL for a bare SEC("usdt")
 *
 * Returns 0 for a bare SEC("usdt") or a successful attach, -EINVAL when the
 * section name does not have all three parts, or the error reported by
 * libbpf_get_error().
 */
static int attach_usdt(const struct bpf_program *prog, long cookie, struct bpf_link **link)
{
	char *path = NULL, *provider = NULL, *name = NULL;
	const char *sec_name = bpf_program__section_name(prog);
	int err = -EINVAL;

	if (!strcmp(sec_name, "usdt")) {
		/* no auto-attach for just SEC("usdt") */
		*link = NULL;
		return 0;
	}

	if (sscanf(sec_name, "usdt/%m[^:]:%m[^:]:%m[^:]", &path, &provider, &name) == 3) {
		*link = bpf_program__attach_usdt(prog, -1 /* any process */, path,
						 provider, name, NULL);
		err = libbpf_get_error(*link);
	} else {
		pr_warn("invalid section '%s', expected SEC(\"usdt/<path>:<provider>:<name>\")\n",
			sec_name);
	}

	/* The %m conversions allocate; release whatever sscanf() handed back. */
	free(path);
	free(provider);
	free(name);
	return err;
}


struct bpf_link *bpf_program__attach_usdt(const struct bpf_program *prog,
					  pid_t pid, const char *binary_path,
					  const char *usdt_provider, const char *usdt_name,
					  const struct bpf_usdt_opts *opts)
{
	char resolved_path[512];
	struct bpf_object *obj = prog->obj;
	struct bpf_link *link;
	__u64 usdt_cookie;
	int err;

	if (!OPTS_VALID(opts, bpf_uprobe_opts))
		return libbpf_err_ptr(-EINVAL);

	if (bpf_program__fd(prog) < 0) {
		pr_warn("prog '%s': can't attach BPF program w/o FD (did you load it?)\n",
			prog->name);
		return libbpf_err_ptr(-EINVAL);
	}

	if (!binary_path)
		return libbpf_err_ptr(-EINVAL);

	if (!strchr(binary_path, '/')) {
		err = resolve_full_path(binary_path, resolved_path, sizeof(resolved_path));
		if (err) {
			pr_warn("prog '%s': failed to resolve full path for '%s': %d\n",
				prog->name, binary_path, err);
			return libbpf_err_ptr(err);
		}
		binary_path = resolved_path;
	}

	/* USDT manager is instantiated lazily on first USDT attach. It will
	 * be destroyed together with BPF object in bpf_object__close().
	 */
	if (IS_ERR(obj->usdt_man))
		return libbpf_ptr(obj->usdt_man);
	if (!obj->usdt_man) {
		obj->usdt_man = usdt_manager_new(obj);
		if (IS_ERR(obj->usdt_man))
			return libbpf_ptr(obj->usdt_man);
	}

	usdt_cookie = OPTS_GET(opts, usdt_cookie, 0);
	link = usdt_manager_attach_usdt(obj->usdt_man, prog, pid, binary_path,
					usdt_provider, usdt_name, usdt_cookie);
	err = libbpf_get_error(link);
	if (err)
		return libbpf_err_ptr(err);
	return link;
}


/*
 * Attach prog to every USDT target matching usdt_provider:usdt_name in the
 * ELF binary at path, one uprobe per target, and bundle them into one link.
 *
 * man:           USDT manager holding the spec map and the IP-to-spec-ID map
 * prog:          program to attach
 * pid:           negative values are normalized to -1, 0 means the calling
 *                process (getpid()), positive values are passed through
 * path:          path of the ELF binary to open
 * usdt_provider: USDT provider name
 * usdt_name:     USDT probe name
 * usdt_cookie:   cookie forwarded to target collection
 *
 * Returns a pointer to the embedded bpf_link on success. On failure all
 * partially acquired resources are released and libbpf_err_ptr(err) is
 * returned; finding zero targets is reported as -ENOENT.
 */
struct bpf_link *usdt_manager_attach_usdt(struct usdt_manager *man, const struct bpf_program *prog,
					  pid_t pid, const char *path,
					  const char *usdt_provider, const char *usdt_name,
					  __u64 usdt_cookie)
{
	int i, fd, err, spec_map_fd, ip_map_fd;
	LIBBPF_OPTS(bpf_uprobe_opts, opts);
	struct hashmap *specs_hash = NULL;
	struct bpf_link_usdt *link = NULL;
	struct usdt_target *targets = NULL;
	size_t target_cnt;
	Elf *elf;

	spec_map_fd = bpf_map__fd(man->specs_map);
	ip_map_fd = bpf_map__fd(man->ip_to_spec_id_map);

	/* TODO: perform path resolution similar to uprobe's */
	fd = open(path, O_RDONLY);
	if (fd < 0) {
		err = -errno;
		pr_warn("usdt: failed to open ELF binary '%s': %d\n", path, err);
		return libbpf_err_ptr(err);
	}

	/* From here on every failure goes through err_out so fd gets closed. */
	elf = elf_begin(fd, ELF_C_READ_MMAP, NULL);
	if (!elf) {
		err = -EBADF;
		pr_warn("usdt: failed to parse ELF binary '%s': %s\n", path, elf_errmsg(-1));
		goto err_out;
	}

	err = sanity_check_usdt_elf(elf, path);
	if (err)
		goto err_out;

	/* normalize PID filter */
	if (pid < 0)
		pid = -1;
	else if (pid == 0)
		pid = getpid();

	/* discover USDT in given binary, optionally limiting
	 * activations to a given PID, if pid > 0
	 */
	err = collect_usdt_targets(man, elf, path, pid, usdt_provider, usdt_name,
				   usdt_cookie, &targets, &target_cnt);
	if (err <= 0) {
		/* zero targets found is treated as "no such probe" */
		err = (err == 0) ? -ENOENT : err;
		goto err_out;
	}

	specs_hash = hashmap__new(specs_hash_fn, specs_equal_fn, NULL);
	if (IS_ERR(specs_hash)) {
		err = PTR_ERR(specs_hash);
		goto err_out;
	}

	link = calloc(1, sizeof(*link));
	if (!link) {
		err = -ENOMEM;
		goto err_out;
	}

	link->usdt_man = man;
	link->link.detach = &bpf_link_usdt_detach;
	link->link.dealloc = &bpf_link_usdt_dealloc;

	link->uprobes = calloc(target_cnt, sizeof(*link->uprobes));
	if (!link->uprobes) {
		err = -ENOMEM;
		goto err_out;
	}

	/* Attach one uprobe per discovered target. */
	for (i = 0; i < target_cnt; i++) {
		struct usdt_target *target = &targets[i];
		struct bpf_link *uprobe_link;
		bool is_new;
		int spec_id;

		/* Spec ID can be either reused or newly allocated. If it is
		 * newly allocated, we'll need to fill out spec map, otherwise
		 * entire spec should be valid and can be just used by a new
		 * uprobe. We reuse spec when USDT arg spec is identical. We
		 * also never share specs between two different USDT
		 * attachments ("links"), so all the reused specs already
		 * share USDT cookie value implicitly.
		 */
		err = allocate_spec_id(man, specs_hash, link, target, &spec_id, &is_new);
		if (err)
			goto err_out;

		if (is_new && bpf_map_update_elem(spec_map_fd, &spec_id, &target->spec, BPF_ANY)) {
			err = -errno;
			pr_warn("usdt: failed to set USDT spec #%d for '%s:%s' in '%s': %d\n",
				spec_id, usdt_provider, usdt_name, path, err);
			goto err_out;
		}
		/* Without BPF cookie support, the spec ID is recorded in a map keyed by the target's absolute IP. */
		if (!man->has_bpf_cookie &&
		    bpf_map_update_elem(ip_map_fd, &target->abs_ip, &spec_id, BPF_NOEXIST)) {
			err = -errno;
			if (err == -EEXIST) {
				pr_warn("usdt: IP collision detected for spec #%d for '%s:%s' in '%s'\n",
				        spec_id, usdt_provider, usdt_name, path);
			} else {
				pr_warn("usdt: failed to map IP 0x%lx to spec #%d for '%s:%s' in '%s': %d\n",
					target->abs_ip, spec_id, usdt_provider, usdt_name,
					path, err);
			}
			goto err_out;
		}

		/* With BPF cookie support, the spec ID travels as the uprobe's cookie. */
		opts.ref_ctr_offset = target->sema_off;
		opts.bpf_cookie = man->has_bpf_cookie ? spec_id : 0;
		uprobe_link = bpf_program__attach_uprobe_opts(prog, pid, path,
							      target->rel_ip, &opts);
		err = libbpf_get_error(uprobe_link);
		if (err) {
			pr_warn("usdt: failed to attach uprobe #%d for '%s:%s' in '%s': %d\n",
				i, usdt_provider, usdt_name, path, err);
			goto err_out;
		}

		link->uprobes[i].link = uprobe_link;
		link->uprobes[i].abs_ip = target->abs_ip;
		link->uprobe_cnt++;
	}

	/* Success: the temporary discovery state is no longer needed. */
	free(targets);
	hashmap__free(specs_hash);
	elf_end(elf);
	close(fd);

	return &link->link;

err_out:
	/* Shared failure path: release everything acquired so far. */
	if (link)
		bpf_link__destroy(&link->link);
	free(targets);
	hashmap__free(specs_hash);
	if (elf)
		elf_end(elf);
	close(fd);
	return libbpf_err_ptr(err);
}


/*
 * Attach prog as a uprobe or uretprobe on binary_path at func_offset.
 *
 * prog:        program to attach
 * pid:         process filter, forwarded to the perf event open call
 * binary_path: target binary; "<archive>!/<file>" selects a file inside an
 *              archive, and a name without '/' is resolved to a full path
 * func_offset: offset to probe; when opts->func_name is set, the symbol's
 *              offset is looked up and added to it
 * opts:        optional bpf_uprobe_opts (attach_mode, retprobe,
 *              ref_ctr_offset, bpf_cookie, func_name)
 *
 * Returns the new link on success, or libbpf_err_ptr(err) on failure.
 */
LIBBPF_API struct bpf_link *
bpf_program__attach_uprobe_opts(const struct bpf_program *prog, pid_t pid,
				const char *binary_path, size_t func_offset,
				const struct bpf_uprobe_opts *opts)
{
	const char *archive_path = NULL, *archive_sep = NULL;
	char errmsg[STRERR_BUFSIZE], *legacy_probe = NULL;
	DECLARE_LIBBPF_OPTS(bpf_perf_event_opts, pe_opts);
	enum probe_attach_mode attach_mode;
	char full_path[PATH_MAX];
	struct bpf_link *link;
	size_t ref_ctr_off;
	int pfd, err;
	bool retprobe, legacy;
	const char *func_name;

	if (!OPTS_VALID(opts, bpf_uprobe_opts))
		return libbpf_err_ptr(-EINVAL);

	attach_mode = OPTS_GET(opts, attach_mode, PROBE_ATTACH_MODE_DEFAULT);
	retprobe = OPTS_GET(opts, retprobe, false);
	ref_ctr_off = OPTS_GET(opts, ref_ctr_offset, 0);
	pe_opts.bpf_cookie = OPTS_GET(opts, bpf_cookie, 0);

	if (!binary_path)
		return libbpf_err_ptr(-EINVAL);

	/* Check if "binary_path" refers to an archive. */
	archive_sep = strstr(binary_path, "!/");
	if (archive_sep) {
		/* Split at "!/": full_path keeps the archive part, binary_path the file inside it. */
		full_path[0] = '\0';
		libbpf_strlcpy(full_path, binary_path,
			       min(sizeof(full_path), (size_t)(archive_sep - binary_path + 1)));
		archive_path = full_path;
		binary_path = archive_sep + 2;
	} else if (!strchr(binary_path, '/')) {
		err = resolve_full_path(binary_path, full_path, sizeof(full_path));
		if (err) {
			pr_warn("prog '%s': failed to resolve full path for '%s': %d\n",
				prog->name, binary_path, err);
			return libbpf_err_ptr(err);
		}
		binary_path = full_path;
	}
	func_name = OPTS_GET(opts, func_name, NULL);
	if (func_name) {
		long sym_off;

		if (archive_path) {
			sym_off = elf_find_func_offset_from_archive(archive_path, binary_path,
								    func_name);
			/* the probe itself is placed on the archive file */
			binary_path = archive_path;
		} else {
			sym_off = elf_find_func_offset_from_file(binary_path, func_name);
		}
		if (sym_off < 0)
			return libbpf_err_ptr(sym_off);
		func_offset += sym_off;
	}

	/* Fall back to the legacy interface when the uprobe perf type cannot be determined. */
	legacy = determine_uprobe_perf_type() < 0;
	switch (attach_mode) {
	case PROBE_ATTACH_MODE_LEGACY:
		legacy = true;
		pe_opts.force_ioctl_attach = true;
		break;
	case PROBE_ATTACH_MODE_PERF:
		if (legacy)
			return libbpf_err_ptr(-ENOTSUP);
		pe_opts.force_ioctl_attach = true;
		break;
	case PROBE_ATTACH_MODE_LINK:
		if (legacy || !kernel_supports(prog->obj, FEAT_PERF_LINK))
			return libbpf_err_ptr(-ENOTSUP);
		break;
	case PROBE_ATTACH_MODE_DEFAULT:
		break;
	default:
		return libbpf_err_ptr(-EINVAL);
	}

	if (!legacy) {
		pfd = perf_event_open_probe(true /* uprobe */, retprobe, binary_path,
					    func_offset, pid, ref_ctr_off);
	} else {
		char probe_name[PATH_MAX + 64];

		/* the legacy path rejects a reference counter offset */
		if (ref_ctr_off)
			return libbpf_err_ptr(-EINVAL);

		gen_uprobe_legacy_event_name(probe_name, sizeof(probe_name),
					     binary_path, func_offset);

		/* heap copy: on success the link keeps this name (see below) */
		legacy_probe = strdup(probe_name);
		if (!legacy_probe)
			return libbpf_err_ptr(-ENOMEM);

		pfd = perf_event_uprobe_open_legacy(legacy_probe, retprobe,
						    binary_path, func_offset, pid);
	}
	if (pfd < 0) {
		err = -errno;
		pr_warn("prog '%s': failed to create %s '%s:0x%zx' perf event: %s\n",
			prog->name, retprobe ? "uretprobe" : "uprobe",
			binary_path, func_offset,
			libbpf_strerror_r(err, errmsg, sizeof(errmsg)));
		goto err_out;
	}

	/* Bind the program to the perf event; the result is the returned link. */
	link = bpf_program__attach_perf_event_opts(prog, pfd, &pe_opts);
	err = libbpf_get_error(link);
	if (err) {
		close(pfd);
		pr_warn("prog '%s': failed to attach to %s '%s:0x%zx': %s\n",
			prog->name, retprobe ? "uretprobe" : "uprobe",
			binary_path, func_offset,
			libbpf_strerror_r(err, errmsg, sizeof(errmsg)));
		goto err_clean_legacy;
	}
	if (legacy) {
		struct bpf_link_perf *perf_link = container_of(link, struct bpf_link_perf, link);

		/* Ownership of legacy_probe moves to the link; it is not freed here. */
		perf_link->legacy_probe_name = legacy_probe;
		perf_link->legacy_is_kprobe = false;
		perf_link->legacy_is_retprobe = retprobe;
	}
	return link;

err_clean_legacy:
	/* The legacy event was created but attaching failed: remove it again. */
	if (legacy)
		remove_uprobe_event_legacy(legacy_probe, retprobe);
err_out:
	free(legacy_probe);
	return libbpf_err_ptr(err);
}

PROGRAM TYPE: BPF_PROG_TYPE_KPROBE

SEC Name 格式: SEC(“usdt/<path>:<provider>:<name>”)

Event Handler func 参数类型 : struct pt_regs *

Linux 5.19 才开始支持 usdt section,但是 cilium 还不支持

社区有一个单独的基于 cilium 的 USDT 实现:https://github.com/mmat11/usdt

9 attach_tp
#

attach_tp 函数定义:

// https://github.com/libbpf/libbpf/blob/f7eb43b90f4c8882edf6354f8585094f8f3aade0/src/libbpf.c#L11514
/*
 * Auto-attach handler for SEC("tp/<category>/<name>") and
 * SEC("tracepoint/<category>/<name>").
 *
 * prog:   program whose section name selects the tracepoint
 * cookie: unused here
 * link:   receives the created link, or NULL when nothing was attached
 *
 * Returns 0 on success (including the "nothing to auto-attach" case),
 * -ENOMEM if the section name cannot be duplicated, -EINVAL if the
 * "<category>/<name>" separator is missing, or the error reported by
 * libbpf_get_error().
 */
static int attach_tp(const struct bpf_program *prog, long cookie, struct bpf_link **link)
{
	char *sec_copy, *category, *slash;
	size_t pfx_len;

	*link = NULL;

	/* no auto-attach for SEC("tp") or SEC("tracepoint") */
	if (!strcmp(prog->sec_name, "tp") || !strcmp(prog->sec_name, "tracepoint"))
		return 0;

	/* Work on a private copy: the '/' between category and name is overwritten below. */
	sec_copy = strdup(prog->sec_name);
	if (!sec_copy)
		return -ENOMEM;

	/* extract "tp/<category>/<name>" or "tracepoint/<category>/<name>" */
	pfx_len = str_has_pfx(prog->sec_name, "tp/") ? sizeof("tp/") - 1
						     : sizeof("tracepoint/") - 1;
	category = sec_copy + pfx_len;

	slash = strchr(category, '/');
	if (!slash) {
		free(sec_copy);
		return -EINVAL;
	}
	*slash = '\0';

	*link = bpf_program__attach_tracepoint(prog, category, slash + 1);
	free(sec_copy);
	return libbpf_get_error(*link);
}

/*
 * Attach prog to the tracepoint tp_category/tp_name using default options.
 * Thin wrapper: forwards to bpf_program__attach_tracepoint_opts() with
 * opts == NULL and returns its result unchanged.
 */
struct bpf_link *bpf_program__attach_tracepoint(const struct bpf_program *prog,
						const char *tp_category,
						const char *tp_name)
{
	return bpf_program__attach_tracepoint_opts(prog, tp_category, tp_name, NULL);
}


// https://github.com/libbpf/libbpf/blob/f7eb43b90f4c8882edf6354f8585094f8f3aade0/src/libbpf.c#L11473
/*
 * Attach prog to the tracepoint tp_category/tp_name.
 *
 * prog:        program to attach
 * tp_category: tracepoint category
 * tp_name:     tracepoint name
 * opts:        optional bpf_tracepoint_opts (may be NULL); supplies bpf_cookie
 *
 * Returns the new link, or libbpf_err_ptr() carrying -EINVAL for invalid
 * opts, the negative value returned by perf_event_open_tracepoint(), or the
 * attach error.
 */
struct bpf_link *bpf_program__attach_tracepoint_opts(const struct bpf_program *prog,
						     const char *tp_category,
						     const char *tp_name,
						     const struct bpf_tracepoint_opts *opts)
{
	DECLARE_LIBBPF_OPTS(bpf_perf_event_opts, pe_opts);
	char errmsg[STRERR_BUFSIZE];
	struct bpf_link *tp_link;
	int perf_fd, err;

	if (!OPTS_VALID(opts, bpf_tracepoint_opts))
		return libbpf_err_ptr(-EINVAL);

	pe_opts.bpf_cookie = OPTS_GET(opts, bpf_cookie, 0);

	/* Key step: open the tracepoint as a perf event and get back its fd. */
	perf_fd = perf_event_open_tracepoint(tp_category, tp_name);
	if (perf_fd < 0) {
		pr_warn("prog '%s': failed to create tracepoint '%s/%s' perf event: %s\n",
			prog->name, tp_category, tp_name,
			libbpf_strerror_r(perf_fd, errmsg, sizeof(errmsg)));
		return libbpf_err_ptr(perf_fd);
	}

	/*
	 * Article author's notes on bpf_program__attach_perf_event_opts()
	 * (https://github.com/libbpf/libbpf/blob/f7eb43b90f4c8882edf6354f8585094f8f3aade0/src/libbpf.c#L9985):
	 *  - if the kernel supports PERF_LINK, a bpf_link is created with the
	 *    BPF_LINK_CREATE command, carrying link_create.perf_event.bpf_cookie, see
	 *    https://github.com/libbpf/libbpf/blob/f7eb43b90f4c8882edf6354f8585094f8f3aade0/src/bpf.c#L722
	 *  - otherwise ioctl(pfd, PERF_EVENT_IOC_SET_BPF, prog_fd) and
	 *    ioctl(pfd, PERF_EVENT_IOC_ENABLE, 0) are used.
	 */
	tp_link = bpf_program__attach_perf_event_opts(prog, perf_fd, &pe_opts);
	err = libbpf_get_error(tp_link);
	if (!err)
		return tp_link;

	/* Attach failed: the perf event fd is still ours to close. */
	close(perf_fd);
	pr_warn("prog '%s': failed to attach to tracepoint '%s/%s': %s\n",
		prog->name, tp_category, tp_name,
		libbpf_strerror_r(err, errmsg, sizeof(errmsg)));
	return libbpf_err_ptr(err);
}

attach_tp 过程如下:

  1. 打开一个 perf tracepoint (而非以前的 perf_event_open_probe)

    pfd = perf_event_open_tracepoint(tp_category, tp_name);
    
  2. link = bpf_program__attach_perf_event_opts(prog, pfd, &pe_opts);

    • 如果 kernel 支持 PERF_LINK 则使用 bpf BPF_LINK_CREATE CMD 来创建一个 bpf_link (attr 中 link_create.perf_event.bpf_cookie 不为空)
      • 否则使用 ioctl(pfd, PERF_EVENT_IOC_SET_BPF, prog_fd) 和 ioctl(pfd, PERF_EVENT_IOC_ENABLE, 0) 来设置和开启前面 open 的 perf event tracepoint。

tracepoint 是内核预定义的追踪点,内核定义的所有 tracepoint 可以查看目录 /sys/kernel/debug/tracing/events, 或者使用 perf list tracepoint 命令查看:

#ls /sys/kernel/debug/tracing/events/
alarmtimer        dma_fence   ftrace        irq          mei      page_isolation  regmap   smbus     tlb        xhci-hcd
block             drm         header_event  irq_matrix   migrate  pagemap         resctrl  sock      ucsi
bridge            enable      header_page   irq_vectors  module   percpu          rpcrdma  sunrpc    udp
cgroup            exceptions  huge_memory   jbd2         mpx      power           rpm      swiotlb   vmscan
clk               ext4        hyperv        kmem         msr      printk          rseq     syscalls  vsyscall
cma               fib         i2c           kvm          napi     qdisc           rtc      target    workqueue
compaction        fib6        initcall      kvmmmu       net      random          sched    task      writeback
context_tracking  filelock    iocost        libata       nmi      ras             scsi     tcp       x86_fpu
cpuhp             filemap     iommu         mce          nvme     raw_syscalls    signal   thermal   xdp
devlink           fs_dax      io_uring      mdio         oom      rcu             skb      timer     xen

# perf list tracepoint |& head
  alarmtimer:alarmtimer_cancel                       [Tracepoint event]
  alarmtimer:alarmtimer_fired                        [Tracepoint event]
  alarmtimer:alarmtimer_start                        [Tracepoint event]
  alarmtimer:alarmtimer_suspend                      [Tracepoint event]
  block:block_bio_backmerge                          [Tracepoint event]
  block:block_bio_bounce                             [Tracepoint event]
  block:block_bio_complete                           [Tracepoint event]
  block:block_bio_frontmerge                         [Tracepoint event]
  block:block_bio_queue                              [Tracepoint event]
  block:block_bio_remap                              [Tracepoint event]

available_events 文件中各行内容的格式为 <category>:<name>:

root@lima-ebpf-dev:/Users/zhangjun/codes# head /sys/kernel/debug/tracing/available_events
tls:tls_device_offload_set
tls:tls_device_decrypted
tls:tls_device_rx_resync_send
tls:tls_device_rx_resync_nh_schedule
tls:tls_device_rx_resync_nh_delay
tls:tls_device_tx_resync_req
tls:tls_device_tx_resync_send
vsock:virtio_transport_alloc_pkt
vsock:virtio_transport_recv_pkt
btrfs:btrfs_transaction_commit

PROGRAM TYPE: BPF_PROG_TYPE_TRACEPOINT

SEC Name 格式: “tp/<category>/<name>” or “tracepoint/<category>/<name>”

  • <category> 和 <name> 的值均取值前面 available_events 文件中列出的内容。
SEC("tracepoint/<category>/<name>")
// 比如:
SEC("tracepoint/syscalls/sys_enter_openat")

//或:
SEC("tp/<category>/<name>")
// 比如:
 SEC("tp/syscalls/sys_enter_openat")

Event Handler func 参数类型 : 当内核执行到被追踪的 tracepoint 位置时,会产生一个 tracepoint event,这时可以执行一段用户自定义的 eBPF tracepoint event handler func,其参数惯例为一个名为 ctx 的指针,指向该 event 特定的 struct 类型。该类型的成员结构,可以通过查看 event 的 /sys/kernel/debug/tracing/events/<category>/<name>/format 文件获得:

  • 空行前的部分:所有 tracepoint event 共有的 common_* 字段,内容固定,共占 8 Bytes;
  • 空行后的部分:为对应 tracepoint event 的上下文信息(对于 syscalls 类 tracepoint,每个系统调用参数固定占 8 Bytes;其他 event 以各字段的 offset/size 为准)。
root@lima-ebpf-dev:/Users/zhangjun/codes# cat /sys/kernel/debug/tracing/events/syscalls/sys_enter_execve/format
name: sys_enter_execve
ID: 716
format:
        field:unsigned short common_type;       offset:0;       size:2; signed:0;
        field:unsigned char common_flags;       offset:2;       size:1; signed:0;
        field:unsigned char common_preempt_count;       offset:3;       size:1; signed:0;
        field:int common_pid;   offset:4;       size:4; signed:1; // 到这里为固定的部分,8 Bytes

        field:int __syscall_nr; offset:8;       size:4; signed:1; // 从这开始与 event name 相关。
        field:const char * filename;    offset:16;      size:8; signed:0;
        field:const char *const * argv; offset:24;      size:8; signed:0;
        field:const char *const * envp; offset:32;      size:8; signed:0;

print fmt: "filename: 0x%08lx, argv: 0x%08lx, envp: 0x%08lx", ((unsigned long)(REC->filename)), ((unsigned long)(REC->argv)), ((unsigned long)(REC->envp))

root@lima-ebpf-dev:/Users/zhangjun/codes# cat /sys/kernel/debug/tracing/events/syscalls/sys_exit_execve/format
name: sys_exit_execve
ID: 715
format:
        field:unsigned short common_type;       offset:0;       size:2; signed:0;
        field:unsigned char common_flags;       offset:2;       size:1; signed:0;
        field:unsigned char common_preempt_count;       offset:3;       size:1; signed:0;
        field:int common_pid;   offset:4;       size:4; signed:1;

        field:int __syscall_nr; offset:8;       size:4; signed:1;
        field:long ret; offset:16;      size:8; signed:1;

print fmt: "0x%lx", REC->ret
root@lima-ebpf-dev:/Users/zhangjun/codes#

Event Handler func 的特定参数类型可以在 vmlinux.h 查找,一般 sys_enter_xx 对应 trace_event_raw_sys_enter ,sys_exit_xx 对应 trace_event_raw_sys_exit ,其他的一般对应 trace_event_raw_<name>

  • 该 struct 是由 TRACE_EVENT_FN 调用的 DECLARE_EVENT_CLASS 来创建的。

示例:TRACE_EVENT(sched_switch,

// include/trace/events/sched.h

/*
 * Tracepoint for task switches, performed by the scheduler:
 */
TRACE_EVENT(sched_switch,

	TP_PROTO(bool preempt,               // these parameters form the args array of a raw tracepoint's ctx
		 struct task_struct *prev,
		 struct task_struct *next),

	TP_ARGS(preempt, prev, next),

	TP_STRUCT__entry(
		__array(	char,	prev_comm,	TASK_COMM_LEN	) // these entries become the members of the tracepoint ctx struct
		__field(	pid_t,	prev_pid			)
		__field(	int,	prev_prio			)
		__field(	long,	prev_state			)
		__array(	char,	next_comm,	TASK_COMM_LEN	)
		__field(	pid_t,	next_pid			)
		__field(	int,	next_prio			)
	),

	TP_fast_assign(
		memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
		__entry->prev_pid	= prev->pid;
		__entry->prev_prio	= prev->prio;
		__entry->prev_state	= __trace_sched_switch_state(preempt, prev);
		memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
		__entry->next_pid	= next->pid;
		__entry->next_prio	= next->prio;
		/* XXX SCHED_DEADLINE */
	),

	TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d",
		__entry->prev_comm, __entry->prev_pid, __entry->prev_prio,

		(__entry->prev_state & (TASK_REPORT_MAX - 1)) ?
		  __print_flags(__entry->prev_state & (TASK_REPORT_MAX - 1), "|",
				{ TASK_INTERRUPTIBLE, "S" },
				{ TASK_UNINTERRUPTIBLE, "D" },
				{ __TASK_STOPPED, "T" },
				{ __TASK_TRACED, "t" },
				{ EXIT_DEAD, "X" },
				{ EXIT_ZOMBIE, "Z" },
				{ TASK_PARKED, "P" },
				{ TASK_DEAD, "I" }) :
		  "R",

		__entry->prev_state & TASK_REPORT_MAX ? "+" : "",
		__entry->next_comm, __entry->next_pid, __entry->next_prio)
);

在 vmlinux.h 中生成的对应的 struct trace_event_raw_sched_switch 结构定义:

// vmlinux-4.19.91-007.h
/* ctx type of the sched_switch tracepoint as it appears in vmlinux.h. */
struct trace_event_raw_sched_switch {
	struct trace_entry ent;

	char prev_comm[16]; // from here on: the fields listed in TP_STRUCT__entry of the TRACE_EVENT
	pid_t prev_pid;
	int prev_prio;
	long int prev_state;
	char next_comm[16];
	pid_t next_pid;
	int next_prio;

	char __data[0];
};

sched_switch 的 format 文件:

root@lima-ebpf-dev:~# cat /sys/kernel/debug/tracing/events/sched/sched_switch/format
name: sched_switch
ID: 318
format:
        field:unsigned short common_type;       offset:0;       size:2; signed:0;
        field:unsigned char common_flags;       offset:2;       size:1; signed:0;
        field:unsigned char common_preempt_count;       offset:3;       size:1; signed:0;
        field:int common_pid;   offset:4;       size:4; signed:1; // 上面这些是打印 trace event 的 header 信息

        field:char prev_comm[16];       offset:8;       size:16;        signed:1; // 来源于 TRACE_EVENT 中的 TP_STRUCT__entry
        field:pid_t prev_pid;   offset:24;      size:4; signed:1;
        field:int prev_prio;    offset:28;      size:4; signed:1;
        field:long prev_state;  offset:32;      size:8; signed:1;
        field:char next_comm[16];       offset:40;      size:16;        signed:1;
        field:pid_t next_pid;   offset:56;      size:4; signed:1;
        field:int next_prio;    offset:60;      size:4; signed:1;

print fmt: "prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d", REC->prev_comm, REC->prev_pid, REC->prev_prio, (REC->prev_state & ((((0x0000 | 0x0001 | 0x0002 | 0x0004 | 0x0008 | 0x0010 | 0x0020 | 0x0040) + 1) << 1) - 1)) ? __print_flags(REC->prev_state & ((((0x0000 | 0x0001 | 0x0002 | 0x0004 | 0x0008 | 0x0010 | 0x0020 | 0x0040) + 1) << 1) - 1), "|", { 0x0001, "S" }, { 0x0002, "D" }, { 0x0004, "T" }, { 0x0008, "t" }, { 0x0010, "X" }, { 0x0020, "Z" }, { 0x0040, "P" }, { 0x0080, "I" }) : "R", REC->prev_state & (((0x0000 | 0x0001 | 0x0002 | 0x0004 | 0x0008 | 0x0010 | 0x0020 | 0x0040) + 1) << 1) ? "+" : "", REC->next_comm, REC->next_pid, REC->next_prio

如果没有 vmlinux.h, 也可以根据 format 来自定义 ctx struct:

  • 举例一:sched/sched_switch
root@lima-ebpf-dev:/Users/zhangjun/codes# cat /sys/kernel/debug/tracing/events/sched/sched_switch/format
name: sched_switch
ID: 318
format:
        field:unsigned short common_type;       offset:0;       size:2; signed:0;
        field:unsigned char common_flags;       offset:2;       size:1; signed:0;
        field:unsigned char common_preempt_count;       offset:3;       size:1; signed:0;
        field:int common_pid;   offset:4;       size:4; signed:1;

        field:char prev_comm[16];       offset:8;       size:16;        signed:1;
        field:pid_t prev_pid;   offset:24;      size:4; signed:1;
        field:int prev_prio;    offset:28;      size:4; signed:1;
        field:long prev_state;  offset:32;      size:8; signed:1;
        field:char next_comm[16];       offset:40;      size:16;        signed:1;
        field:pid_t next_pid;   offset:56;      size:4; signed:1;
        field:int next_prio;    offset:60;      size:4; signed:1;

print fmt: "prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d", REC->prev_comm, REC->prev_pid, REC->prev_prio, (REC->prev_state & ((((0x0000 | 0x0001 | 0x0002 | 0x0004 | 0x0008 | 0x0010 | 0x0020 | 0x0040) + 1) << 1) - 1)) ? __print_flags(REC->prev_state & ((((0x0000 | 0x0001 | 0x0002 | 0x0004 | 0x0008 | 0x0010 | 0x0020 | 0x0040) + 1) << 1) - 1), "|", { 0x0001, "S" }, { 0x0002, "D" }, { 0x0004, "T" }, { 0x0008, "t" }, { 0x0010, "X" }, { 0x0020, "Z" }, { 0x0040, "P" }, { 0x0080, "I" }) : "R", REC->prev_state & (((0x0000 | 0x0001 | 0x0002 | 0x0004 | 0x0008 | 0x0010 | 0x0020 | 0x0040) + 1) << 1) ? "+" : "", REC->next_comm, REC->next_pid, REC->next_prio
root@lima-ebpf-dev:/Users/zhangjun/codes#


/* taken from /sys/kernel/debug/tracing/events/sched/sched_switch/format */
/* Hand-written ctx layout for the sched_switch tracepoint, for use when vmlinux.h is not available. */
struct sched_switch_args {
	unsigned long long pad; // 8 bytes (64 bits): covers the fixed common_* fields before the blank line in the format file

	char prev_comm[16]; // fields after the blank line in the format file; note: besides the type, also check each field's offset
	int prev_pid;
	int prev_prio;
	long long prev_state;
	char next_comm[16];
	int next_pid;
	int next_prio;
};

/* Handler for the sched/sched_switch tracepoint; ctx uses the hand-written
 * struct sched_switch_args layout instead of a vmlinux.h type.
 * The rest of the body is elided ("...") in this excerpt.
 */
SEC("tracepoint/sched/sched_switch")
int oncpu(struct sched_switch_args *ctx)
{
	/* record previous thread sleep time */
	u32 pid = ctx->prev_pid;
...
}
  • 举例二:syscalls/sys_enter_open,即 open 系统调用;
// 对于 tracepoint syscall 也是一样的自定义 ctx 类型:
root@lima-ebpf-dev:/Users/zhangjun/codes# head -40 /sys/kernel/debug/tracing/events/syscalls/sys_enter_open/format
name: sys_enter_open
ID: 640
format:
        field:unsigned short common_type;       offset:0;       size:2; signed:0;
        field:unsigned char common_flags;       offset:2;       size:1; signed:0;
        field:unsigned char common_preempt_count;       offset:3;       size:1; signed:0;
        field:int common_pid;   offset:4;       size:4; signed:1;


        // 系统调用号是 int 类型,占 4 bytes,但是下一个 field 的 offset 是 16,所以实际会有 4 bytes 的 padding。
        // 正因为有 padding,自定义 ctx 结构体时可以把 syscall_nr 当作 8 bytes 的 long 来定义。
        field:int __syscall_nr; offset:8;       size:4; signed:1;
        // 以下开始为系统调用参数,值可以是指针或数字,最多有 6 个,每个值固定为 8 Bytes
        field:const char * filename;    offset:16;      size:8; signed:0;
        field:int flags;        offset:24;      size:8; signed:0;  // 虽然是 int 类型,但 size 固定为 8 Bytes
        field:umode_t mode;     offset:32;      size:8; signed:0;

print fmt: "filename: 0x%08lx, flags: 0x%08lx, mode: 0x%08lx", ((unsigned long)(REC->filename)), ((unsigned long)(REC->flags)), ((unsigned long)(REC->mode))

// Matching hand-written ctx type; see file:///Users/zhangjun/codes/kernel/linux-v4.19.91/samples/bpf/syscall_tp_kern.c
// The unused/syscall_nr fields are the same for every syscall; the syscall-specific arguments follow them.
// Whatever its real type, each syscall argument occupies 8 bytes here, so they can all be declared as long and cast to a pointer later when needed.
struct syscalls_enter_open_args {
	unsigned long long unused; // the 8 bytes that precede __syscall_nr in the format file

	long syscall_nr;  // declared as a 4-byte int in the format file, but the next field sits at offset 16, so it effectively occupies 8 bytes, i.e. a long
	long filename_ptr;
	long flags;
	long mode;
};

// ctx type used by the sys_exit_open handler. The matching format file is not
// shown in this excerpt; presumably it has the same 8 leading bytes and
// syscall_nr slot as the enter layout, followed by the return value — TODO confirm.
struct syscalls_exit_open_args {
	unsigned long long unused;

	long syscall_nr;
	long ret;
};

/* sys_enter_open tracepoint handler: passes enter_open_map to count() and
 * always returns 0. ctx is not inspected.
 */
SEC("tracepoint/syscalls/sys_enter_open")
int trace_enter_open(struct syscalls_enter_open_args *ctx)
{
	void *enter_map = (void *)&enter_open_map;

	count(enter_map);
	return 0;
}

/* sys_exit_open tracepoint handler: passes exit_open_map to count() and
 * always returns 0. ctx is not inspected.
 */
SEC("tracepoint/syscalls/sys_exit_open")
int trace_enter_exit(struct syscalls_exit_open_args *ctx)
{
	void *exit_map = (void *)&exit_open_map;

	count(exit_map);
	return 0;
}

cilium 使用 Tracepoint 来进行 Attach。

可以使用 bpftrace -l 'tracepoint:*execve*' 来列出所有包含 execve 关键字的 tracepoint,使用 bpftrace -lv 'tracepoint:XX' 来显示 XX 的参数列表:

root@lima-ebpf-dev:~# bpftrace -l 'tracepoint:*execve'
tracepoint:syscalls:sys_enter_execve
tracepoint:syscalls:sys_exit_execve

root@lima-ebpf-dev:~# bpftrace -lv 'tracepoint:syscalls:sys_enter_openat'
tracepoint:syscalls:sys_enter_openat
    int __syscall_nr
    int dfd
    const char * filename
    int flags
    umode_t mode
root@lima-ebpf-dev:~#

参考:

  1. https://mozillazg.com/2022/05/ebpf-libbpf-tracepoint-common-questions.html

10 attach_raw_tp
#

attach_raw_tp 函数定义:

  • 与使用 perf 接口的 tracepoint 不同,raw tp 内置在 bpf syscall 中,通过它的 CMD BPF_RAW_TRACEPOINT_OPEN 来创建一个 Program fd:

    fd = sys_bpf_fd(BPF_RAW_TRACEPOINT_OPEN, &attr, attr_sz);
    
// https://github.com/libbpf/libbpf/blob/f7eb43b90f4c8882edf6354f8585094f8f3aade0/src/libbpf.c#L11576
/*
 * Auto-attach callback for raw tracepoint programs.
 *
 * Matches prog->sec_name against the known raw tracepoint section prefixes
 * and attaches the program to the tracepoint named after the '/'.
 *
 * Returns 0 with *link left NULL for a bare prefix such as SEC("raw_tp")
 * (nothing to auto-attach), -EINVAL when no prefix yields a tracepoint
 * name, and otherwise libbpf_get_error() of the created link.
 * The cookie argument is not used here.
 */
static int attach_raw_tp(const struct bpf_program *prog, long cookie, struct bpf_link **link)
{
	static const char *const prefixes[] = {
		"raw_tp",
		"raw_tracepoint",
		"raw_tp.w",
		"raw_tracepoint.w",
	};
	const char *event = NULL;
	size_t idx;

	*link = NULL;

	/* stop at the first prefix that is followed by "/<tracepoint name>" */
	for (idx = 0; !event && idx < ARRAY_SIZE(prefixes); idx++) {
		const char *tail;

		if (!str_has_pfx(prog->sec_name, prefixes[idx]))
			continue;

		/* first character after the matched prefix */
		tail = prog->sec_name + strlen(prefixes[idx]);

		/* no auto-attach case of, e.g., SEC("raw_tp") */
		if (*tail == '\0')
			return 0;

		/* any other separator: this prefix does not apply, try the next */
		if (*tail == '/')
			event = tail + 1;
	}

	if (event == NULL) {
		pr_warn("prog '%s': invalid section name '%s'\n",
			prog->name, prog->sec_name);
		return -EINVAL;
	}

	*link = bpf_program__attach_raw_tracepoint(prog, event);
	return libbpf_get_error(*link);
}


/*
 * Attach an already-loaded program to the raw tracepoint tp_name.
 *
 * Returns a heap-allocated bpf_link whose fd is the descriptor returned by
 * bpf_raw_tracepoint_open(). On failure returns libbpf_err_ptr() of
 * -EINVAL (program not loaded), -ENOMEM (allocation failed) or -errno from
 * the open call.
 */
struct bpf_link *bpf_program__attach_raw_tracepoint(const struct bpf_program *prog,
						    const char *tp_name)
{
	char errbuf[STRERR_BUFSIZE];
	struct bpf_link *new_link;
	int prog_fd, tp_fd, err;

	prog_fd = bpf_program__fd(prog);
	if (prog_fd < 0) {
		pr_warn("prog '%s': can't attach before loaded\n", prog->name);
		return libbpf_err_ptr(-EINVAL);
	}

	new_link = calloc(1, sizeof(*new_link));
	if (new_link == NULL)
		return libbpf_err_ptr(-ENOMEM);
	new_link->detach = &bpf_link__detach_fd;

	/* Key step: the attachment is created through the bpf() syscall
	 * (BPF_RAW_TRACEPOINT_OPEN), not through a perf event fd.
	 */
	tp_fd = bpf_raw_tracepoint_open(tp_name, prog_fd);
	if (tp_fd >= 0) {
		/* the link owns the returned fd from here on */
		new_link->fd = tp_fd;
		return new_link;
	}

	/* capture errno before free()/logging can disturb it */
	err = -errno;
	free(new_link);
	pr_warn("prog '%s': failed to attach to raw tracepoint '%s': %s\n",
		prog->name, tp_name, libbpf_strerror_r(err, errbuf, sizeof(errbuf)));
	return libbpf_err_ptr(err);
}

/*
 * Thin wrapper around the BPF_RAW_TRACEPOINT_OPEN command of the bpf()
 * syscall: fills in the raw_tracepoint part of bpf_attr with the tracepoint
 * name and program fd, and returns the result passed through
 * libbpf_err_errno().
 */
int bpf_raw_tracepoint_open(const char *name, int prog_fd)
{
	/* only the bytes up to the end of raw_tracepoint are zeroed and passed on */
	const size_t attr_sz = offsetofend(union bpf_attr, raw_tracepoint);
	union bpf_attr attr;

	memset(&attr, 0, attr_sz);
	attr.raw_tracepoint.prog_fd = prog_fd;
	attr.raw_tracepoint.name = ptr_to_u64(name);

	return libbpf_err_errno(sys_bpf_fd(BPF_RAW_TRACEPOINT_OPEN, &attr, attr_sz));
}

/* Issue a bpf() command that returns a file descriptor (here used to bind a
 * raw tracepoint according to attr) and pass the result through
 * ensure_good_fd().
 */
static inline int sys_bpf_fd(enum bpf_cmd cmd, union bpf_attr *attr,
			     unsigned int size)
{
	return ensure_good_fd(sys_bpf(cmd, attr, size));
}

SEC Name 格式: SEC(“raw_tp/<name>”) 或 SEC(“raw_tracepoint/<name>”) 或 SEC(“raw_tp.w/<name>”) 或 SEC(“raw_tracepoint.w/<name>”)

SEC("raw_tracepoint/<name>")
// 比如:
SEC("raw_tracepoint/sched_switch")

// 或:
SEC("raw_tp/<name>")
// 比如:
SEC("raw_tp/sched_switch")

raw_tracepoint 和 tracepoint 类似,能跟踪的 event 也是一致的,但 SEC 中只需要指定 event 的 <name>(不需要像 tracepoint 那样写成 <category>/<name>)。

PROGRAM TYPE: BPF_PROG_TYPE_RAW_TRACEPOINT, BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE

Event Handler func 参数类型 : raw tracepoint event handler 的 ctx 类型统一为 struct bpf_raw_tracepoint_args, 但是内部 args 指向的地址内容与 <name> 有关系:

// Context type shared by all raw tracepoint handlers: a bare array of 64-bit slots.
// The handlers in this document read the event's TP_PROTO arguments as args[0], args[1], ...
struct bpf_raw_tracepoint_args {
    __u64 args[0];
};

// file:///Users/zhangjun/codes/kernel/linux-v4.19.91/samples/bpf/test_overhead_raw_tp_kern.c

/* Empty raw tracepoint program for the task_rename event: ignores ctx and returns 0. */
SEC("raw_tracepoint/task_rename")
int prog(struct bpf_raw_tracepoint_args *ctx)
{
	return 0;
}

/* Empty raw tracepoint program for the urandom_read event: ignores ctx and returns 0. */
SEC("raw_tracepoint/urandom_read")
int prog2(struct bpf_raw_tracepoint_args *ctx)
{
	return 0;
}
/* License string placed in the object's "license" section. */
char _license[] SEC("license") = "GPL";

为了确定 struct bpf_raw_tracepoint_args 中的 args 指向内容,步骤如下:

  1. 找到 <name> 的 TRACE_EVENT_FN event 定义,这里以 sys_enter 为例(内容取自 include/trace/events/syscalls.h,大部分事件主要集中在 include/trace/events/ 目录下) 。
/*
 * Kernel definition of the sys_enter trace event.
 * TP_PROTO lists the raw arguments (regs, id); TP_STRUCT__entry lists the
 * fields of the recorded entry (id plus six syscall argument slots), which
 * TP_fast_assign fills in from the raw arguments.
 */
TRACE_EVENT_FN(sys_enter,
    TP_PROTO(struct pt_regs *regs, long id),
    TP_ARGS(regs, id),
    TP_STRUCT__entry(
        __field(    long,           id              )
        __array(    unsigned long,  args,   6       )
    ),
    TP_fast_assign(
        __entry->id = id;
        syscall_get_arguments(current, regs, __entry->args);
    ),
    TP_printk("NR %ld (%lx, %lx, %lx, %lx, %lx, %lx)",
          __entry->id,
          __entry->args[0], __entry->args[1], __entry->args[2],
          __entry->args[3], __entry->args[4], __entry->args[5]),
    syscall_regfunc, syscall_unregfunc
);

其中 TP_PROTO(struct pt_regs *regs, long id) 定义了可以通过 bpf_raw_tracepoint_args 的 args 拿到的信息:

  1. struct pt_regs *regs = (struct pt_regs *) ctx->args[0];
  2. unsigned long syscall_id = ctx->args[1];

拿到了 struct pt_regs *regs 之后,再继续获取对应的系统调用参数,参数列表可以查看对应的 format 文件来确定:

  • 空行后的第二个部分,__syscall_nr 后的内容即为系统调用函数参数列表。
root@lima-ebpf-dev:~# cat /sys/kernel/debug/tracing/events/syscalls/sys_enter_fchmodat/format
name: sys_enter_fchmodat
ID: 652
format:
        field:unsigned short common_type;       offset:0;       size:2; signed:0;
        field:unsigned char common_flags;       offset:2;       size:1; signed:0;
        field:unsigned char common_preempt_count;       offset:3;       size:1; signed:0;
        field:int common_pid;   offset:4;       size:4; signed:1;

        field:int __syscall_nr; offset:8;       size:4; signed:1;
        field:int dfd;  offset:16;      size:8; signed:0;   // 内核函数第一个参数:PT_REGS_PARM1_CORE()
        field:const char * filename;    offset:24;      size:8; signed:0;
        field:umode_t mode;     offset:32;      size:8; signed:0;

print fmt: "dfd: 0x%08lx, filename: 0x%08lx, mode: 0x%08lx", ((unsigned long)(REC->dfd)), ((unsigned long)(REC->filename)), ((unsigned long)(REC->mode))

因为 regs 是 struct pt_regs 类型,所以我们可以通过 libbpf 提供的 PT_REGS_PARM1_CORE(regs) 获取第一个参数的值,PT_REGS_PARM2_CORE(regs) 获取第二个参数的值, PT_REGS_PARM3_CORE(regs) 获取第三个参数的值,以此类推, 可以通过PT_REGS_PARM4_CORE 和 PT_REGS_PARM5_CORE 分别获取 regs 中第四个和第五个参数的值。

/*
 * Raw tracepoint handler for sys_enter that logs fchmodat calls.
 *
 * The layout of ctx->args follows TP_PROTO(struct pt_regs *regs, long id)
 * of TRACE_EVENT_FN(sys_enter, ...):
 *   ctx->args[0] - struct pt_regs * holding the syscall arguments
 *   ctx->args[1] - syscall id
 */
SEC("raw_tracepoint/sys_enter")
int raw_tracepoint__sys_enter(struct bpf_raw_tracepoint_args *ctx)
{
    char pathname[256];
    struct pt_regs *regs;
    char *pathname_ptr;
    unsigned long syscall_id;
    u32 mode;

    /* filter on the syscall id: only fchmodat (268) is handled */
    syscall_id = ctx->args[1];
    if (syscall_id != 268)
        return 0;

    regs = (struct pt_regs *) ctx->args[0];

    /* PT_REGS_PARM[1-5] count from 1: the 2nd syscall argument is the
     * user-space path pointer, the 3rd is the mode.
     */
    pathname_ptr = (char *) PT_REGS_PARM2_CORE(regs);
    bpf_core_read_user_str(&pathname, sizeof(pathname), pathname_ptr);
    mode = (u32) PT_REGS_PARM3_CORE(regs);

    char fmt[] = "fchmodat %s %d\n";
    bpf_trace_printk(fmt, sizeof(fmt), &pathname, mode);
    return 0;
}

完整的示例程序详见:

参考:

  1. https://mozillazg.com/2022/05/ebpf-libbpf-raw-tracepoint-common-questions.html

cilium 使用 AttachRawTracepoint 来 Attach。

11 tp 和 raw_tp 差别
#

raw tracepoint event handler 的参数是只有一个变长数组成员 args[0] 的通用 struct bpf_raw_tracepoint_args *

// Generic raw tracepoint context: its only member is the zero-length array args of 64-bit slots.
struct bpf_raw_tracepoint_args {
    __u64 args[0];
};

而 tracepoint event handler 的参数都是在 vmlinux.h 中定义的特定 struct trace_event_raw_<name> *

// vmlinux-4.19.91-007.h
// Typed ctx of the sched_switch tracepoint handler. After the leading
// struct trace_entry, the members match the fields listed in the event's
// format file (prev_comm ... next_prio).
struct trace_event_raw_sched_switch {
	struct trace_entry ent;
	char prev_comm[16];
	pid_t prev_pid;
	int prev_prio;
	long int prev_state;
	char next_comm[16];
	pid_t next_pid;
	int next_prio;
	char __data[0];
};

内核在调用 tracepoint event handler 程序前需要填充 trace_event_raw_<name> 中的各成员字段,但是对于 bpf_raw_tracepoint_args 则不需要填充,后续访问的都是事件的原始参数:

  • tracepoint 的 ctx 是 vmlinux.h 预定义的结构体类型,包含各具体的参数字段值,具体来源于 TRACE_EVENT 定义中的TP_STRUCT__entry 部分。
  • raw tracepoint 的 ctx 类型 struct bpf_raw_tracepoint_args 就一个未填充的 args 字段,具体来源于 TRACE_EVENT定义中的 TP_PROTO 部分。

因此 raw tracepoint 相比 tracepoint 性能通常会更好一点(数据来自 https://lwn.net/Articles/750569/ )

samples/bpf/test_overhead performance on 1 cpu:

tracepoint    base  kprobe+bpf tracepoint+bpf raw_tracepoint+bpf
task_rename   1.1M   769K        947K            1.0M
urandom_read  789K   697K        750K            755K

以 net_dev_start_xmit 的 TRACE_EVENT定义为例:

// include/trace/events/net.h
TRACE_EVENT(net_dev_start_xmit,
	TP_PROTO(const struct sk_buff *skb, const struct net_device *dev), // what a raw tracepoint sees in ctx->args
	TP_ARGS(skb, dev),
	TP_STRUCT__entry( // the fields of the tracepoint's typed ctx struct
		__string(	name,			dev->name	)
		__field(	u16,			queue_mapping	)
		__field(	const void *,		skbaddr		)
		__field(	bool,			vlan_tagged	)
		__field(	u16,			vlan_proto	)
		__field(	u16,			vlan_tci	)
		__field(	u16,			protocol	)
		__field(	u8,			ip_summed	)
		__field(	unsigned int,		len		)
		__field(	unsigned int,		data_len	)
		__field(	int,			network_offset	)
		__field(	bool,			transport_offset_valid)
		__field(	int,			transport_offset)
		__field(	u8,			tx_flags	)
		__field(	u16,			gso_size	)
		__field(	u16,			gso_segs	)
		__field(	u16,			gso_type	)
	),

	TP_fast_assign( // how the TP_STRUCT__entry fields are filled from the TP_PROTO arguments skb/dev
		__assign_str(name, dev->name);
		__entry->queue_mapping = skb->queue_mapping;
		__entry->skbaddr = skb;
		__entry->vlan_tagged = skb_vlan_tag_present(skb);
		__entry->vlan_proto = ntohs(skb->vlan_proto);
		__entry->vlan_tci = skb_vlan_tag_get(skb);
		__entry->protocol = ntohs(skb->protocol);
		__entry->ip_summed = skb->ip_summed;
		__entry->len = skb->len;
		__entry->data_len = skb->data_len;
		__entry->network_offset = skb_network_offset(skb);
		__entry->transport_offset_valid =
			skb_transport_header_was_set(skb);
		__entry->transport_offset = skb_transport_offset(skb);
		__entry->tx_flags = skb_shinfo(skb)->tx_flags;
		__entry->gso_size = skb_shinfo(skb)->gso_size;
		__entry->gso_segs = skb_shinfo(skb)->gso_segs;
		__entry->gso_type = skb_shinfo(skb)->gso_type;
	),

	TP_printk("dev=%s queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d len=%u data_len=%u network_offset=%d transport_offset_valid=%d transport_offset=%d tx_flags=%d gso_size=%d gso_segs=%d gso_type=%#x",
		  __get_str(name), __entry->queue_mapping, __entry->skbaddr,
		  __entry->vlan_tagged, __entry->vlan_proto, __entry->vlan_tci,
		  __entry->protocol, __entry->ip_summed, __entry->len,
		  __entry->data_len,
		  __entry->network_offset, __entry->transport_offset_valid,
		  __entry->transport_offset, __entry->tx_flags,
		  __entry->gso_size, __entry->gso_segs, __entry->gso_type)
)

对应的 format 文件中的第二部分来源于 TRACE_EVENT 的 TP_STRUCT__entry:

root@lima-ebpf-dev:~# cat /sys/kernel/debug/tracing/events/net/net_dev_start_xmit/format
name: net_dev_start_xmit
ID: 1493
format:
        field:unsigned short common_type;       offset:0;       size:2; signed:0;
        field:unsigned char common_flags;       offset:2;       size:1; signed:0;
        field:unsigned char common_preempt_count;       offset:3;       size:1; signed:0;
        field:int common_pid;   offset:4;       size:4; signed:1;

        field:__data_loc char[] name;   offset:8;       size:4; signed:1; // 来源于 TP_STRUCT__entry
        field:u16 queue_mapping;        offset:12;      size:2; signed:0;
        field:const void * skbaddr;     offset:16;      size:8; signed:0;
        field:bool vlan_tagged; offset:24;      size:1; signed:0;
        field:u16 vlan_proto;   offset:26;      size:2; signed:0;
        field:u16 vlan_tci;     offset:28;      size:2; signed:0;
        field:u16 protocol;     offset:30;      size:2; signed:0;
        field:u8 ip_summed;     offset:32;      size:1; signed:0;
        field:unsigned int len; offset:36;      size:4; signed:0;
        field:unsigned int data_len;    offset:40;      size:4; signed:0;
        field:int network_offset;       offset:44;      size:4; signed:1;
        field:bool transport_offset_valid;      offset:48;      size:1; signed:0;
        field:int transport_offset;     offset:52;      size:4; signed:1;
        field:u8 tx_flags;      offset:56;      size:1; signed:0;
        field:u16 gso_size;     offset:58;      size:2; signed:0;
        field:u16 gso_segs;     offset:60;      size:2; signed:0;
        field:u16 gso_type;     offset:62;      size:2; signed:0;

print fmt: "dev=%s queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d len=%u data_len=%u network_offset=%d transport_offset_valid=%d transport_offset=%d tx_flags=%d gso_size=%d gso_segs=%d gso_type=%#x", __get_str(name), REC->queue_mapping, REC->skbaddr, REC->vlan_tagged, REC->vlan_proto, REC->vlan_tci, REC->protocol, REC->ip_summed, REC->len, REC->data_len, REC->network_offset, REC->transport_offset_valid, REC->transport_offset, REC->tx_flags, REC->gso_size, REC->gso_segs, REC->gso_type

对应的 tracepoint ctx struct 类型定义(有大量字段需要内核填充):

// vmlinux.h
// Typed ctx of the net_dev_start_xmit tracepoint handler. After the leading
// struct trace_entry, the members follow the fields of the event's format
// file; the __string "name" field appears as the 4-byte __data_loc_name slot.
struct trace_event_raw_net_dev_start_xmit {
	struct trace_entry ent;
	u32 __data_loc_name;
	u16 queue_mapping;
	const void *skbaddr;
	bool vlan_tagged;
	u16 vlan_proto;
	u16 vlan_tci;
	u16 protocol;
	u8 ip_summed;
	unsigned int len;
	unsigned int data_len;
	int network_offset;
	bool transport_offset_valid;
	int transport_offset;
	u8 tx_flags;
	u16 gso_size;
	u16 gso_segs;
	u16 gso_type;
	char __data[0];
};

而对应的 bpf_raw_tracepoint_args->args 只有两个成员的指针(TRACE_EVENT 中的 TP_PROTO):

TP_PROTO(const struct sk_buff *skb, const struct net_device *dev), // raw tracepoint 的 ctx->args 内容
TP_ARGS(skb, dev),

参考:

  1. https://mozillazg.com/2022/05/ebpf-libbpf-raw-tracepoint-common-questions.html

12 attach_trace
#

https://github.com/woodsts/linux-stable/commit/f06977f73ec0b3f0c14bc1005ffcb43edbb81796

btf_trace_xxx types, crucial for tp_btf BPF programs (raw tracepoint with verifier-checked direct memory access), have to be preserved in kernel BTF to allow verifier do its job and enforce type/memory safety. It was reported ([0]) that for kernels built with Clang current type-casting approach doesn’t preserve these types.

This patch fixes it by declaring an anonymous union for each registered tracepoint, capturing both struct bpf_raw_event_map information, as well as recording btf_trace_##call type reliably. Structurally, it’s still the same content as for a plain struct bpf_raw_event_map, so no other changes are necessary.

tracing 类型也称为 btf raw tracepoint,指的是 BTF-powered raw tracepoint (tp_btf) 或者说是 BTF-enabled raw tracepoint 。

trace 和 tracepoint 是 两个不同的类型 ,后者依赖 perf event,而前者和 raw_tp 类似,都是 bpf syscall 直接提供的 bpf link cmd 来进行 attach。

raw tracepoint(raw tp)解决了 tracepoint 的性能问题,将原始参数变成了 u64 数组,但是它并不知道参数的个数,也不知道每个参数的数据类型,无法对其做进一步的检查。btf raw tracepoint(btf raw tp)的引入,就是为了解决这一问题。它在加载的同时,会把该 tracepoint 对应的 btf 信息提交给内核,这样内核就能在 load bpf program 时对其做检查了。

btf raw tracepoint 跟常规 raw tracepoint 有一个最主要的区别是: btf 版本可以 直接在 ebpf 程序中访问内核内存 ,不需要像常规 raw tracepoint 一样需要借助类似 bpf_core_read 或 bpf_probe_read_kernel 这样的辅助函数才能访问内核内存。

BPF_PROG_TYPE_TRACING 一族的程序(fentry/fexit/fmod_ret/freplace)复用了 BPF trampoline 机制,都提供了对 btf 更加完善的支持,因此都像 btf raw tp 一样,可支持直接访问结构体内部的数据成员,以及参数检查等。而且,在实现上效率也更高。

// without BTF-typed pointers: members are read through the BPF_CORE_READ helper macro
struct task_struct *task = (struct task_struct *) bpf_get_current_task();
u32 ppid = BPF_CORE_READ(task, real_parent, tgid);

// btf enabled: the pointer chain is dereferenced directly
struct task_struct *task = (struct task_struct *) bpf_get_current_task_btf();
u32 ppid = task->real_parent->tgid;

BPF_PROG_TYPE_TRACING 有 5 种 attach 类型:

  • BPF_TRACE_RAW_TP : raw tracepoint + btf,可以直接访问成员;
  • BPF_TRACE_FENTRY: 类似于 kprobe,适用于 SEC(“fentry/XX”);
  • BPF_TRACE_FEXIT:类似于 kretprobe, 适用于 SEC(“fexit/XX”);
  • BPF_MODIFY_RETURN:适用于 SEC(“fmod_ret/XX”);
  • BPF_TRACE_ITER: 适用于 SEC(“iter/XX”);

tp_btf 实现

  • 依赖 BTF btf_id,即将 BPF Program 和 BTF ID attach 到一起。
  • 对应的 SEC 是 tp_btf/fentry/fexit/fmod_ret/freplace: attach_trace, 依赖 BTF 信息。
  • 对于老内核如 4.19.91 不提供 BPF_LINK_CREATE CMD(内核头文件: include/uapi/linux/bpf.h), 会自动 fallback 到bpf_raw_tracepoint_open 进而使用 bpf CMD BPF_RAW_TRACEPOINT_OPEN 来创建一个 Program fd;
// https://github.com/libbpf/libbpf/blob/f7eb43b90f4c8882edf6354f8585094f8f3aade0/src/libbpf.c#L11670
/* Auto-attach callback: stores the link produced by
 * bpf_program__attach_trace() in *link and returns libbpf_get_error() of
 * it. The cookie argument is not used.
 */
static int attach_trace(const struct bpf_program *prog, long cookie, struct bpf_link **link)
{
	struct bpf_link *attached = bpf_program__attach_trace(prog);

	*link = attached;
	return libbpf_get_error(attached);
}

/* Attach a program by btf_id with no options (NULL opts). */
struct bpf_link *bpf_program__attach_trace(const struct bpf_program *prog)
{
	const struct bpf_trace_opts *no_opts = NULL;

	return bpf_program__attach_btf_id(prog, no_opts);
}


/* Common logic for all BPF program types that attach to a btf_id.
 *
 * Validates opts, requires the program to be loaded, then creates a link
 * with bpf_link_create() using the program's expected attach type and the
 * cookie taken from opts. Returns the new link, or libbpf_err_ptr() of
 * -EINVAL / -ENOMEM / -errno on failure.
 */
static struct bpf_link *bpf_program__attach_btf_id(const struct bpf_program *prog,
						   const struct bpf_trace_opts *opts)
{
	/* options handed to bpf_link_create() */
	LIBBPF_OPTS(bpf_link_create_opts, link_opts);
	char errbuf[STRERR_BUFSIZE];
	struct bpf_link *new_link;
	int prog_fd, link_fd, err;

	if (!OPTS_VALID(opts, bpf_trace_opts))
		return libbpf_err_ptr(-EINVAL);

	prog_fd = bpf_program__fd(prog);
	if (prog_fd < 0) {
		pr_warn("prog '%s': can't attach before loaded\n", prog->name);
		return libbpf_err_ptr(-EINVAL);
	}

	new_link = calloc(1, sizeof(*new_link));
	if (new_link == NULL)
		return libbpf_err_ptr(-ENOMEM);
	new_link->detach = &bpf_link__detach_fd;

	/* libbpf is smart enough to redirect to BPF_RAW_TRACEPOINT_OPEN on old kernels */
	link_opts.tracing.cookie = OPTS_GET(opts, cookie, 0);

	/* Key step: the attachment is a bpf_link created via the bpf() syscall. */
	link_fd = bpf_link_create(prog_fd, 0, bpf_program__expected_attach_type(prog), &link_opts);
	if (link_fd >= 0) {
		/* the link owns the returned fd from here on */
		new_link->fd = link_fd;
		return new_link;
	}

	/* capture errno before free()/logging can disturb it */
	err = -errno;
	free(new_link);
	pr_warn("prog '%s': failed to attach: %s\n",
		prog->name, libbpf_strerror_r(err, errbuf, sizeof(errbuf)));
	return libbpf_err_ptr(err);
}

// https://github.com/libbpf/libbpf/blob/f7eb43b90f4c8882edf6354f8585094f8f3aade0/src/bpf.c#L683
int bpf_link_create(int prog_fd, int target_fd,
		    enum bpf_attach_type attach_type,
		    const struct bpf_link_create_opts *opts)
{
	const size_t attr_sz = offsetofend(union bpf_attr, link_create);
	__u32 target_btf_id, iter_info_len;
	union bpf_attr attr;
	int fd, err;

	if (!OPTS_VALID(opts, bpf_link_create_opts))
		return libbpf_err(-EINVAL);

	iter_info_len = OPTS_GET(opts, iter_info_len, 0);
	target_btf_id = OPTS_GET(opts, target_btf_id, 0);

	/* validate we don't have unexpected combinations of non-zero fields */
	if (iter_info_len || target_btf_id) {
		if (iter_info_len && target_btf_id)
			return libbpf_err(-EINVAL);
		if (!OPTS_ZEROED(opts, target_btf_id))
			return libbpf_err(-EINVAL);
	}

	memset(&attr, 0, attr_sz);
	attr.link_create.prog_fd = prog_fd;
	attr.link_create.target_fd = target_fd;
	attr.link_create.attach_type = attach_type;
	attr.link_create.flags = OPTS_GET(opts, flags, 0);

	if (target_btf_id) {
		attr.link_create.target_btf_id = target_btf_id;
		goto proceed;
	}

    // 参考 static const struct bpf_sec_def section_defs[] 中的 enum bpf_attach_type expected_attach_type;定义;

	switch (attach_type) {
    // iter+/iter.s+
	case BPF_TRACE_ITER:
		attr.link_create.iter_info = ptr_to_u64(OPTS_GET(opts, iter_info, (void *)0));
		attr.link_create.iter_info_len = iter_info_len;
		break;
	case BPF_PERF_EVENT:
		attr.link_create.perf_event.bpf_cookie = OPTS_GET(opts, perf_event.bpf_cookie, 0);
		if (!OPTS_ZEROED(opts, perf_event))
			return libbpf_err(-EINVAL);
		break;
	case BPF_TRACE_KPROBE_MULTI:
		attr.link_create.kprobe_multi.flags = OPTS_GET(opts, kprobe_multi.flags, 0);
		attr.link_create.kprobe_multi.cnt = OPTS_GET(opts, kprobe_multi.cnt, 0);
		attr.link_create.kprobe_multi.syms = ptr_to_u64(OPTS_GET(opts, kprobe_multi.syms, 0));
		attr.link_create.kprobe_multi.addrs = ptr_to_u64(OPTS_GET(opts, kprobe_multi.addrs, 0));
		attr.link_create.kprobe_multi.cookies = ptr_to_u64(OPTS_GET(opts, kprobe_multi.cookies, 0));
		if (!OPTS_ZEROED(opts, kprobe_multi))
			return libbpf_err(-EINVAL);
		break;
    // fentry+/fexit/fmod_ret/lsm
	case BPF_TRACE_FENTRY:
	case BPF_TRACE_FEXIT:
	case BPF_MODIFY_RETURN:
	case BPF_LSM_MAC:
		attr.link_create.tracing.cookie = OPTS_GET(opts, tracing.cookie, 0);
		if (!OPTS_ZEROED(opts, tracing))
			return libbpf_err(-EINVAL);
		break;
	default:
		if (!OPTS_ZEROED(opts, flags))
			return libbpf_err(-EINVAL);
		break;
	}
proceed:
    // 重点:调用 bpf 系统调用创建一个 link
	fd = sys_bpf_fd(BPF_LINK_CREATE, &attr, attr_sz);
	if (fd >= 0)
		return fd;

    // 对于老内核如 4.19.91 不提供 BPF_LINK_CREATE CMD, 需要 fallback 机制到 bpf_raw_tracepoint_open.

	/* otherwise, for few select kinds of programs that can be
	 * attached using BPF_RAW_TRACEPOINT_OPEN command, try that as
	 * a fallback for older kernels
	 */
	switch (attach_type) {
    // tp_btf+
	case BPF_TRACE_RAW_TP:
	case BPF_LSM_MAC:
	case BPF_TRACE_FENTRY:
	case BPF_TRACE_FEXIT:
	case BPF_MODIFY_RETURN:
		return bpf_raw_tracepoint_open(NULL, prog_fd);
	default:
		return libbpf_err(err);
	}
}

SEC Name 格式 : SEC(“tp_btf/<name>"):

  • <name> 与 raw tracepoint 一致。但是 libbpf 会在运行时内核的 vmlinux BTF 中查找 “btf_trace_##name” 函数指针类型定义。
SEC("tp_btf/sched_switch")
SEC("tp_btf/sys_enter")
SEC("tp_btf/sys_exit")

PROGRAM TYPE: BPF_PROG_TYPE_TRACING

区别于 BPF_PROG_TYPE_TRACEPOINT 和 BPF_PROG_TYPE_RAW_TRACEPOINT,内核从 5.5 版本才开始支持BPF_PROG_TYPE_TRACING ,故 4.19 内核不能使用。

btf raw tracepoint 跟 raw tracepoint 所能监控的事件是一样的,tp_btf event handler 的参数类型和 raw tp 一致,都是 u64 类型的数组,具体各成员的值取决于对应 tracepoint <name> 的 TP_PROTO 声明。对应的 ebpf 函数可以定义成下面这样:

/* Skeleton of a tp_btf handler for sys_enter: ctx is the raw u64 argument
 * array. The body is elided in this excerpt.
 */
SEC("tp_btf/sys_enter")
int btf_raw_tracepoint__sys_enter(u64 *ctx)
{
  // ...
}

其中 ctx[0] 对应上面 btf_trace_sys_enter 中 void * 后面的第一个参数 struct pt_regs *, ctx[1] 是第二个参数 long int 。这两个参数的含义跟前面 raw tracepoint 中所说的 TP_PROTO(struct pt_regs *regs, long id) 中的含义是一样的。

对应的,使用 btf raw tracepoint 获取 fchmodat 系统调用事件的示例程序如下:

/* tp_btf handler for sys_enter that filters on fchmodat.
 * ctx[0] is read as the struct pt_regs * and ctx[1] as the syscall id.
 * The rest of the body is elided in this excerpt.
 */
SEC("tp_btf/sys_enter")
int btf_raw_tracepoint__sys_enter(u64 *ctx)
{
    long int syscall_id = (long int)ctx[1];
    if(syscall_id != 268)    // fchmodat
        return 0;

    struct pt_regs *regs = (struct pt_regs *)ctx[0];
    // the remaining logic is the same as in the raw tracepoint example earlier in the document
    // ...
}

举例:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

/* Same calling convention as raw tp: the handler receives the u64 argument
 * array and converts each slot to its real type by hand.
 * Prints the child's pid for every sched_process_fork event.
 */
SEC("tp_btf/sched_process_fork")
int tp_btf_test_1(u64 *ctx)
{
    struct task_struct *child = (void *)(ctx[0]);
    struct task_struct *parent = (void *)(ctx[1]);

    if (child)
        bpf_printk("%lx\n", child->pid);

    return 0;
}

/* Same handler written with libbpf's BPF_PROG macro, which lets the
 * tracepoint's arguments be listed directly as typed parameters.
 */
SEC("tp_btf/sched_process_fork")
int BPF_PROG(tp_btf_test_2, struct task_struct *child, struct task_struct *parent)
{
    if (child)
        bpf_printk("%lx\n", child->pid);

    return 0;
}

/* License string placed in the object's "license" section. */
char _license[] SEC("license") = "GPL";

使用 libbpf 提供的 BPF_PROG 宏函数, 可以直接列出 tracepoint 函数的参数列表 :

  • BPF_PROG 适用于 tp_btf/fentry/fexit/freplace/fmod_ret
/* Handler written with libbpf's BPF_PROG macro, which lets the tracepoint's
 * arguments be listed directly as typed parameters.
 * Prints the child's pid for every sched_process_fork event.
 */
SEC("tp_btf/sched_process_fork")
int BPF_PROG(tp_btf_test_2, struct task_struct *child, struct task_struct *parent)
{
    if (child)
        bpf_printk("%lx\n", child->pid);

    return 0;
}

// https://elixir.bootlin.com/linux/latest/source/tools/testing/selftests/bpf/progs/task_local_storage.c
/* sys_enter handler: for the task whose pid equals target_pid, bump
 * enter_cnt and store MAGIC_VALUE + enter_cnt in the slot obtained from
 * bpf_task_storage_get(&enter_id, ...). Always returns 0.
 */
SEC("tp_btf/sys_enter")
int BPF_PROG(on_enter, struct pt_regs *regs, long id)
{
	struct task_struct *cur = bpf_get_current_task_btf();
	long *slot;

	if (cur->pid == target_pid) {
		/* the CREATE flag is passed, so NULL here means the lookup/creation failed */
		slot = bpf_task_storage_get(&enter_id, cur, 0,
					    BPF_LOCAL_STORAGE_GET_F_CREATE);
		if (slot) {
			__sync_fetch_and_add(&enter_cnt, 1);
			*slot = MAGIC_VALUE + enter_cnt;
		}
	}

	return 0;
}

/* sys_exit handler: for the task whose pid equals target_pid, bump exit_cnt
 * and count a mismatch when the value in the enter_id slot differs from
 * MAGIC_VALUE + exit_cnt. Always returns 0.
 */
SEC("tp_btf/sys_exit")
int BPF_PROG(on_exit, struct pt_regs *regs, long id)
{
	struct task_struct *cur = bpf_get_current_task_btf();
	long *slot;

	if (cur->pid == target_pid) {
		slot = bpf_task_storage_get(&enter_id, cur, 0,
					    BPF_LOCAL_STORAGE_GET_F_CREATE);
		if (slot) {
			__sync_fetch_and_add(&exit_cnt, 1);
			if (*slot != MAGIC_VALUE + exit_cnt)
				__sync_fetch_and_add(&mismatch_cnt, 1);
		}
	}

	return 0;
}

// https://gist.github.com/teknoraver/2855e0f8770d1363b57d683fa32bccc3
/* tp_btf handler for xdp_devmap_xmit: reads ifindex directly from both
 * net_device arguments and passes their sum to randmap() (defined outside
 * this excerpt). Always returns 0.
 */
SEC("tp_btf/xdp_devmap_xmit")
int BPF_PROG(tp_xdp_devmap_xmit_multi, const struct net_device *from_dev,
	     const struct net_device *to_dev, int sent, int drops, int err)

{
	randmap(from_dev->ifindex + to_dev->ifindex);
	return 0;
}

/* fentry program on eth_type_trans: passes dev->ifindex + skb->len to
 * randmap() (defined outside this excerpt). Always returns 0.
 */
SEC("fentry/eth_type_trans")
int BPF_PROG(fentry_eth_type_trans, struct sk_buff *skb,
	     struct net_device *dev, unsigned short protocol)
{
	randmap(dev->ifindex + skb->len);
	return 0;
}

/* fexit program on eth_type_trans: passes dev->ifindex + skb->len to
 * randmap() (defined outside this excerpt). Always returns 0.
 */
SEC("fexit/eth_type_trans")
int BPF_PROG(fexit_eth_type_trans, struct sk_buff *skb,
	     struct net_device *dev, unsigned short protocol)
{
	randmap(dev->ifindex + skb->len);
	return 0;
}



// https://docs.kernel.org/bpf/cpumasks.html
/* struct containing the struct bpf_cpumask kptr which is stored in the map. */
struct cpumasks_kfunc_map_value {
        /* the kptr member is named bpf_cpumask */
        struct bpf_cpumask __kptr * bpf_cpumask;
};

/* The map containing struct cpumasks_kfunc_map_value entries:
 * an array map with a single entry, keyed by int.
 */
struct {
        __uint(type, BPF_MAP_TYPE_ARRAY);
        __type(key, int);
        __type(value, struct cpumasks_kfunc_map_value);
        __uint(max_entries, 1);
} cpumasks_kfunc_map SEC(".maps");

/* ... */

/**
 * A simple example tracepoint program showing how a
 * struct bpf_cpumask * kptr that is stored in a map can
 * be passed to kfuncs using RCU protection.
 *
 * Returns 0 on success, -ENOENT when the map entry is missing and -EBUSY
 * when the entry holds no cpumask.
 */
SEC("tp_btf/cgroup_mkdir")
int BPF_PROG(cgrp_ancestor_example, struct cgroup *cgrp, const char *path)
{
        struct bpf_cpumask *kptr;
        struct cpumasks_kfunc_map_value *v;
        u32 key = 0;

        /* Assume a bpf_cpumask * kptr was previously stored in the map. */
        v = bpf_map_lookup_elem(&cpumasks_kfunc_map, &key);
        if (!v)
                return -ENOENT;

        bpf_rcu_read_lock();
        /* Load the bpf_cpumask * kptr stored in the map inside the RCU
         * read-side section. The member is named bpf_cpumask in
         * struct cpumasks_kfunc_map_value.
         */
        kptr = v->bpf_cpumask;
        if (!kptr) {
                /* If no bpf_cpumask was present in the map, it's because
                 * we're racing with another CPU that removed it with
                 * bpf_kptr_xchg() between the bpf_map_lookup_elem()
                 * above, and our load of the pointer from the map.
                 */
                bpf_rcu_read_unlock();
                return -EBUSY;
        }

        bpf_cpumask_setall(kptr);
        bpf_rcu_read_unlock();

        return 0;
}

// https://docs.kernel.org/bpf/kfuncs.html
/* task_newtask handler demonstrating bpf_task_from_pid(): looks the new
 * task up again by pid and checks that the lookup yields the same pid.
 * Returns 0 on success, -ENOENT when the lookup fails and -EINVAL on a pid
 * mismatch.
 */
SEC("tp_btf/task_newtask")
int BPF_PROG(task_get_pid_example, struct task_struct *task, u64 clone_flags)
{
        struct task_struct *found = bpf_task_from_pid(task->pid);
        int ret = 0;

        /* A task should always be found, as %task is a tracepoint arg. */
        if (!found)
                return -ENOENT;

        /* bpf_task_from_pid() looks up the task via its
         * globally-unique pid from the init_pid_ns. Thus,
         * the pid of the lookup task should always be the
         * same as the input task.
         */
        if (found->pid != task->pid)
                ret = -EINVAL;

        /* bpf_task_from_pid() returns an acquired reference,
         * so it must be dropped before returning from the
         * tracepoint handler, on both outcomes.
         */
        bpf_task_release(found);
        return ret;
}
/**
 * Simple tracepoint example that illustrates how a cgroup's
 * ancestor can be accessed using bpf_cgroup_ancestor().
 *
 * Returns 0 after printing the parent's id, or -ENOENT when no ancestor
 * is returned.
 */
SEC("tp_btf/cgroup_mkdir")
int BPF_PROG(cgrp_ancestor_example, struct cgroup *cgrp, const char *path)
{
        /* The parent cgroup resides at the level before the current cgroup's level. */
        struct cgroup *ancestor = bpf_cgroup_ancestor(cgrp, cgrp->level - 1);

        if (ancestor) {
                bpf_printk("Parent id is %d", ancestor->self.id);

                /* Release the parent cgroup that was acquired above. */
                bpf_cgroup_release(ancestor);
                return 0;
        }

        return -ENOENT;
}

BTW, 在 btf raw tracepoint 程序中可以通过 bpf_get_current_task_btf() 获取 btf 版本的 task 信息。

完整的示例程序如下:

参考:

  1. https://mozillazg.com/2022/06/ebpf-libbpf-btf-powered-enabled-raw-tracepoint-common-questions.html

13 attach_lsm
#

Linux Security Module 是内核 2.5 开始引入的 security framework,通过预定义一些 hook 点,为各种 security module机制提供上下文信息,从而实现 MACs (Mandatory Access Control,强制访问控制)。

Linux Security Modules provides with security hooks necessary to set up the least permissive perimeter for various workloads. A nice introduction to LSMs here.

内核中定义的使用 LSM 实现的 security module 类型:(grep DEFINE_LSM)

Exported grep results:

security/commoncap.c:1471:DEFINE_LSM(capability) = {
include/linux/lsm_hooks.h:110:#define DEFINE_LSM(lsm)							\
security/selinux/hooks.c:7287:DEFINE_LSM(selinux) = {
security/smack/smack_lsm.c:5104:DEFINE_LSM(smack) = {
security/bpf/hooks.c:30:DEFINE_LSM(bpf) = {
security/integrity/iint.c:177:DEFINE_LSM(integrity) = {
security/lockdown/lockdown.c:163:DEFINE_LSM(lockdown) = {
security/tomoyo/tomoyo.c:608:DEFINE_LSM(tomoyo) = {
security/loadpin/loadpin.c:267:DEFINE_LSM(loadpin) = {
security/apparmor/lsm.c:1930:DEFINE_LSM(apparmor) = {
security/landlock/setup.c:37:DEFINE_LSM(LANDLOCK_NAME) = {
security/safesetid/lsm.c:282:DEFINE_LSM(safesetid_security_init) = {
security/yama/yama_lsm.c:479:DEFINE_LSM(yama) = {

其中 bpf 类型是内核 5.7 开始支持的,这允许对一个 lsm hook attach 一个 eBPF 程序,从而实现灵活的授权、拦截策略。对应 PROGRAM TYPE 为 BPF_PROG_TYPE_LSM:

查看系统加载的 LSM Module:

  • capability:root 权限拆分和限制;
  • RHEL/CentOS:默认使用 selinux
  • Ubuntu/Debian:默认使用 apparmor
#cat /sys/kernel/security/lsm
capability,selinux

# ubuntu 20.04
zhangjun@lima-ebpf-dev:/Users/zhangjun/codes/kernel$ cat /sys/kernel/security/lsm
lockdown,capability,landlock,yama,apparmor

注意:

  1. LSM 本身是一个可选 framework,而非 kernel module,可以通过 CONFIG_SECURITY 等内核配置项来启用。
  2. 上面基于 LSM 实现的 capability,selinux,apparmor 才是具体的 security module(它们被静态编译进内核,并非可动态加载的 kernel module)。

LSM 提供的,可以被 eBPF 插桩的 LSM hooks 位于 security/security.c 文件中(具体可以参考头文件 lsm_hooks.h) :

// security/security.c

Function security_binder_set_context_mgr
Function security_binder_transaction
Function security_binder_transfer_binder
Function security_binder_transfer_file
Function security_ptrace_access_check
Function security_ptrace_traceme
Function security_capget
Function security_capset
Function security_capable
Function security_quotactl
Function security_quota_on
Function security_syslog
Function security_settime64
Function security_vm_enough_memory_mm
Function security_bprm_creds_for_exec
Function security_bprm_creds_from_file
Function security_bprm_check
Function security_bprm_committing_creds
Function security_bprm_committed_creds
Function security_fs_context_dup
Function security_fs_context_parse_param
Function security_sb_alloc
Function security_sb_delete
Function security_sb_free
Function security_free_mnt_opts
Function security_sb_eat_lsm_opts
Function security_sb_mnt_opts_compat
Function security_sb_remount
Function security_sb_kern_mount
Function security_sb_show_options
Function security_sb_statfs
Function security_sb_mount
Function security_sb_umount
Function security_sb_pivotroot
Function security_sb_set_mnt_opts
Function security_sb_clone_mnt_opts
Function security_move_mount
Function security_path_notify
Function security_inode_alloc
Function inode_free_by_rcu
Function security_inode_free
Function security_dentry_init_security
Function security_dentry_create_files_as
Function security_inode_init_security
Function security_inode_init_security_anon
Function security_path_mknod
Function security_path_mkdir
Function security_path_rmdir
Function security_path_unlink
Function security_path_symlink
Function security_path_link
Function security_path_rename
Function security_path_truncate
Function security_path_chmod
Function security_path_chown
Function security_path_chroot
Function security_inode_create
Function security_inode_link
Function security_inode_unlink
Function security_inode_symlink
Function security_inode_mkdir
Function security_inode_rmdir
Function security_inode_mknod
Function security_inode_rename
Function security_inode_readlink
Function security_inode_follow_link
Function security_inode_permission
Function security_inode_setattr
Function security_inode_getattr
Function security_inode_setxattr
Function security_inode_set_acl
Function security_inode_get_acl
Function security_inode_remove_acl
Function security_inode_post_setxattr
Function security_inode_getxattr
Function security_inode_listxattr
Function security_inode_removexattr
Function security_inode_need_killpriv
Function security_inode_killpriv
Function security_inode_getsecurity
Function security_inode_setsecurity
Function security_inode_listsecurity
Function security_inode_getsecid
Function security_inode_copy_up
Function security_inode_copy_up_xattr
Function security_kernfs_init_security
Function security_file_permission
Function security_file_alloc
Function security_file_free
Function security_file_ioctl
Function mmap_prot
Function security_mmap_file
Function security_mmap_addr
Function security_file_mprotect
Function security_file_lock
Function security_file_fcntl
Function security_file_set_fowner
Function security_file_send_sigiotask
Function security_file_receive
Function security_file_open
Function security_file_truncate
Function security_task_alloc
Function security_task_free
Function security_cred_alloc_blank
Function security_cred_free
Function security_prepare_creds
Function security_transfer_creds
Function security_cred_getsecid
Function security_kernel_act_as
Function security_kernel_create_files_as
Function security_kernel_module_request
Function security_kernel_read_file
Function security_kernel_post_read_file
Function security_kernel_load_data
Function security_kernel_post_load_data
Function security_task_fix_setuid
Function security_task_fix_setgid
Function security_task_fix_setgroups
Function security_task_setpgid
Function security_task_getpgid
Function security_task_getsid
Function security_current_getsecid_subj
Function security_task_getsecid_obj
Function security_task_setnice
Function security_task_setioprio
Function security_task_getioprio
Function security_task_prlimit
Function security_task_setrlimit
Function security_task_setscheduler
Function security_task_getscheduler
Function security_task_movememory
Function security_task_kill
Function security_task_prctl
Function security_task_to_inode
Function security_create_user_ns
Function security_ipc_permission
Function security_ipc_getsecid
Function security_msg_msg_alloc
Function security_msg_msg_free
Function security_msg_queue_alloc
Function security_msg_queue_free
Function security_msg_queue_associate
Function security_msg_queue_msgctl
Function security_msg_queue_msgsnd
Function security_msg_queue_msgrcv
Function security_shm_alloc
Function security_shm_free
Function security_shm_associate
Function security_shm_shmctl
Function security_shm_shmat
Function security_sem_alloc
Function security_sem_free
Function security_sem_associate
Function security_sem_semctl
Function security_sem_semop
Function security_d_instantiate
Function security_getprocattr
Function security_setprocattr
Function security_netlink_send
Function security_ismaclabel
Function security_secid_to_secctx
Function security_secctx_to_secid
Function security_release_secctx
Function security_inode_invalidate_secctx
Function security_inode_notifysecctx
Function security_inode_setsecctx
Function security_inode_getsecctx
Function security_post_notification
Function security_watch_key
Function security_unix_stream_connect
Function security_unix_may_send
Function security_socket_create
Function security_socket_post_create
Function security_socket_socketpair
Function security_socket_bind
Function security_socket_connect
Function security_socket_listen
Function security_socket_accept
Function security_socket_sendmsg
Function security_socket_recvmsg
Function security_socket_getsockname
Function security_socket_getpeername
Function security_socket_getsockopt
Function security_socket_setsockopt
Function security_socket_shutdown
Function security_sock_rcv_skb
Function security_socket_getpeersec_stream
Function security_socket_getpeersec_dgram
Function security_sk_alloc
Function security_sk_free
Function security_sk_clone
Function security_sk_classify_flow
Function security_req_classify_flow
Function security_sock_graft
Function security_inet_conn_request
Function security_inet_csk_clone
Function security_inet_conn_established
Function security_secmark_relabel_packet
Function security_secmark_refcount_inc
Function security_secmark_refcount_dec
Function security_tun_dev_alloc_security
Function security_tun_dev_free_security
Function security_tun_dev_create
Function security_tun_dev_attach_queue
Function security_tun_dev_attach
Function security_tun_dev_open
Function security_sctp_assoc_request
Function security_sctp_bind_connect
Function security_sctp_sk_clone
Function security_sctp_assoc_established
Function security_mptcp_add_subflow
Function security_ib_pkey_access
Function security_ib_endport_manage_subnet
Function security_ib_alloc_security
Function security_ib_free_security
Function security_xfrm_policy_alloc
Function security_xfrm_policy_clone
Function security_xfrm_policy_free
Function security_xfrm_policy_delete
Function security_xfrm_state_alloc
Function security_xfrm_state_alloc_acquire
Function security_xfrm_state_delete
Function security_xfrm_state_free
Function security_xfrm_policy_lookup
Function security_xfrm_state_pol_flow_match
Function security_xfrm_decode_session
Function security_skb_classify_flow
Function security_key_alloc
Function security_key_free
Function security_key_permission
Function security_key_getsecurity
Function security_audit_rule_init
Function security_audit_rule_known
Function security_audit_rule_free
Function security_audit_rule_match
Function security_bpf
Function security_bpf_map
Function security_bpf_prog
Function security_bpf_map_alloc
Function security_bpf_prog_alloc
Function security_bpf_map_free
Function security_bpf_prog_free
Function security_locked_down
Function security_perf_event_open
Function security_perf_event_alloc
Function security_perf_event_free
Function security_perf_event_read
Function security_perf_event_write
Function security_uring_override_creds
Function security_uring_sqpoll
Function security_uring_cmd

这些 LSM hook 点函数是内核定义的一组稳定的 hook 点函数,目的是为各种 security module 提供 context 和权限授予支持,目前 Linux Capability(位于 security/commoncap.c 文件)、SELinux、AppArmor 等都是使用这个 framework 来实现的。

  • 这些 hook 函数在 kernel 关键的位置被调用,用来管理 security field 和 perform access control;

lsm-bpf 依赖运行时内核的 vmlinux btf 信息。

SEC Name 前缀:lsm/lsm.s/lsm_cgroup,例如:

  • SEC("lsm/file_mprotect")
  • SEC("lsm.s/bprm_committed_creds")

两种 attach type:BPF_LSM_MAC 和 BPF_LSM_CGROUP:

 // The entries below are auto-attached to LSM hooks; they also depend on BTF and carry a corresponding attach type.
 SEC_DEF("lsm+",			LSM, BPF_LSM_MAC, SEC_ATTACH_BTF, attach_lsm),
 SEC_DEF("lsm.s+",		LSM, BPF_LSM_MAC, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_lsm),
 SEC_DEF("lsm_cgroup+",		LSM, BPF_LSM_CGROUP, SEC_ATTACH_BTF),
  • 最终复用 attach_trace 的 bpf_program__attach_btf_id().
// https://github.com/libbpf/libbpf/blob/f7eb43b90f4c8882edf6354f8585094f8f3aade0/src/libbpf.c#L11682

/*
 * Attach callback named in the "lsm+" / "lsm.s+" SEC_DEF entries.
 *
 * Stores the result of bpf_program__attach_lsm(prog) in *link and returns
 * libbpf_get_error() of that pointer. The cookie argument is not used here.
 */
static int attach_lsm(const struct bpf_program *prog, long cookie, struct bpf_link **link)
{
	*link = bpf_program__attach_lsm(prog);
	return libbpf_get_error(*link);
}

/*
 * Thin wrapper: forwards prog to bpf_program__attach_btf_id() with NULL as
 * the second argument and returns whatever that call returns.
 */
struct bpf_link *bpf_program__attach_lsm(const struct bpf_program *prog)
{
	return bpf_program__attach_btf_id(prog, NULL);
}
2023-08-03_17-53-22_screenshot.png From:
2023-08-03_17-52-46_screenshot.png From:

示例:

  • tp_btf/fentry/fexit/lsm 等依赖 vmlinux BTF 的 prefix,都可以使用 libbpf 提供的 BPF_PROG 函数宏来定义 event handler 函数,这样可以直接指定函数参数列表。
  • 参数里最后一个参数固定为 int ret,表示上一个 eBPF hook 程序的返回值;
    • 一个lsm hook 点函数,可能会 attach 多个处理函数,形成一个 hook func stack,内核会顺序执行这些函数;
    • 一般情况下,return 0 表示 success,其他表示错误。0 也用来表示有些操作是被 granted,否则就是不允许。
    • 一个例子是使用 lsm bpf 来对内核的漏洞进行实时修补(live patch):插桩漏洞触发路径上会调用的 lsm hook 函数,然后判断该操作是否允许,如果不允许则返回负的错误码(如 -EPERM),表示不被授权。 https://blog.cloudflare.com/zh-cn/live-patch-security-vulnerabilities-with-ebpf-lsm-zh-cn/
security/security.c
/**
 * security_file_mprotect() - Check if changing memory protections is allowed
 * @vma: memory region
 * @reqprot: application requested protection
 * @prot: protection applied by the kernel
 *
 * Check permissions before changing memory access permissions.
 *
 * Return: Returns 0 if permission is granted (annotation: 0 means the
 *         operation is allowed).
 */
int security_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot,
			   unsigned long prot)
{
	int ret;

	/* Invoke the file_mprotect hook via call_int_hook(); a non-zero result is returned as-is. */
	ret = call_int_hook(file_mprotect, 0, vma, reqprot, prot);
	if (ret)
		return ret;
	/* The hook call returned 0, so the final result comes from ima_file_mprotect(). */
	return ima_file_mprotect(vma, prot);
}


// https://docs.kernel.org/bpf/prog_lsm.html
int file_mprotect(struct vm_area_struct *vma, unsigned long reqprot, unsigned long prot);

SEC("lsm/file_mprotect")
/*
 * LSM BPF program attached to the file_mprotect hook.
 *
 * @vma:     memory region whose protection is being changed
 * @reqprot: protection requested by the application
 * @prot:    protection that will be applied by the kernel
 * @ret:     return value of the previous BPF program on this hook,
 *           or 0 if this is the first one
 *
 * Returns the previous program's non-zero verdict unchanged, -EPERM when the
 * region lies entirely inside [start_brk, brk] of the owning mm, and 0
 * (permission granted) in every other case.
 */
int BPF_PROG(mprotect_audit, struct vm_area_struct *vma,
             unsigned long reqprot, unsigned long prot, int ret)
{
        int is_heap;

        /* ret is the return value from the previous BPF program
         * or 0 if it's the first hook.
         */
        if (ret != 0)
                return ret;

        is_heap = (vma->vm_start >= vma->vm_mm->start_brk &&
                   vma->vm_end <= vma->vm_mm->brk);

        /* Return an -EPERM or write information to the perf events buffer
         * for auditing
         */
        if (is_heap)
                return -EPERM;

        /* Every path must return a value: without this the function falls off
         * its end with an undefined result. 0 grants the permission.
         */
        return 0;
}

对于 cilium/ebpf 需要使用 AttachLSM(opts LSMOptions) (Link, error) 函数来进行 attach。

其他更全面的例子:

  1. https://github.com/lumontec/lsmtrace/blob/master/src/lsmtrace.bpf.c
  2. eBPF Tutorial by Example 19: Security Detection and Defense using LSM
  3. Live-patching security vulnerabilities inside the Linux kernel with eBPF Linux Security Module

14 attach_iter
#

SEC Name 前缀: iter/iter.s

最终复用 attach_trace 的 bpf_link_create(), 只不过 attach_type 是 BPF_TRACE_ITER;

// https://github.com/libbpf/libbpf/blob/f7eb43b90f4c8882edf6354f8585094f8f3aade0/src/libbpf.c#L11808
/*
 * Attach callback for iterator programs.
 *
 * Stores the result of bpf_program__attach_iter(prog, NULL) in *link and
 * returns libbpf_get_error() of that pointer. The cookie argument is not
 * used here.
 */
static int attach_iter(const struct bpf_program *prog, long cookie, struct bpf_link **link)
{
	*link = bpf_program__attach_iter(prog, NULL);
	return libbpf_get_error(*link);
}


/*
 * Create a link for prog using bpf_link_create() with attach type
 * BPF_TRACE_ITER.
 *
 * @prog: program to attach; bpf_program__fd(prog) must be non-negative
 * @opts: optional iterator attach options; link_info / link_info_len are
 *        read through OPTS_GET() with (void *)0 and 0 as fallback values
 *
 * Returns a calloc()'d struct bpf_link on success. On failure returns
 * libbpf_err_ptr() of -EINVAL (OPTS_VALID() rejected opts, or negative
 * program fd), -ENOMEM (allocation failed) or -errno from
 * bpf_link_create().
 */
struct bpf_link *
bpf_program__attach_iter(const struct bpf_program *prog,
			 const struct bpf_iter_attach_opts *opts)
{
	DECLARE_LIBBPF_OPTS(bpf_link_create_opts, link_create_opts);
	char errmsg[STRERR_BUFSIZE];
	struct bpf_link *link;
	int prog_fd, link_fd;
	__u32 target_fd = 0; /* never reassigned: bpf_link_create() is always called with 0 */

	if (!OPTS_VALID(opts, bpf_iter_attach_opts))
		return libbpf_err_ptr(-EINVAL);

	/* Copy the iterator info from opts into the link-create options. */
	link_create_opts.iter_info = OPTS_GET(opts, link_info, (void *)0);
	link_create_opts.iter_info_len = OPTS_GET(opts, link_info_len, 0);

	prog_fd = bpf_program__fd(prog);
	if (prog_fd < 0) {
		pr_warn("prog '%s': can't attach before loaded\n", prog->name);
		return libbpf_err_ptr(-EINVAL);
	}

	/* The link object is allocated before the link fd is created. */
	link = calloc(1, sizeof(*link));
	if (!link)
		return libbpf_err_ptr(-ENOMEM);
	link->detach = &bpf_link__detach_fd;


	link_fd = bpf_link_create(prog_fd, target_fd, BPF_TRACE_ITER,
				  &link_create_opts);
	if (link_fd < 0) {
		/* errno is captured before free(link) is called. */
		link_fd = -errno;
		free(link);
		pr_warn("prog '%s': failed to attach to iterator: %s\n",
			prog->name, libbpf_strerror_r(link_fd, errmsg, sizeof(errmsg)));
		return libbpf_err_ptr(link_fd);
	}
	link->fd = link_fd;
	return link;
}

相关文章

eBPF libbpf 库解析
·11325 字
Ebpf
libbpf 库解析,涉及宏定义、内存读写等。
perf_event_open() 系统调用分析
·2476 字
Ebpf
perf_event_open() 系统调用分析
Linux 内核追踪和 eBPF 介绍
··8393 字
Ebpf Ebpf

eBPF 是当今热门的底层技术,在网络、安全、可观测性、云原生等场景得到广泛应用。

本文档先介绍 Linux 内核的各种追踪技术,让大家对于各种事件源、内核各种追踪框架、用户工具等有个初步了解,然后介绍 eBPF 的发展历程、开发和执行流程、开发框架选择和 Demo 示例,希望对于想了解 Linux 内核追踪和 eBPF 技术的同学有所帮助。

不安全:unsafe
··824 字
Rust
Rust