eBPF 常见错误

1 PerCPU Array 解决栈太小的问题
#

ARRAY MAP 的各 key 都是 index 0-N, 而且在创建时就被初始化, 所以可以用于做全局参数, 按 index 查找 item 时肯定都是存在的。

示例：

#include <uapi/linux/ptrace.h>
#include <linux/sched.h>

/*
 * Event record streamed to user space for each probe hit.
 * The 512-byte arg1 buffer makes this struct too large to keep on the BPF
 * stack, which is why a per-CPU array map slot is used as storage instead.
 */
struct ameba_event_t {
    u32 pid;                    /* filled from bpf_get_current_pid_tgid() */
    char comm[TASK_COMM_LEN];   /* filled by bpf_get_current_comm() */
    char arg1[512];             /* bytes copied from the probed function's first argument */
} __attribute__((packed));

/* Perf output channel the events are submitted on. */
BPF_PERF_OUTPUT(ameba_events);
/* Single-entry per-CPU array; entry 0 is the scratch ameba_event_t. */
BPF_PERCPU_ARRAY(ameba_struct, struct ameba_event_t, 1);

/*
 * Probe handler: copies the current task's id, its comm and the memory
 * pointed to by the probed function's first argument into the per-CPU
 * scratch event, then submits that event on ameba_events.
 *
 * Always returns 0; the event is silently skipped when the first argument
 * is NULL or the scratch slot lookup fails.
 */
int get_input_args(struct pt_regs *ctx) {
    int slot = 0;
    struct ameba_event_t *ev;

    /* Nothing to read if the first argument is a NULL pointer. */
    if (!PT_REGS_PARM1(ctx))
        return 0;

    /* Slot 0 of the per-CPU array serves as off-stack storage for the event. */
    ev = ameba_struct.lookup(&slot);
    if (!ev)
        return 0;

    /*
     * NOTE(review): the u32 assignment keeps only the low 32 bits of the
     * helper's return value; confirm this is the id user space expects.
     */
    ev->pid = bpf_get_current_pid_tgid();
    bpf_get_current_comm(&ev->comm, sizeof(ev->comm));
    bpf_probe_read(&ev->arg1, sizeof(ev->arg1), (void *)PT_REGS_PARM1(ctx));

    ameba_events.perf_submit(ctx, ev, sizeof(*ev));
    return 0;
}

2 kprobe do_execve 失败
#

/sys/kernel/debug/tracing/kprobe_events

获得内核可以 probe 的函数名称列表:

/boot/System.map
/proc/kallsyms

4.19 内核:

#uname -r
4.19.91-007.ali4000.alios7.x86_64

#grep do_execv /proc/kallsyms
ffffffff8b2a8e20 t __do_execve_file.isra.34
ffffffff8b2a97e0 T do_execve_file
ffffffff8b2a9810 T do_execve
ffffffff8b2a9840 T do_execveat

5.9 及以后内核:

root@lima-learning-ebpf:ebpf# uname -r
5.15.0-75-generic

root@lima-learning-ebpf:ebpf# grep do_execve /proc/kallsyms
ffffffff90199dd0 t do_execveat_common.isra.0
ffffffff90b2a1fc t do_execveat_common.isra.0.cold

所以在 5.9 及以后内核中 do_execve 函数在符号表中不存在,因为在 5.9 内核中该函数使用了 static 定义, 具体的git commit diff 参考这里，从该版本开始, do_execve 不在内核符号表中。

3 libbpf: BTF loading error: -22; failed to perform CO-RE relocations
#

5.2 之前的内核没有 btf 信息，但是通过 llvm 编译的字节码是 CO-RE 的，在加载时需要内核 btf 文件来进行重定位（relocation），所以如果内核没有 btf 文件时，会加载报错：libbpf: BTF loading error: -22

# /tmp/exec
libbpf: BTF loading error: -22
libbpf: -- BEGIN BTF LOAD LOG ---
magic: 0xeb9f
version: 1
flags: 0x0
hdr_len: 24
type_off: 0
type_len: 16812
str_off: 16812
str_len: 13411
btf_total_size: 30247
[1] PTR (anon) type_id=3
[2] INT int size=4 bits_offset=0 nr_bits=32 encoding=SIGNED
[3] ARRAY (anon) type_id=2 index_type_id=4 nr_elems=2
[4] INT __ARRAY_SIZE_TYPE__ size=4 bits_offset=0 nr_bits=32 encoding=(none)
[5] PTR (anon) type_id=6
[6] TYPEDEF u32 type_id=7
[7] TYPEDEF __u32 type_id=8
[8] INT unsigned int size=4 bits_offset=0 nr_bits=32 encoding=(none)
[9] PTR (anon) type_id=10
[10] ARRAY (anon) type_id=2 index_type_id=4 nr_elems=10000
[11] STRUCT (anon) size=32 vlen=4
        type type_id=1 bits_offset=0
        key type_id=5 bits_offset=64
        value type_id=5 bits_offset=128
        max_entries type_id=9 bits_offset=192
[12] INT config_args size=1 bits_offset=0 nr_bits=8 encoding=(none)
[13] PTR (anon) type_id=14
[14] ARRAY (anon) type_id=2 index_type_id=4 nr_elems=8
[15] PTR (anon) type_id=16
[16] ARRAY (anon) type_id=2 index_type_id=4 nr_elems=1
[17] STRUCT (anon) size=32 vlen=4
        type type_id=13 bits_offset=0
        key type_id=5 bits_offset=64
        value type_id=5 bits_offset=128
        max_entries type_id=15 bits_offset=192
[18] INT cgroup_map size=1 bits_offset=0 nr_bits=8 encoding=(none)
[19] PTR (anon) type_id=20
[20] ARRAY (anon) type_id=2 index_type_id=4 nr_elems=10240
[21] PTR (anon) type_id=22
[22] TYPEDEF pid_t type_id=23
[23] TYPEDEF __kernel_pid_t type_id=2
[24] PTR (anon) type_id=25
[25] STRUCT event size=7720 vlen=8
        pid type_id=22 bits_offset=0
        ppid type_id=22 bits_offset=32
        uid type_id=26 bits_offset=64
        retval type_id=2 bits_offset=96
        args_count type_id=2 bits_offset=128
        args_size type_id=8 bits_offset=160
        comm type_id=29 bits_offset=192
        args type_id=30 bits_offset=320
[26] TYPEDEF uid_t type_id=27
[27] TYPEDEF __kernel_uid32_t type_id=8
[28] INT char size=1 bits_offset=0 nr_bits=8 encoding=SIGNED
[29] ARRAY (anon) type_id=28 index_type_id=4 nr_elems=16
[30] ARRAY (anon) type_id=28 index_type_id=4 nr_elems=7680
[31] STRUCT (anon) size=32 vlen=4
        type type_id=15 bits_offset=0
        max_entries type_id=19 bits_offset=64
        key type_id=21 bits_offset=128
        value type_id=24 bits_offset=192
[32] INT execs size=1 bits_offset=0 nr_bits=8 encoding=(none)
[33] PTR (anon) type_id=34
[34] ARRAY (anon) type_id=2 index_type_id=4 nr_elems=4
[35] STRUCT (anon) size=24 vlen=3
        type type_id=33 bits_offset=0
        key_size type_id=33 bits_offset=64
        value_size type_id=33 bits_offset=128
[36] INT events size=1 bits_offset=0 nr_bits=8 encoding=(none)
[37] PTR (anon) type_id=38
[38] STRUCT trace_event_raw_sys_enter size=64 vlen=4
        ent type_id=39 bits_offset=0
        id type_id=42 bits_offset=64
        args type_id=44 bits_offset=128
        __data type_id=45 bits_offset=512
[39] STRUCT trace_entry size=8 vlen=4
        type type_id=40 bits_offset=0
        flags type_id=41 bits_offset=16
        preempt_count type_id=41 bits_offset=24
        pid type_id=2 bits_offset=32
[40] INT unsigned short size=2 bits_offset=0 nr_bits=16 encoding=(none)
[41] INT unsigned char size=1 bits_offset=0 nr_bits=8 encoding=(none)
[42] INT long size=8 bits_offset=0 nr_bits=64 encoding=SIGNED
[43] INT unsigned long size=8 bits_offset=0 nr_bits=64 encoding=(none)
[44] ARRAY (anon) type_id=43 index_type_id=4 nr_elems=6
[45] ARRAY (anon) type_id=28 index_type_id=4 nr_elems=0
[46] ENUM (anon) size=4 vlen=1
        ctx val=37
[47] TYPEDEF tracepoint__syscalls__sys_enter_execve type_id=46
[48] Invalid btf_info:840000d3
-- END BTF LOAD LOG --
libbpf: Error loading .BTF into kernel: -22. BTF is optional, ignoring.
libbpf: failed to parse target BTF: -2
libbpf: failed to perform CO-RE relocations: -2
libbpf: failed to load object 'exec_bpf'
libbpf: failed to load BPF skeleton 'exec_bpf': -2
failed to load BPF object: -2
Segmentation fault

libbpf 解决办法：

libbpf skeleton 程序会自动创建一个 struct bpf_object_open_opts 类型的 opts，然后传给 XX_bpf__open_opts(&opts)；
使用该 opts 的 opts.btf_custom_path = "/path/to/btf" 来指定 btf 文件路径。

// https://github.com/lizrice/learning-ebpf/blob/main/chapter6/hello-verifier.c#L39C1-L39C1
	/*
	 * Open the skeleton with custom options: the kernel verifier log is
	 * captured into log_buf, and btf_custom_path points libbpf at an
	 * external BTF file to use for CO-RE relocations when the running
	 * kernel does not provide its own BTF.
	 */
	char log_buf[64 * 1024];
	LIBBPF_OPTS(bpf_object_open_opts, opts,
		.kernel_log_buf = log_buf,
		.kernel_log_size = sizeof(log_buf),
		.kernel_log_level = 1,
		.btf_custom_path = "/path/to/btf/file",
	);
	skel = hello_verifier_bpf__open_opts(&opts);
	if (!skel) {
		/* Open failed; nothing has been loaded into the kernel yet. */
		printf("Failed to open BPF object\n");
		return 1;
	}

4 go1.17 和 eBPF 兼容性
#

下面的代码使用的是 go 1.16 版本，当时 Go 还是 stack-based calling convention，所以先从 pt_regs 中获得 sp，然后根据sp 的偏移来获得 go 函数的参数：

// https://github.com/pixie-io/pixie-demos/blob/main/http2-tracing/uprobe_trace/bpf_program.go#L19C1-L21C61

// This does not work for Golang programs built with toolchain version 1.17 or newer. Go 1.17 uses a
// register-based calling convention which the BPF code here cannot handle.

// Signature: func (l *loopyWriter) writeHeader(streamID uint32, endStream bool, hf []hpack.HeaderField, onWrite func())
int probe_loopy_writer_write_header(struct pt_regs* ctx) {
  // Arguments are located relative to the stack pointer saved in pt_regs
  // (stack-based Go calling convention).
  const void* stack_top = (const void*)ctx->sp;

  // Offsets from SP: the hf slice's data pointer, and its length 8 bytes after it.
  const int hf_ptr_off = 24;
  const int hf_len_off = hf_ptr_off + 8;

  void* hf_ptr;
  bpf_probe_read(&hf_ptr, sizeof(hf_ptr), stack_top + hf_ptr_off);

  int64_t hf_len;
  bpf_probe_read(&hf_len, sizeof(hf_len), stack_top + hf_len_off);

  submit_headers(ctx, hf_ptr, hf_len);
  return 0;
}

对于 go 1.17 以前的程序，bpftrace 需要使用 sarg0/sarg1 来获取函数参数：

https://github.com/iovisor/bpftrace/pull/828/files
go-bpf-gen：https://github.com/stevenjohnstone/go-bpf-gen
- Generate bpftrace scripts for use with golang programs. Works around quirks in the golang runtime.

但是 go 从 1.17 开始切换到 register-based calling convention，与 C 程序一样通过寄存器传递参数（但使用的寄存器及其顺序与 C 的调用惯例不同，见下表）

https://go.dev/doc/go1.17#compiler

Go 1.17 开始的 register-based calling convention 如下：Here is the new calling convention on the x86_64 platform, referred from the Go internal ABI specification.

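| param | argN  | go1.17+ |
|-------|-------|---------|
| 1     | rdi   | rax     |
| 2     | rsi   | rbx     |
| 3     | rdx   | rcx     |
| 4     | rcx   | rdi     |
| 5     | r8    | rsi     |
| 6     | r9    | r8      |
| 7     | stack | r9      |
| 8     | stack | r10     |
| 9     | stack | r11     |
| 10    | stack | stack   |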

5 for 循环问题
#

Kernel 5.3 之前 eBPF 不支持 loop, 要想实现类似功能，而且前提是明确 loop 的次数，那么基本上只能靠 unroll 来实现。示例如下:

#pragma clang loop unroll(full)
        for (i = 0; i < 4; i++) {
            /* Do stuff ... */
        }

#pragma clang loop unroll 是 Clang 的一个编译指令, 它在编译时展开循环。5.3 及之后，就不用这样了。

注意：

clang 编译器进行编译优化展开循环，但由于展开后的字节码大小过大，编译器可能会拒绝；
- 解决办法: 使用#pragma unroll(16) 指定 clang 展开 for 循环的次数（默认全部展开）。
for 循环可能会将 Stack 占满, 导致 load prog 出错；
for 循环的次数必须是固定的;
for循环在编译阶段就要确定是有限 for 循环
循环体内代码尽量精简(避免声明局部变量等)，减少单次循环的指令数量

参考：https://yanhang.me/post/2021-ebpf-loop/

MAP Iterator

Bounded Loop 只能解决一部分的问题。总会有 unbounded loop 的场景。5.13 引入了 map iterator, 支持对 bpf map做遍历。考虑到 bpf map 在 eBPF 程序中的普遍性，算是一个比较好的解决方案了.

具体可参考: https://lwn.net/Articles/826058/

bpf_loop

bpf_loop 仍然是 bounded loop 范畴。但从 verifier 的角度考虑，大大简化了 verify 的逻辑。相当于把 loop 从普通的函数流程里抽取了出来，单独用一个 bpf 函数来实现:

 long bpf_loop(u32 iterations, long (*loop_fn)(u32 index, void *ctx),
    		  void *ctx, u64 flags);

bpf_loop 已在 Linux 5.17 中合入主线（本文最初撰写时该 patch 尚未合并）。

6 for 循环中不能使用 break/continue，要用 goto
#

#pragma unroll 展开的 for 循环中使用 break 并不能跳出循环：

换成 goto 后才正确：

7 load failed: Invalid argument failed to load: -22
#

-22 对应 Invalid argument。

eBPF 的全局变量是使用 .rodata/.bss 等特殊 elf section 和 Map 来实现的。常见的使用场景是将 bpf_printk() 的 format string 作为全局变量来保存，例如： char const *fm = "%s\n";

.rodata: 全局只读，如上面的字符串；
.bss: 可读写的全局变量。

全局变量需要 5.2 内核才开始支持，4.19 eBPF 不支持全局变量，所以 load 时报错：-22

# ./exec  # execsnoop C 版本
libbpf: loading object 'exec_bpf' from buffer
libbpf: elf: section(3) tracepoint/syscalls/sys_enter_execve, size 1752, link 0, flags 6, type=1
libbpf: sec 'tracepoint/syscalls/sys_enter_execve': found program 'tracepoint__syscalls__sys_enter_execve' at insn offset 0 (0 bytes), code size 219 insns (1752 bytes)
libbpf: elf: section(4) .reltracepoint/syscalls/sys_enter_execve, size 128, link 16, flags 40, type=9
libbpf: elf: section(5) tracepoint/syscalls/sys_exit_execve, size 1184, link 0, flags 6, type=1
libbpf: sec 'tracepoint/syscalls/sys_exit_execve': found program 'tracepoint__syscalls__sys_exit_execve' at insn offset 0 (0 bytes), code size 148 insns (1184 bytes)
libbpf: elf: section(6) .reltracepoint/syscalls/sys_exit_execve, size 128, link 16, flags 40, type=9
libbpf: elf: section(7) .maps, size 120, link 0, flags 3, type=1
libbpf: elf: section(8) .rodata, size 7720, link 0, flags 2, type=1                      // 只读
libbpf: elf: section(9) .rodata.str1.1, size 94, link 0, flags 32, type=1                // 只读，字符串
libbpf: elf: section(10) license, size 4, link 0, flags 3, type=1
libbpf: license of exec_bpf is GPL
libbpf: elf: section(11) .BTF, size 30247, link 0, flags 0, type=1
libbpf: elf: section(13) .BTF.ext, size 2380, link 0, flags 0, type=1
libbpf: elf: section(16) .symtab, size 744, link 1, flags 0, type=2
libbpf: looking for externs among 31 symbols...
libbpf: collected 0 externs total
libbpf: map 'config_args': at sec_idx 7, offset 0.
libbpf: map 'config_args': found type = 2.
libbpf: map 'config_args': found key [6], sz = 4.
libbpf: map 'config_args': found value [6], sz = 4.
libbpf: map 'config_args': found max_entries = 10000.
libbpf: map 'cgroup_map': at sec_idx 7, offset 32.
libbpf: map 'cgroup_map': found type = 8.
libbpf: map 'cgroup_map': found key [6], sz = 4.
libbpf: map 'cgroup_map': found value [6], sz = 4.
libbpf: map 'cgroup_map': found max_entries = 1.
libbpf: map 'execs': at sec_idx 7, offset 64.
libbpf: map 'execs': found type = 1.
libbpf: map 'execs': found key [22], sz = 4.
libbpf: map 'execs': found value [25], sz = 7720.
libbpf: map 'execs': found max_entries = 10240.
libbpf: map 'events': at sec_idx 7, offset 96.
libbpf: map 'events': found type = 4.
libbpf: map 'events': found key_size = 4.
libbpf: map 'events': found value_size = 4.
libbpf: map 'exec_bpf.rodata' (global data): at sec_idx 8, offset 0, flags 480.
libbpf: map 4 is "exec_bpf.rodata"
libbpf: map '.rodata.str1.1' (global data): at sec_idx 9, offset 0, flags 480.
libbpf: map 5 is ".rodata.str1.1"
libbpf: sec '.reltracepoint/syscalls/sys_enter_execve': collecting relocation for section(3) 'tracepoint/syscalls/sys_enter_execve'
libbpf: sec '.reltracepoint/syscalls/sys_enter_execve': relo #0: insn #7 against 'config_args'
libbpf: prog 'tracepoint__syscalls__sys_enter_execve': found map 0 (config_args, sec 7, off 0) for insn #7
libbpf: sec '.reltracepoint/syscalls/sys_enter_execve': relo #1: insn #17 against 'config_args'
libbpf: prog 'tracepoint__syscalls__sys_enter_execve': found map 0 (config_args, sec 7, off 0) for insn #17
libbpf: sec '.reltracepoint/syscalls/sys_enter_execve': relo #2: insn #27 against 'config_args'
libbpf: prog 'tracepoint__syscalls__sys_enter_execve': found map 0 (config_args, sec 7, off 0) for insn #27
libbpf: sec '.reltracepoint/syscalls/sys_enter_execve': relo #3: insn #37 against 'config_args'
libbpf: prog 'tracepoint__syscalls__sys_enter_execve': found map 0 (config_args, sec 7, off 0) for insn #37
libbpf: sec '.reltracepoint/syscalls/sys_enter_execve': relo #4: insn #93 against 'cgroup_map'
libbpf: prog 'tracepoint__syscalls__sys_enter_execve': found map 1 (cgroup_map, sec 7, off 32) for insn #93
libbpf: sec '.reltracepoint/syscalls/sys_enter_execve': relo #5: insn #112 against 'execs'
libbpf: prog 'tracepoint__syscalls__sys_enter_execve': found map 2 (execs, sec 7, off 64) for insn #112
libbpf: sec '.reltracepoint/syscalls/sys_enter_execve': relo #6: insn #114 against '.rodata'
libbpf: prog 'tracepoint__syscalls__sys_enter_execve': found data map 4 (exec_bpf.rodata, sec 8, off 0) for insn 114
libbpf: sec '.reltracepoint/syscalls/sys_enter_execve': relo #7: insn #121 against 'execs'
libbpf: prog 'tracepoint__syscalls__sys_enter_execve': found map 2 (execs, sec 7, off 64) for insn #121
libbpf: sec '.reltracepoint/syscalls/sys_exit_execve': collecting relocation for section(5) 'tracepoint/syscalls/sys_exit_execve'
libbpf: sec '.reltracepoint/syscalls/sys_exit_execve': relo #0: insn #7 against 'config_args'
libbpf: prog 'tracepoint__syscalls__sys_exit_execve': found map 0 (config_args, sec 7, off 0) for insn #7
libbpf: sec '.reltracepoint/syscalls/sys_exit_execve': relo #1: insn #17 against 'config_args'
libbpf: prog 'tracepoint__syscalls__sys_exit_execve': found map 0 (config_args, sec 7, off 0) for insn #17
libbpf: sec '.reltracepoint/syscalls/sys_exit_execve': relo #2: insn #27 against 'config_args'
libbpf: prog 'tracepoint__syscalls__sys_exit_execve': found map 0 (config_args, sec 7, off 0) for insn #27
libbpf: sec '.reltracepoint/syscalls/sys_exit_execve': relo #3: insn #38 against 'config_args'
libbpf: prog 'tracepoint__syscalls__sys_exit_execve': found map 0 (config_args, sec 7, off 0) for insn #38
libbpf: sec '.reltracepoint/syscalls/sys_exit_execve': relo #4: insn #94 against 'cgroup_map'
libbpf: prog 'tracepoint__syscalls__sys_exit_execve': found map 1 (cgroup_map, sec 7, off 32) for insn #94
libbpf: sec '.reltracepoint/syscalls/sys_exit_execve': relo #5: insn #110 against 'execs'
libbpf: prog 'tracepoint__syscalls__sys_exit_execve': found map 2 (execs, sec 7, off 64) for insn #110
libbpf: sec '.reltracepoint/syscalls/sys_exit_execve': relo #6: insn #135 against 'events'
libbpf: prog 'tracepoint__syscalls__sys_exit_execve': found map 3 (events, sec 7, off 96) for insn #135
libbpf: sec '.reltracepoint/syscalls/sys_exit_execve': relo #7: insn #143 against 'execs'
libbpf: prog 'tracepoint__syscalls__sys_exit_execve': found map 2 (execs, sec 7, off 64) for insn #143
libbpf: BTF loading error: -22
libbpf: Error loading .BTF into kernel: -22. BTF is optional, ignoring.
libbpf: map 'config_args': created successfully, fd=3
libbpf: map 'cgroup_map': created successfully, fd=4
libbpf: map 'execs': created successfully, fd=5
libbpf: map 'events': setting size to 96
libbpf: map 'events': created successfully, fd=6
libbpf: map 'exec_bpf.rodata': skipped auto-creating...
libbpf: map '.rodata.str1.1': skipped auto-creating...
libbpf: sec 'tracepoint/syscalls/sys_enter_execve': found 4 CO-RE relocations
libbpf: CO-RE relocating [38] struct trace_event_raw_sys_enter: found target candidate [22619] struct trace_event_raw_sys_enter in [vmlinux]
libbpf: prog 'tracepoint__syscalls__sys_enter_execve': relo #0: <byte_off> [38] struct trace_event_raw_sys_enter.args[1] (0:2:1 @ offset 24)
libbpf: prog 'tracepoint__syscalls__sys_enter_execve': relo #0: matching candidate #0 <byte_off> [22619] struct trace_event_raw_sys_enter.args[1] (0:2:1 @ offset 24)
libbpf: prog 'tracepoint__syscalls__sys_enter_execve': relo #0: patched insn #1 (LDX/ST/STX) off 24 -> 24
libbpf: CO-RE relocating [48] struct task_struct: found target candidate [178] struct task_struct in [vmlinux]
libbpf: prog 'tracepoint__syscalls__sys_enter_execve': relo #1: <byte_off> [48] struct task_struct.real_parent (0:79 @ offset 1432)
libbpf: prog 'tracepoint__syscalls__sys_enter_execve': relo #1: matching candidate #0 <byte_off> [178] struct task_struct.real_parent (0:79 @ offset 1432)
libbpf: prog 'tracepoint__syscalls__sys_enter_execve': relo #1: patched insn #130 (ALU/ALU64) imm 1432 -> 1432
libbpf: prog 'tracepoint__syscalls__sys_enter_execve': relo #2: <byte_off> [48] struct task_struct.tgid (0:77 @ offset 1420)
libbpf: prog 'tracepoint__syscalls__sys_enter_execve': relo #2: matching candidate #0 <byte_off> [178] struct task_struct.tgid (0:77 @ offset 1420)
libbpf: prog 'tracepoint__syscalls__sys_enter_execve': relo #2: patched insn #137 (ALU/ALU64) imm 1420 -> 1420
libbpf: prog 'tracepoint__syscalls__sys_enter_execve': relo #3: <byte_off> [38] struct trace_event_raw_sys_enter.args[0] (0:2:0 @ offset 16)
libbpf: prog 'tracepoint__syscalls__sys_enter_execve': relo #3: matching candidate #0 <byte_off> [22619] struct trace_event_raw_sys_enter.args[0] (0:2:0 @ offset 16)
libbpf: prog 'tracepoint__syscalls__sys_enter_execve': relo #3: patched insn #152 (LDX/ST/STX) off 16 -> 16
libbpf: sec 'tracepoint/syscalls/sys_exit_execve': found 1 CO-RE relocations
libbpf: CO-RE relocating [351] struct trace_event_raw_sys_exit: found target candidate [22620] struct trace_event_raw_sys_exit in [vmlinux]
libbpf: prog 'tracepoint__syscalls__sys_exit_execve': relo #0: <byte_off> [351] struct trace_event_raw_sys_exit.ret (0:2 @ offset 16)
libbpf: prog 'tracepoint__syscalls__sys_exit_execve': relo #0: matching candidate #0 <byte_off> [22620] struct trace_event_raw_sys_exit.ret (0:2 @ offset 16)
libbpf: prog 'tracepoint__syscalls__sys_exit_execve': relo #0: patched insn #116 (LDX/ST/STX) off 16 -> 16
libbpf: prog 'tracepoint__syscalls__sys_enter_execve': relo #7: poisoning insn #114 that loads map #4 'exec_bpf.rodata'
libbpf: prog 'tracepoint__syscalls__sys_enter_execve': BPF program load failed: Invalid argument
libbpf: prog 'tracepoint__syscalls__sys_enter_execve': failed to load: -22
libbpf: failed to load object 'exec_bpf'
libbpf: failed to load BPF skeleton 'exec_bpf': -22
failed to load BPF object: -22
Segmentation fault

可以使用 llvm-objdump 命令来查看 object 文件中的 section：

root@lima-learning-ebpf:/hello-ebpf/processv2# llvm-objdump -dj .rodata exec.bpf.o

exec.bpf.o:     file format elf64-bpf

Disassembly of section .rodata:

0000000000000000 <empty_event>:
                ...
$ make

$ llvm-readelf --sections exec.bpf.o  |head
There are 30 section headers, starting at offset 0x8f5d0:

Section Headers:
  [Nr] Name              Type            Address          Off    Size   ES Flg Lk Inf Al
  [ 0]                   NULL            0000000000000000 000000 000000 00      0   0  0
  [ 1] .strtab           STRTAB          0000000000000000 08f388 000243 00      0   0  1
  [ 2] .text             PROGBITS        0000000000000000 000040 000000 00  AX  0   0  4
  [ 3] tracepoint/syscalls/sys_enter_execve PROGBITS 0000000000000000 000040 0004b0 00  AX  0   0  8
  [ 4] .reltracepoint/syscalls/sys_enter_execve REL 0000000000000000 068210 000070 10   I 29   3  8
  [ 5] tracepoint/syscalls/sys_exit_execve PROGBITS 0000000000000000 0004f0 0002a8 00  AX  0   0  8

// 其他命令
llvm-objdump -no-show-raw-insn -section=raw_tracepoint/sys_enter -S driver/bpf/probe.o

解决办法：将全局变量写成函数内静态变量：

static const struct event empty_event = {};

https://github.com/libbpf/libbpf-bootstrap/issues/135

As a work around, you can add #define BPF_NO_GLOBAL_DATA before #include <bpf/bpf_helpers.h> to make bpf_printk() not use static variables. That will eliminate the need to have .rodata.str1.1.

8 load program: permission denied: invalid indirect read from stack off
#

load 阶段 verifier error 的详细信息：

2023/07/13 12:10:01 list.go:49: PID: 48137, Name: exec, Cmdline: ./exec process --btf ./vmlinux-4.19.91-007.btf, Cgroup : /system.slice/sshd.service, Namespace: pid:[4026531836]
2023/07/13 12:10:02 exec.go:60: verifier error: load program: permission denied:
        74: R0=inv0 R1_w=inv1 R4_w=fp-40,call_-1 R6=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R7=inv0 R8=inv(id=0) R9=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R10=fp0,call_-1 fp-32=0 fp-40_w=inv fp-72=ctx
        74: (79) r1 = *(u64 *)(r10 -72)
        75: R0=inv0 R1_w=ctx(id=0,off=0,imm=0) R4_w=fp-40,call_-1 R6=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R7=inv0 R8=inv(id=0) R9=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R10=fp0,call_-1 fp-32=0 fp-40_w=inv fp-72=ctx
        75: (18) r2 = 0xffff889f98855e00
        77: R0=inv0 R1_w=ctx(id=0,off=0,imm=0) R2_w=map_ptr(id=0,off=0,ks=4,vs=4) R4_w=fp-40,call_-1 R6=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R7=inv0 R8=inv(id=0) R9=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R10=fp0,call_-1 fp-32=0 fp-40_w=inv fp-72=ctx
        77: (18) r3 = 0xffffffff
        79: R0=inv0 R1_w=ctx(id=0,off=0,imm=0) R2_w=map_ptr(id=0,off=0,ks=4,vs=4) R3_w=inv4294967295 R4_w=fp-40,call_-1 R6=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R7=inv0 R8=inv(id=0) R9=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R10=fp0,call_-1 fp-32=0 fp-40_w=inv fp-72=ctx
        79: (b7) r5 = 32
        80: R0=inv0 R1_w=ctx(id=0,off=0,imm=0) R2_w=map_ptr(id=0,off=0,ks=4,vs=4) R3_w=inv4294967295 R4_w=fp-40,call_-1 R5_w=inv32 R6=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R7=inv0 R8=inv(id=0) R9=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R10=fp0,call_-1 fp-32=0 fp-40_w=inv fp-72=ctx
        80: (85) call bpf_perf_event_output#25
        invalid indirect read from stack off -40+28 size 32

报错位置对应的 struct debug 数据结构：


/*
 * Layout that triggers the verifier error shown above.
 * The two __u64 members force 8-byte alignment: the members occupy 28 bytes
 * but sizeof(struct debug) is 32, so the final 4 bytes are padding
 * (this matches "off -40+28 size 32" in the verifier message).
 */
struct debug {
	__u64 index;
	__u64 type;
	pid_t pid;
	pid_t ppid;
	pid_t tgid;
};

	struct debug debug_event = {
		.type = TASK_START, .pid = pid, .ppid = ppid, .tgid = tgid
	};

# 对应的字节码：
;       struct debug debug_event = {
      31:       63 9a f0 ff 00 00 00 00 *(u32 *)(r10 - 16) = r9
      32:       63 6a ec ff 00 00 00 00 *(u32 *)(r10 - 20) = r6
      33:       63 8a e8 ff 00 00 00 00 *(u32 *)(r10 - 24) = r8

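可见 pid/ppid/tgid 是按照 u32 写入栈的，这是由于数据对齐的问题：struct debug 含有 __u64 成员，整体按 8 字节对齐，成员共占 28 字节而 sizeof 为 32，末尾 4 字节 padding 没有被初始化（即报错中的 off -40+28），verifier 因此拒绝把这 32 字节传给 bpf_perf_event_output。需要将 __u64 修改为 u8、u32、int 等不超过 32 位的类型，使结构体按 4 字节对齐、去掉末尾 padding（更稳妥的做法是先用 __builtin_memset 把整个结构体清零再赋值）：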

/*
 * Reworked layout without 64-bit members: the struct is 4-byte aligned and
 * has no trailing padding.
 * NOTE(review): 2 bytes of implicit padding remain between `type` and `pid`;
 * if the verifier still reports an invalid indirect stack read, zero the
 * whole struct before filling it — confirm on the target kernel.
 */
struct debug {
	__u8 index;
	__u8 type;
	pid_t pid;
	pid_t ppid;
	pid_t tgid;
};

9 invalid mem access ‘inv’
#

加载时报错：

原因是：eBPF 不允许通过指针访问用户空间的指针数据

// skip args[0]: filename

// 这个是 OK 的，让内核指针指向用户空间地址
const char **args = (const char **)(ctx->args[1]);

// 问题代码，原因是：eBPF 不允许通过指针访问用户空间的指针数据
if (args[i] == NULL) {
 }

解决办法：使用 bpf_probe_read 将用户指针内容读入内核变量 argp，然后可以直接使用 argp 的内容了。

const char *argp = NULL;
// 读取 arg 字符串地址，存入 argp。
ret = bpf_probe_read(&argp, sizeof(argp), &args[i]);
if (ret < 0) {
	// error
	debug_event.index = 6;
	debug_event.val = ret;
	bpf_perf_event_output(ctx, &debug_events,
			      BPF_F_CURRENT_CPU, &debug_event,
			      sizeof(debug_event));
	return 0;
}
if (argp == NULL) {
	// no more args!
	debug_event.index = 7;
	debug_event.val = 0;
	bpf_perf_event_output(ctx, &debug_events,
			      BPF_F_CURRENT_CPU, &debug_event,
			      sizeof(debug_event));
	goto output;
}

10 load program: permission denied: stack depth 56
#

load 报错：program tracepoint__syscalls__sys_enter_execve: load program: permission denied: stack depth 56 (147 line(s) omitted)：

原因是使用了如下 for 循环，次数不固定；需要将后面的 && 内容去掉。


/* loop 次数必须固定, 经过测试, 超过 10 次后会导致字节码加载失败
   #pragma unroll指令告诉编译器将循环展开为10次迭代，以提高程序的运行效率。 */
#pragma unroll
for (int i = 0; i < 10 && *args[i] != '\x00'; i++) {
	const char *argp = NULL;
	// 读取 arg 字符串地址，存入 argp。
	bpf_probe_read_user(&argp, sizeof(argp), &args[i]);
	if (!argp) {
		// no more args!
		debug_event.index = 4;
		bpf_perf_event_output(ctx, &debug_events,
				      BPF_F_CURRENT_CPU, &debug_event,
				      sizeof(debug_event));
		goto out;
	}

11 R1 unbounded memory access, make sure to bounds check any such access
#

eBPF 要求在访问数组元素时，要检查是否越界，否则会报类似于下面的错误：

regs=5 stack=0 before 203: (61) r2 = *(u32 *)(r7 +48)
regs=1 stack=0 before 202: (6d) if r3 s> r2 goto pc-71
regs=1 stack=0 before 201: (b7) r1 = 9
regs=1 stack=0 before 200: (b7) r3 = 0
regs=1 stack=0 before 199: (c7) r2 s>>= 32
regs=1 stack=0 before 198: (67) r2 <<= 32
regs=1 stack=0 before 197: (bf) r2 = r0
regs=1 stack=0 before 196: (85) call bpf_probe_read#4
210: R0_w=inv(id=13) R1_w=map_value(id=0,off=164,ks=8,vs=1448,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R2_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R3_w=inv0 R6=inv2147483648 R7=map_value(id=0,off=0,ks=8vs=1448,imm=0) R8=inv(id=10) R9_w=map_value(id=0,off=164,ks=8,vs=1448,imm=0) R10=fp0 fp-8=mmmm???? fp-16=00000000 fp-24=inv fp-32=mmmmmm00 fp-40=mmmmmmmm fp-48=mmmm???? fp-56=mmmmmmmm fp-64=mmmmmmmm fp-72=ctx
210: (71) r2 = *(u8 *)(r1 +0)
R0_w=inv(id=13) R1_w=map_value(id=0,off=164,ks=8,vs=1448,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R2_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R3_w=inv0 R6=inv2147483648 R7=map_value(id=0,off=0,ks=8,vs=448,imm=0) R8=inv(id=10) R9_w=map_value(id=0,off=164,ks=8,vs=1448,imm=0) R10=fp0 fp-8=mmmm???? fp-16=00000000 fp-24=inv fp-32=mmmmmm00 fp-40=mmmmmmmm fp-48=mmmm???? fp-56=mmmmmmmm fp-64=mmmmmmmm fp-72=ctx
R1 unbounded memory access, make sure to bounds check any such access
verification time 3441 usec
stack depth 72
processed 251 insns (limit 1000000) max_states_per_insn 2 total_states 19 peak_states 19 mark_read 7

对应的 C 代码：

/*
 * Failing example: copies up to 9 argv strings (args[1]..args[9]) into
 * event->args, emitting a debug_event and bailing out on any error.
 */
int i;
#pragma unroll
for (i = 1; i < 10; i++) {
	const char *argp = NULL;
	// Read the address of the i-th argument string into argp.
	ret = bpf_probe_read(&argp, sizeof(argp), &args[i]);
	if (ret < 0) {
		// error
		debug_event.index = 6;
		debug_event.val = ret;
		bpf_perf_event_output(ctx, &debug_events,
				      BPF_F_CURRENT_CPU, &debug_event,
				      sizeof(debug_event));
		return 0;
	}
	if (argp == NULL) {
		// no more args!
		debug_event.index = 7;
		debug_event.val = 0;
		bpf_perf_event_output(ctx, &debug_events,
				      BPF_F_CURRENT_CPU, &debug_event,
				      sizeof(debug_event));
		goto output;
	}
	if (event->args_size + ARGSIZE > sizeof(event->args)) {
		// no args space!
		debug_event.index = 8;
		debug_event.val = event->args_size;
		bpf_perf_event_output(ctx, &debug_events,
				      BPF_F_CURRENT_CPU, &debug_event,
				      sizeof(debug_event));
		goto output;
	}


	/* Read the string at argp into event->args; strings are separated by '\x00'. */
	ret = bpf_probe_read(&event->args[event->args_size], ARGSIZE,
			     argp);
	if (ret < 0) {
		// error
		debug_event.index = 9;
		debug_event.val = ret;
		bpf_perf_event_output(ctx, &debug_events,
				      BPF_F_CURRENT_CPU, &debug_event,
				      sizeof(debug_event));
		return 0;
	}
	/*
	 * NOTE(review): after this addition args_size is no longer covered by
	 * the capacity check above, so the index used on the next lines has no
	 * upper bound the verifier can prove — this looks like the access
	 * reported as "R1 unbounded memory access". Also, bpf_probe_read()
	 * is documented to return 0 on success (not a byte count), so args_size
	 * may never advance; bpf_probe_read_str() returns a length — confirm.
	 */
	event->args_size += ret;
	if (event->args[event->args_size] != '\x0') {
		event->args[event->args_size] = '\x0';
	}
	event->args_count++;
}

output:

12 bpf_printk 和 cleanup trace_pipe
#

4.19 不支持全局变量, 而 bpf_printk() 使用全局变量来保存格式字符串, 所以不兼容(需要 5.2+ 支持)。

bpf_printk() 是 libbpf 库的封装，底层使用 bpf_trace_printk() helper func。

trace-pipe 输出格式: https://www.kernel.org/doc/html/v5.10/trace/ftrace.html#trace-pipe

解决办法: 直接使用 bpf_trace_printk() helper func。

//  调试模式再打开, 否则两个字符串占用宝贵的栈空间字节
char fmt1[] = "enter: args: pid: %d, filter cg: %d, ignore_failed: %d.\n";
bpf_trace_printk(fmt1, sizeof(fmt1), pid, _filter_cg, _ignore_failed);

注意： eBPF helper 函数最多只能有 5 个参数，所以 bpf_trace_printk() 也最多只能有 5 个参数（fmt、fmt_size 加上最多 3 个格式化参数），而且除了第一个参数是字符串外，其他参数都只能是数字。

Q: can more than 5 function arguments be supported in the future?

A: NO. BPF calling convention only allows registers R1-R5 to be used as arguments. BPF is not a standalone instruction set. (unlike x64 ISA that allows msft, cdecl and other conventions)

      /* 调试模式再打开, 否则两个字符串占用宝贵的栈空间字节 */
      /* char fmt1[] = "enter: args: pid: %d, filter cg: %d, ignore_failed: %d.\n";
      */
      /* bpf_trace_printk(fmt1, sizeof(fmt1), pid, _filter_cg, _ignore_failed); */

生产环境不建议使用 bpf_trace_printk() 打印大量调试信息，而应该使用 bpf_perf_event_output() : Q: bpf_trace_printk() helper warning Q: When bpf_trace_printk() helper is used the kernel prints nasty warning message. Why is that?

A: This is done to nudge program authors into better interfaces when programs need to pass data to user space. Like bpf_perf_event_output() can be used to efficiently stream data via perf ring buffer. BPF maps can be used for asynchronous data sharing between kernel and user space. bpf_trace_printk() should only be used for debugging.

12.1 cleanup trace_pipe
#

bpf_printk 打印的 message 会一直缓存在内核，直到溢出。所以如果以前已经运行过 eBPF 程序，则使用

cat /sys/kernel/debug/tracing/trace_pipe

命令输出的可能还是上一次缓存的内容，所以需要 cleanup。

https://unix.stackexchange.com/questions/747990/how-to-clear-the-sys-kernel-debug-tracing-trace-pipe-quickly

In general, trace and trace_pipe have the same data. The difference is that trace is static; Events don't get deleted from it, just appended (up until the size of the buffer, which you can show and set in /sys/kernel/debug/tracing/buffer_size_kb).

In trace_pipe, however, once you read a certain event, it will disappear from this file (kind of like a FIFO queue). So if you run cat /sys/kernel/debug/tracing/trace_pipe, all the events in this file are cleared (at least until the next events).

The thing is, the trace_pipe file doesn't have EOF (End Of File). this cat command will never end, and will keep waiting for new events indefinitely. Maybe that’s the reason you think it takes a long time - this command will never finish, either waiting for new events or reading them when they appear.

Clearing the buffer from trace and trace_pipe If you want to clear all the events from both files, you should simply write into the trace file:

$ echo > /sys/kernel/debug/tracing/trace

This will clear both trace and trace_pipe files. Of course, they will still get new events until you disable the tracing.

13 bpftool --debug prog load xx.o /sys/fs/bpf/xx type xdp
#

https://qmonnet.github.io/whirl-offload/2021/09/23/bpftool-features-thread/

14 parsing perf event error: EOF
#

这个是在执行 binary.Read(bytes.NewBuffer(record.RawSample), binary.LittleEndian, &event) 报错时打印的。

2023/07/11 23:56:43 pid: 55848, ppid: 79451, type: started, cgroup: 4294967297, ret: 0, comm: start, filename: /usr/local/sbin/ps, argCount: 5, args: ps -o rsz -p 79794
2023/07/11 23:56:43 pid: 55848, ppid: 79451, type: exited, cgroup: 4294967297, ret: -2, comm: start, filename: /usr/local/sbin/ps, argCount: 5, args: ps -o rsz -p 79794
2023/07/11 23:56:43 pid: 55848, ppid: 79451, type: started, cgroup: 4294967297, ret: 0, comm: start, filename: /usr/local/bin/ps, argCount: 5, args: ps -o rsz -p 79794
2023/07/11 23:56:43 parsing perf event error: EOF
2023/07/11 23:56:43 pid: 55848, ppid: 79451, type: started, cgroup: 4294967297, ret: 0, comm: start, filename: /usr/bin/ps, argCount: 5, args: ps -o rsz -p 79794
2023/07/11 23:56:43 pid: 55848, ppid: 79451, type: exited, cgroup: 4294967297, ret: 0, comm: ps, filename: /usr/bin/ps, argCount: 5, args: ps -o rsz -p 79794

// perf event
perfReader, err := perf.NewReader(objs.bpfMaps.Events, 4*os.Getpagesize())
if err != nil {
	log.Fatal(err)
}
var event bpfEvent
for {
	record, err := perfReader.Read()
	if err != nil {
		if errors.Is(err, perf.ErrClosed) {
			return
		}
		log.Printf("reading from perf event reader error: %v", err)
		continue
	}
	// 报错位置！
	if err := binary.Read(bytes.NewBuffer(record.RawSample), binary.LittleEndian, &event); err != nil {
		log.Printf("parsing perf event error: %v", err)
		continue
	}
	log.Printf("%s\n", event)
}

这是由于当 kernel perf lost sample 时，record.RawSample 是空的（但是 record.LostSamples > 0），所以 perf Reader 在 Read() 到一个 record 后，需要先判断 record.LostSamples 是否大于零，只有当它等于 0 时才能用 binary.Read() 来解码：

// Fixed version: look at record.LostSamples before decoding record.RawSample.
var event bpfEvent
for {
	record, err := perfReader.Read()
	if err != nil {
		// A closed reader ends the loop; any other read error is logged and skipped.
		if errors.Is(err, perf.ErrClosed) {
			log.Println("Received signal, exiting..")
			return
		}
		log.Printf("reading from perf event reader error: %v", err)
		continue
	}
	// https://github.com/cilium/ebpf/blob/master/perf/reader.go#L72
	// A record reporting lost samples is only logged; decoding is attempted
	// only when LostSamples is not greater than zero.
	if record.LostSamples > 0 {
		log.Printf("kernel perf event buff full, losted sample count: %d", record.LostSamples)
	} else if err := binary.Read(bytes.NewBuffer(record.RawSample), binary.LittleEndian, &event); err != nil {
		log.Printf("parsing perf event error: %v", err)
	} else {
		log.Printf("EVENT: %s\n", event)
	}
}

15 4.19 内核 bpf_probe_read_user_str bug 导致 deadlock
#

bpf_probe_read_user_str 返回值类型是 int，出错时返回负值，特别地，-14（-EFAULT）表示从用户空间地址读取数据时发生 page fault：

// Read the string at ctx->args[0] into event->filename, passing
// TASK_FILENAME_LEN as the size argument.
ret = bpf_probe_read_user_str(&event->filename[0], TASK_FILENAME_LEN,
			      (const char *)ctx->args[0]);
if (ret < 0) {
	// On success ret is at least 1 (the empty-filename case).
	// On failure, emit a debug event carrying ret and stop here.
	debug_event.index = 3;
	debug_event.val = ret;
	bpf_perf_event_output(ctx, &debug_events, BPF_F_CURRENT_CPU,
			      &debug_event, sizeof(debug_event));
	return 0;
}

上面的 ctx 类型是 struct trace_event_raw_sys_enter 的指针：

struct trace_event_raw_sys_enter {
	struct trace_entry ent;
	long int id;
	long unsigned int args[6];
	char __data[0];
};

struct trace_event_raw_sys_exit {
	struct trace_entry ent;
	long int id;
	long int ret;
	char __data[0];
};

其中的 long unsigned int args[6]; 是系统调用参数指针数组，对于 execve 来说，系统调用声明如下：

ctx->args[0] 表示 const char __user *filename
ctx->args[1] 表示 const char __user *const __user *argv
ctx->args[2] 表示 const char __user *const __user *envp

可见 filename、argv 和 envp 都是带 __user 标记的指针，表示指向用户空间地址：

// fs/exec.c
SYSCALL_DEFINE3(execve,
		const char __user *, filename,
		const char __user *const __user *, argv,
		const char __user *const __user *, envp)
{
	return do_execve(getname(filename), argv, envp);
}

// fs/exec.c
int do_execve(struct filename *filename,
	const char __user *const __user *__argv,
	const char __user *const __user *__envp)
{
	struct user_arg_ptr argv = { .ptr.native = __argv };
	struct user_arg_ptr envp = { .ptr.native = __envp };
	return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
}

eBPF 程序中只能使用 helper func 来读取用户空间地址的内容：

bpf_probe_read_user
bpf_probe_read_user_str
bpf_probe_read

16 Program too large (103 insns), at most 4096 insns
#

https://stackoverflow.com/questions/70147464/program-too-large-threshold-greater-than-actual-instruction-count

原因是: 5.2 以前的内核, 指令数量限制为 4096 条 (BPF_MAXINSNS)。如果编译后的 eBPF 程序复杂度太高, 例如 for + if condition 的情况, 由于 eBPF verifier 在验证过程中会遍历所有 if 分支, 则有可能在程序本身还没有达到 4096 条指令的情况下, 就报上面的错误。

// https://github.com/iovisor/bcc/blob/master/src/cc/libbpf.c#L915
if (ret < 0 && errno == E2BIG) {
  fprintf(stderr,
          "bpf: %s. Program %s too large (%u insns), at most %d insns\n\n",
          strerror(errno), attr->name, insns_cnt, BPF_MAXINSNS);
  return -1;
}

17 常见 verifier 出错提示消息
#

有时当我们加载编译后的 eBPF 程序的时候，eBPF 验证器会提示程序中有类型错误的问题导致程序加载失败。本文记录一下这种错误的一种解决方法。

错误示例:

SEC("iter/bpf_sk_storage_map")
int iter__bpf_sk_storage_map(struct bpf_iter__bpf_sk_storage_map *ctx)
{
    if (ctx->sk)
        bpf_sk_storage_delete(&sk_storage_map, ctx->sk);

    return 0;
}

加载到内核提示如下错误:

libbpf: prog 'iter__bpf_sk_storage_map': BPF program load failed: Permission denied
libbpf: prog 'iter__bpf_sk_storage_map': -- BEGIN PROG LOAD LOG --
R1 type=ctx expected=fp
; if (ctx->sk)
0: (79) r2 = *(u64 *)(r1 +16)
; if (ctx->sk)
1: (15) if r2 == 0x0 goto pc+4
 R1=ctx(id=0,off=0,imm=0) R2_w=ptr_sock(id=0,off=0,imm=0) R10=fp0
; bpf_sk_storage_delete(&sk_storage_map, ctx->sk);
2: (79) r2 = *(u64 *)(r1 +16)
; bpf_sk_storage_delete(&sk_storage_map, ctx->sk);
3: (18) r1 = 0xffffa0658305aa00
5: (85) call bpf_sk_storage_delete#108
R2 type=ptr_or_null_ expected=ptr_
processed 5 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0
-- END PROG LOAD LOG --
libbpf: prog 'iter__bpf_sk_storage_map': failed to load: -13
libbpf: failed to load object 'main.bpf.o'
failed to load BPF object: permission denied

这个错误信息有两个关键错误，一个错误是:

R1 type=ctx expected=fp
; if (ctx->sk)
0: (79) r2 = *(u64 *)(r1 +16)
; if (ctx->sk)
1: (15) if r2 == 0x0 goto pc+4

其中 R1 type=ctx expected=fp 说的是，验证器期望 R1 的类型是 fp 而不是 ctx 。所谓的 fp 指的是栈上的指针类型 ，即期望 R1 是栈上的数据而不是 ctx 。

另一个错误是:

R1=ctx(id=0,off=0,imm=0) R2_w=ptr_sock(id=0,off=0,imm=0) R10=fp0
; bpf_sk_storage_delete(&sk_storage_map, ctx->sk);
2: (79) r2 = *(u64 *)(r1 +16)
; bpf_sk_storage_delete(&sk_storage_map, ctx->sk);
3: (18) r1 = 0xffffa0658305aa00
5: (85) call bpf_sk_storage_delete#108
R2 type=ptr_or_null_ expected=ptr_

其中 R2 type=ptr_or_null_ expected=ptr_ 说的是，验证器期望 R2 的类型是 ptr_ 而不是 ptr_or_null_，即，期望 R2 是一个非空指针而不是一个可能为 NULL 的指针。这里可能会有点疑惑，前面的判断 if (ctx->sk) 已经确保了不会为 NULL，为啥这里还会认为它有可能为 NULL？这是因为前面的 if 判断的不是栈变量，存在 R1 type=ctx expected=fp 的问题，也就无法保证它一定不是 NULL 了。从日志中的第 2 条指令也可以看到，调用 helper 之前 r2 又从 ctx 重新加载了一次（r2 = *(u64 *)(r1 +16)），这个新加载的值并没有经过 NULL 检查。

解决办法也很简单，就是用一个临时变量保存 ctx->sk 的值，然后用这个栈上的临时变量做后续的操作:

 SEC("iter/bpf_sk_storage_map")
 int iter__bpf_sk_storage_map(struct bpf_iter__bpf_sk_storage_map *ctx)
 {
-    if (ctx->sk)
-        bpf_sk_storage_delete(&sk_storage_map, ctx->sk);
+    struct sock *sk = ctx->sk;
+    if (sk)
+        bpf_sk_storage_delete(&sk_storage_map, sk);

     return 0;
 }

这里记录一下类似前面 fp 这样的常见类型关键字具体的含义：

关键字：含义
scalar：标量类型（scalar type），不是一个有效的指针类型
ctx：bpf_context 指针
map_ptr：bpf_map 类型的指针
map_value：指向 map 中的元素 value 的指针
map_value_or_null：指向 map 中的元素 value 的指针或 NULL
map_key：指向 map 中的元素 key 的指针
fp：栈上的指针（frame pointer）
pkt：skb->data 指针
pkt_meta：skb->data - meta_len 位置的指针
pkt_end：skb->data + headlen 位置的指针
sock：bpf_sock 类型的指针
sock_or_null：bpf_sock 类型的指针或 NULL
sock_common：sock_common 类型指针
sock_common_or_null：sock_common 类型指针或 NULL
tcp_sock：tcp_sock 类型指针
tcp_sock_or_null：tcp_sock 类型指针或 NULL
tp_buffer：可写的 raw tracepoint buffer 指针
xdp_sock：xdp_sock 类型指针
ptr_：一个 BTF ID，非空指针
ptr_or_null_：一个 BTF ID 或 NULL，可能为空的指针
dynptr_ptr：动态指针（dynptr 指针）
mem：指向一块有效内存区域的指针
mem_or_null：指向一块有效内存区域的指针或 NULL
buf：指向一个读/写 buffer 的指针
func：BPF 程序函数指针
inv：无效类型（invalid type），不是一个有效的指针类型
flow_keys：bpf_flow_keys 类型的指针
percpu_ptr_：指向一个 percpu 内核变量的指针
rdonly_buf：指向一个只读 buffer 的指针
rdonly_buf_or_null：指向一个只读 buffer 的指针或 NULL
rdwr_buf：指向一个读/写 buffer 的指针
rdwr_buf_or_null：指向一个读/写 buffer 的指针或 NULL

// https://elixir.bootlin.com/linux/v5.19.14/source/kernel/bpf/verifier.c#L533

/* string representation of 'enum bpf_reg_type'
 *
 * Note that reg_type_str() can not appear more than once in a single verbose()
 * statement.
 */
static const char *reg_type_str(struct bpf_verifier_env *env,
				enum bpf_reg_type type)
{
	char postfix[16] = {0}, prefix[32] = {0};
	static const char * const str[] = {
		[NOT_INIT]		= "?",
		[SCALAR_VALUE]		= "scalar",
		[PTR_TO_CTX]		= "ctx",
		[CONST_PTR_TO_MAP]	= "map_ptr",
		[PTR_TO_MAP_VALUE]	= "map_value",
		[PTR_TO_STACK]		= "fp",
		[PTR_TO_PACKET]		= "pkt",
		[PTR_TO_PACKET_META]	= "pkt_meta",
		[PTR_TO_PACKET_END]	= "pkt_end",
		[PTR_TO_FLOW_KEYS]	= "flow_keys",
		[PTR_TO_SOCKET]		= "sock",
		[PTR_TO_SOCK_COMMON]	= "sock_common",
		[PTR_TO_TCP_SOCK]	= "tcp_sock",
		[PTR_TO_TP_BUFFER]	= "tp_buffer",
		[PTR_TO_XDP_SOCK]	= "xdp_sock",
		[PTR_TO_BTF_ID]		= "ptr_",
		[PTR_TO_MEM]		= "mem",
		[PTR_TO_BUF]		= "buf",
		[PTR_TO_FUNC]		= "func",
		[PTR_TO_MAP_KEY]	= "map_key",
	};

	if (type & PTR_MAYBE_NULL) {
		if (base_type(type) == PTR_TO_BTF_ID)
			strncpy(postfix, "or_null_", 16);
		else
			strncpy(postfix, "_or_null", 16);
	}

	if (type & MEM_RDONLY)
		strncpy(prefix, "rdonly_", 32);
	if (type & MEM_ALLOC)
		strncpy(prefix, "alloc_", 32);
	if (type & MEM_USER)
		strncpy(prefix, "user_", 32);
	if (type & MEM_PERCPU)
		strncpy(prefix, "percpu_", 32);
	if (type & PTR_UNTRUSTED)
		strncpy(prefix, "untrusted_", 32);

	snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s",
		 prefix, str[base_type(type)], postfix);
	return env->type_str_buf;
}

18 verifier error: load program: invalid argument:
#

2023/07/26 07:52:00 exec.go:64: verifier error: load program: invalid argument:
func#0 @0
func#1 @308
Validating parse_args() func#1...
arg#0 reference type('UNKNOWN ') size cannot be determined: -22
verification time 78 usec
stack depth 0+0
processed 0 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0

原因是：

int parse_args(void *ctx, struct debug *debug_event, const char **args, u8 *buff, int isEnv) {

修正：

int parse_args(struct trace_event_raw_sys_enter *ctx, struct debug *debug_event, const char **args, u8 *buff, int isEnv) {

19 verifier error: load program: permission denied:
#

2023/07/26 08:01:41 exec.go:64: verifier error: load program: permission denied:
func#0 @0
func#1 @308
Validating parse_args() func#1...
308: R1=mem_or_null(id=5,ref_obj_id=0,off=0,imm=0) R2=mem_or_null(id=6,ref_obj_id=0,off=0,imm=0) R3=mem_or_null(id=7,ref_obj_id=0,off=0,imm=0) R4=mem_or_null(id=8,ref_obj_id=0,off=0,imm=0) R5=invP(id=0) R10=fp0
; int parse_args(struct trace_event_raw_sys_enter *ctx, struct debug *debug_event, const char **args, u8 *buff, int isEnv) {
    308: (7b) *(u64 *)(r10 -16) = r4
    309: R1=mem_or_null(id=5,ref_obj_id=0,off=0,imm=0) R2=mem_or_null(id=6,ref_obj_id=0,off=0,imm=0) R3=mem_or_null(id=7,ref_obj_id=0,off=0,imm=0) R4=mem_or_null(id=8,ref_obj_id=0,off=0,imm=0) R5=invP(id=0) R10=fp0 fp-16_w=mem_or_null
    309: (bf) r8 = r3
    310: R1=mem_or_null(id=5,ref_obj_id=0,off=0,imm=0) R2=mem_or_null(id=6,ref_obj_id=0,off=0,imm=0) R3=mem_or_null(id=7,ref_obj_id=0,off=0,imm=0) R4=mem_or_null(id=8,ref_obj_id=0,off=0,imm=0) R5=invP(id=0) R8_w=mem_or_null(id=7,ref_obj_id=0,off=0,imm=0) R10=fp0 fp-16_w=mem_or_null
    310: (bf) r6 = r2
    311: R1=mem_or_null(id=5,ref_obj_id=0,off=0,imm=0) R2=mem_or_null(id=6,ref_obj_id=0,off=0,imm=0) R3=mem_or_null(id=7,ref_obj_id=0,off=0,imm=0) R4=mem_or_null(id=8,ref_obj_id=0,off=0,imm=0) R5=invP(id=0) R6_w=mem_or_null(id=6,ref_obj_id=0,off=0,imm=0) R8_w=mem_or_null(id=7,ref_obj_id=0,off=0,imm=0) R10=fp0 fp-16_w=mem_or_null
    311: (bf) r9 = r1
    312: R1=mem_or_null(id=5,ref_obj_id=0,off=0,imm=0) R2=mem_or_null(id=6,ref_obj_id=0,off=0,imm=0) R3=mem_or_null(id=7,ref_obj_id=0,off=0,imm=0) R4=mem_or_null(id=8,ref_obj_id=0,off=0,imm=0) R5=invP(id=0) R6_w=mem_or_null(id=6,ref_obj_id=0,off=0,imm=0) R8_w=mem_or_null(id=7,ref_obj_id=0,off=0,imm=0) R9_w=mem_or_null(id=5,ref_obj_id=0,off=0,imm=0) R10=fp0 fp-16_w=mem_or_null
    312: (b7) r7 = 0
    313: R1=mem_or_null(id=5,ref_obj_id=0,off=0,imm=0) R2=mem_or_null(id=6,ref_obj_id=0,off=0,imm=0) R3=mem_or_null(id=7,ref_obj_id=0,off=0,imm=0) R4=mem_or_null(id=8,ref_obj_id=0,off=0,imm=0) R5=invP(id=0) R6_w=mem_or_null(id=6,ref_obj_id=0,off=0,imm=0) R7_w=invP0 R8_w=mem_or_null(id=7,ref_obj_id=0,off=0,imm=0) R9_w=mem_or_null(id=5,ref_obj_id=0,off=0,imm=0) R10=fp0 fp-16_w=mem_or_null
    ; const char *argp = NULL;
    313: (7b) *(u64 *)(r10 -8) = r7
    314: R1=mem_or_null(id=5,ref_obj_id=0,off=0,imm=0) R2=mem_or_null(id=6,ref_obj_id=0,off=0,imm=0) R3=mem_or_null(id=7,ref_obj_id=0,off=0,imm=0) R4=mem_or_null(id=8,ref_obj_id=0,off=0,imm=0) R5=invP(id=0) R6_w=mem_or_null(id=6,ref_obj_id=0,off=0,imm=0) R7_w=invP0 R8_w=mem_or_null(id=7,ref_obj_id=0,off=0,imm=0) R9_w=mem_or_null(id=5,ref_obj_id=0,off=0,imm=0) R10=fp0 fp-8_w=00000000 fp-16_w=mem_or_null
    314: (bf) r1 = r10
    315: R1_w=fp0 R2=mem_or_null(id=6,ref_obj_id=0,off=0,imm=0) R3=mem_or_null(id=7,ref_obj_id=0,off=0,imm=0) R4=mem_or_null(id=8,ref_obj_id=0,off=0,imm=0) R5=invP(id=0) R6_w=mem_or_null(id=6,ref_obj_id=0,off=0,imm=0) R7_w=invP0 R8_w=mem_or_null(id=7,ref_obj_id=0,off=0,imm=0) R9_w=mem_or_null(id=5,ref_obj_id=0,off=0,imm=0) R10=fp0 fp-8_w=00000000 fp-16_w=mem_or_null
    ;
    315: (07) r1 += -8
    316: R1_w=fp-8 R2=mem_or_null(id=6,ref_obj_id=0,off=0,imm=0) R3=mem_or_null(id=7,ref_obj_id=0,off=0,imm=0) R4=mem_or_null(id=8,ref_obj_id=0,off=0,imm=0) R5=invP(id=0) R6_w=mem_or_null(id=6,ref_obj_id=0,off=0,imm=0) R7_w=invP0 R8_w=mem_or_null(id=7,ref_obj_id=0,off=0,imm=0) R9_w=mem_or_null(id=5,ref_obj_id=0,off=0,imm=0) R10=fp0 fp-8_w=00000000 fp-16_w=mem_or_null
    ; ret = bpf_probe_read(&argp, sizeof(argp), &args[i]);
    316: (b7) r2 = 8
    317: R1_w=fp-8 R2_w=invP8 R3=mem_or_null(id=7,ref_obj_id=0,off=0,imm=0) R4=mem_or_null(id=8,ref_obj_id=0,off=0,imm=0) R5=invP(id=0) R6_w=mem_or_null(id=6,ref_obj_id=0,off=0,imm=0) R7_w=invP0 R8_w=mem_or_null(id=7,ref_obj_id=0,off=0,imm=0) R9_w=mem_or_null(id=5,ref_obj_id=0,off=0,imm=0) R10=fp0 fp-8_w=00000000 fp-16_w=mem_or_null
    317: (85) call bpf_probe_read#4
    318: R0_w=invP(id=0) R6_w=mem_or_null(id=6,ref_obj_id=0,off=0,imm=0) R7_w=invP0 R8_w=mem_or_null(id=7,ref_obj_id=0,off=0,imm=0) R9_w=mem_or_null(id=5,ref_obj_id=0,off=0,imm=0) R10=fp0 fp-8_w=mmmmmmmm fp-16_w=mem_or_null
    318: (18) r1 = 0x80000000
    320: R0_w=invP(id=0) R1_w=invP2147483648 R6_w=mem_or_null(id=6,ref_obj_id=0,off=0,imm=0) R7_w=invP0 R8_w=mem_or_null(id=7,ref_obj_id=0,off=0,imm=0) R9_w=mem_or_null(id=5,ref_obj_id=0,off=0,imm=0) R10=fp0 fp-8_w=mmmmmmmm fp-16_w=mem_or_null
    ; if (ret < 0) {
             320: (bf) r2 = r0
             321: R0_w=invP(id=9) R1_w=invP2147483648 R2_w=invP(id=9) R6_w=mem_or_null(id=6,ref_obj_id=0,off=0,imm=0) R7_w=invP0 R8_w=mem_or_null(id=7,ref_obj_id=0,off=0,imm=0) R9_w=mem_or_null(id=5,ref_obj_id=0,off=0,imm=0) R10=fp0 fp-8_w=mmmmmmmm fp-16_w=mem_or_null
             321: (5f) r2 &= r1
             322: R0_w=invP(id=9) R1_w=invP2147483648 R2_w=invP(id=0,umax_value=2147483648,var_off=(0x0; 0x80000000),s32_max_value=0) R6_w=mem_or_null(id=6,ref_obj_id=0,off=0,imm=0) R7_w=invP0 R8_w=mem_or_null(id=7,ref_obj_id=0,off=0,imm=0) R9_w=mem_or_null(id=5,ref_obj_id=0,off=0,imm=0) R10=fp0 fp-8_w=mmmmmmmm fp-16_w=mem_or_null
             ; if (ret < 0) {
		      322: (15) if r2 == 0x0 goto pc+12
		      R0_w=invP(id=9) R1_w=invP2147483648 R2_w=invP(id=0,umax_value=2147483648,var_off=(0x0; 0x80000000),s32_max_value=0) R6_w=mem_or_null(id=6,ref_obj_id=0,off=0,imm=0) R7_w=invP0 R8_w=mem_or_null(id=7,ref_obj_id=0,off=0,imm=0) R9_w=mem_or_null(id=5,ref_obj_id=0,off=0,imm=0) R10=fp0 fp-8_w=mmmmmmmm fp-16_w=mem_or_null
		      323: R0=invP(id=9) R1=invP2147483648 R2=invP(id=0,umax_value=2147483648,var_off=(0x0; 0x80000000),s32_max_value=0) R6=mem_or_null(id=6,ref_obj_id=0,off=0,imm=0) R7=invP0 R8=mem_or_null(id=7,ref_obj_id=0,off=0,imm=0) R9=mem_or_null(id=5,ref_obj_id=0,off=0,imm=0) R10=fp0 fp-8=mmmmmmmm fp-16=mem_or_null
		      323: (18) r7 = 0xffffffff
		      325: R0=invP(id=9) R1=invP2147483648 R2=invP(id=0,umax_value=2147483648,var_off=(0x0; 0x80000000),s32_max_value=0) R6=mem_or_null(id=6,ref_obj_id=0,off=0,imm=0) R7_w=invP4294967295 R8=mem_or_null(id=7,ref_obj_id=0,off=0,imm=0) R9=mem_or_null(id=5,ref_obj_id=0,off=0,imm=0) R10=fp0 fp-8=mmmmmmmm fp-16=mem_or_null
		      ; if (debug_event != NULL) {
			       325: (15) if r6 == 0x0 goto pc+39
			       R0=invP(id=9) R1=invP2147483648 R2=invP(id=0,umax_value=2147483648,var_off=(0x0; 0x80000000),s32_max_value=0) R6=mem(id=0,ref_obj_id=0,off=0,imm=0) R7_w=invP4294967295 R8=mem_or_null(id=7,ref_obj_id=0,off=0,imm=0) R9=mem_or_null(id=5,ref_obj_id=0,off=0,imm=0) R10=fp0 fp-8=mmmmmmmm fp-16=mem_or_null
			       326: R0=invP(id=9) R1=invP2147483648 R2=invP(id=0,umax_value=2147483648,var_off=(0x0; 0x80000000),s32_max_value=0) R6=mem(id=0,ref_obj_id=0,off=0,imm=0) R7_w=invP4294967295 R8=mem_or_null(id=7,ref_obj_id=0,off=0,imm=0) R9=mem_or_null(id=5,ref_obj_id=0,off=0,imm=0) R10=fp0 fp-8=mmmmmmmm fp-16=mem_or_null
			       326: (b7) r1 = 6
			       327: R0=invP(id=9) R1_w=invP6 R2=invP(id=0,umax_value=2147483648,var_off=(0x0; 0x80000000),s32_max_value=0) R6=mem(id=0,ref_obj_id=0,off=0,imm=0) R7_w=invP4294967295 R8=mem_or_null(id=7,ref_obj_id=0,off=0,imm=0) R9=mem_or_null(id=5,ref_obj_id=0,off=0,imm=0) R10=fp0 fp-8=mmmmmmmm fp-16=mem_or_null
			       ; debug_event->index = index;
			       327: (73) *(u8 *)(r6 +0) = r1
			       R0=invP(id=9) R1_w=invP6 R2=invP(id=0,umax_value=2147483648,var_off=(0x0; 0x80000000),s32_max_value=0) R6=mem(id=0,ref_obj_id=0,off=0,imm=0) R7_w=invP4294967295 R8=mem_or_null(id=7,ref_obj_id=0,off=0,imm=0) R9=mem_or_null(id=5,ref_obj_id=0,off=0,imm=0) R10=fp0 fp-8=mmmmmmmm fp-16=mem_or_null
			       328: R0=invP(id=9) R1_w=invP6 R2=invP(id=0,umax_value=2147483648,var_off=(0x0; 0x80000000),s32_max_value=0) R6=mem(id=0,ref_obj_id=0,off=0,imm=0) R7_w=invP4294967295 R8=mem_or_null(id=7,ref_obj_id=0,off=0,imm=0) R9=mem_or_null(id=5,ref_obj_id=0,off=0,imm=0) R10=fp0 fp-8=mmmmmmmm fp-16=mem_or_null
			       ; submit_debug_event(ctx, debug_event, 6, ret);
			       328: (67) r0 <<= 32
			       329: R0_w=invP(id=0,smax_value=9223372032559808512,umax_value=18446744069414584320,var_off=(0x0; 0xffffffff00000000),s32_min_value=0,s32_max_value=0,u32_max_value=0) R1_w=invP6 R2=invP(id=0,umax_value=2147483648,var_off=(0x0; 0x80000000),s32_max_value=0) R6=mem(id=0,ref_obj_id=0,off=0,imm=0) R7_w=invP4294967295 R8=mem_or_null(id=7,ref_obj_id=0,off=0,imm=0) R9=mem_or_null(id=5,ref_obj_id=0,off=0,imm=0) R10=fp0 fp-8=mmmmmmmm fp-16=mem_or_null
			       329: (c7) r0 s>>= 32
			       330: R0_w=invP(id=0,smin_value=-2147483648,smax_value=2147483647) R1_w=invP6 R2=invP(id=0,umax_value=2147483648,var_off=(0x0; 0x80000000),s32_max_value=0) R6=mem(id=0,ref_obj_id=0,off=0,imm=0) R7_w=invP4294967295 R8=mem_or_null(id=7,ref_obj_id=0,off=0,imm=0) R9=mem_or_null(id=5,ref_obj_id=0,off=0,imm=0) R10=fp0 fp-8=mmmmmmmm fp-16=mem_or_null
			       ; debug_event->val = ret;
			       330: (7b) *(u64 *)(r6 +16) = r0
			       R0_w=invP(id=0,smin_value=-2147483648,smax_value=2147483647) R1_w=invP6 R2=invP(id=0,umax_value=2147483648,var_off=(0x0; 0x80000000),s32_max_value=0) R6=mem(id=0,ref_obj_id=0,off=0,imm=0) R7_w=invP4294967295 R8=mem_or_null(id=7,ref_obj_id=0,off=0,imm=0) R9=mem_or_null(id=5,ref_obj_id=0,off=0,imm=0) R10=fp0 fp-8=mmmmmmmm fp-16=mem_or_null
			       331: R0_w=invP(id=0,smin_value=-2147483648,smax_value=2147483647) R1_w=invP6 R2=invP(id=0,umax_value=2147483648,var_off=(0x0; 0x80000000),s32_max_value=0) R6=mem(id=0,ref_obj_id=0,off=0,imm=0) R7_w=invP4294967295 R8=mem_or_null(id=7,ref_obj_id=0,off=0,imm=0) R9=mem_or_null(id=5,ref_obj_id=0,off=0,imm=0) R10=fp0 fp-8=mmmmmmmm fp-16=mem_or_null
			       331: (18) r7 = 0xffffffff
			       333: R0_w=invP(id=0,smin_value=-2147483648,smax_value=2147483647) R1_w=invP6 R2=invP(id=0,umax_value=2147483648,var_off=(0x0; 0x80000000),s32_max_value=0) R6=mem(id=0,ref_obj_id=0,off=0,imm=0) R7_w=invP4294967295 R8=mem_or_null(id=7,ref_obj_id=0,off=0,imm=0) R9=mem_or_null(id=5,ref_obj_id=0,off=0,imm=0) R10=fp0 fp-8=mmmmmmmm fp-16=mem_or_null
			       ; bpf_perf_event_output(ctx, &debug_events, BPF_F_CURRENT_CPU,
						       333: (bf) r1 = r9
						       334: R0_w=invP(id=0,smin_value=-2147483648,smax_value=2147483647) R1_w=mem_or_null(id=5,ref_obj_id=0,off=0,imm=0) R2=invP(id=0,umax_value=2147483648,var_off=(0x0; 0x80000000),s32_max_value=0) R6=mem(id=0,ref_obj_id=0,off=0,imm=0) R7_w=invP4294967295 R8=mem_or_null(id=7,ref_obj_id=0,off=0,imm=0) R9=mem_or_null(id=5,ref_obj_id=0,off=0,imm=0) R10=fp0 fp-8=mmmmmmmm fp-16=mem_or_null
						       334: (05) goto pc+23
						       358: R0=invP(id=0,smin_value=-2147483648,smax_value=2147483647) R1=mem_or_null(id=5,ref_obj_id=0,off=0,imm=0) R2=invP(id=0,umax_value=2147483648,var_off=(0x0; 0x80000000),s32_max_value=0) R6=mem(id=0,ref_obj_id=0,off=0,imm=0) R7=invP4294967295 R8=mem_or_null(id=7,ref_obj_id=0,off=0,imm=0) R9=mem_or_null(id=5,ref_obj_id=0,off=0,imm=0) R10=fp0 fp-8=mmmmmmmm fp-16=mem_or_null
						       ;
						       358: (18) r2 = 0xffffa0bfe94c1c00
						       360: R0=invP(id=0,smin_value=-2147483648,smax_value=2147483647) R1=mem_or_null(id=5,ref_obj_id=0,off=0,imm=0) R2_w=map_ptr(id=0,off=0,ks=4,vs=4,imm=0) R6=mem(id=0,ref_obj_id=0,off=0,imm=0) R7=invP4294967295 R8=mem_or_null(id=7,ref_obj_id=0,off=0,imm=0) R9=mem_or_null(id=5,ref_obj_id=0,off=0,imm=0) R10=fp0 fp-8=mmmmmmmm fp-16=mem_or_null
						       360: (18) r3 = 0xffffffff
						       362: R0=invP(id=0,smin_value=-2147483648,smax_value=2147483647) R1=mem_or_null(id=5,ref_obj_id=0,off=0,imm=0) R2_w=map_ptr(id=0,off=0,ks=4,vs=4,imm=0) R3_w=invP4294967295 R6=mem(id=0,ref_obj_id=0,off=0,imm=0) R7=invP4294967295 R8=mem_or_null(id=7,ref_obj_id=0,off=0,imm=0) R9=mem_or_null(id=5,ref_obj_id=0,off=0,imm=0) R10=fp0 fp-8=mmmmmmmm fp-16=mem_or_null
						       362: (bf) r4 = r6
						       363: R0=invP(id=0,smin_value=-2147483648,smax_value=2147483647) R1=mem_or_null(id=5,ref_obj_id=0,off=0,imm=0) R2_w=map_ptr(id=0,off=0,ks=4,vs=4,imm=0) R3_w=invP4294967295 R4_w=mem(id=0,ref_obj_id=0,off=0,imm=0) R6=mem(id=0,ref_obj_id=0,off=0,imm=0) R7=invP4294967295 R8=mem_or_null(id=7,ref_obj_id=0,off=0,imm=0) R9=mem_or_null(id=5,ref_obj_id=0,off=0,imm=0) R10=fp0 fp-8=mmmmmmmm fp-16=mem_or_null
						       363: (b7) r5 = 24
						       364: R0=invP(id=0,smin_value=-2147483648,smax_value=2147483647) R1=mem_or_null(id=5,ref_obj_id=0,off=0,imm=0) R2_w=map_ptr(id=0,off=0,ks=4,vs=4,imm=0) R3_w=invP4294967295 R4_w=mem(id=0,ref_obj_id=0,off=0,imm=0) R5_w=invP24 R6=mem(id=0,ref_obj_id=0,off=0,imm=0) R7=invP4294967295 R8=mem_or_null(id=7,ref_obj_id=0,off=0,imm=0) R9=mem_or_null(id=5,ref_obj_id=0,off=0,imm=0) R10=fp0 fp-8=mmmmmmmm fp-16=mem_or_null
						       364: (85) call bpf_perf_event_output#25
						       R1 type=mem_or_null expected=ctx
						       verification time 383 usec
						       stack depth 0+16
						       processed 29 insns (limit 1000000) max_states_per_insn 0 total_states 2 peak_states 2 mark_read 1

20 loading objects: map create: argument list too long
#

https://github.com/iovisor/bcc/issues/1471#issuecomment-349045613 “Argument list too long” is E2BIG error code. It does not really mean argument list too long. It may mean you have too many insns, too big hash table size, too big attr structures, etc. But it should not apply to default bcc tools.

strace -f 抓取 bpf 系统调用，结果返回 E2BIG （Argument list too long）

bpf(BPF_MAP_CREATE, {map_type=BPF_MAP_TYPE_PERCPU_HASH, key_size=8, value_size=4792, max_entries=10240, map_flags=0, inner_map_fd=0, map_name="execs", map_ifindex=0, ...}, 72 <unfinished ...>
<... bpf resumed> ) = -1 E2BIG (Argument list too long)

bpf$  17E_OPS1 [ 2023-07-26 22:21:10 ]
#strace -f  ./exec process --btf ./vmlinux-4.19.91-007.btf  &>strace.log

[pid 130459] nanosleep({tv_sec=0, tv_nsec=20000}, NULL) = 0
[pid 130459] nanosleep({tv_sec=0, tv_nsec=20000}, NULL) = 0
[pid 130487] bpf(BPF_BTF_LOAD, 0xc000612a90, 32 <unfinished ...>
[pid 130459] nanosleep({tv_sec=0, tv_nsec=20000},  <unfinished ...>
[pid 130487] <... bpf resumed> )        = 3
[pid 130487] bpf(BPF_MAP_CREATE, {map_type=BPF_MAP_TYPE_ARRAY, key_size=4, value_size=4, max_entries=5, map_flags=0, inner_map_fd=0, map_name="config_args", map_ifindex=0, ...}, 72) = 7
[pid 130459] <... nanosleep resumed> NULL) = 0
[pid 130487] close(3 <unfinished ...>
[pid 130459] nanosleep({tv_sec=0, tv_nsec=20000},  <unfinished ...>
[pid 130487] <... close resumed> )      = 0
[pid 130487] bpf(BPF_MAP_CREATE, {map_type=BPF_MAP_TYPE_PERCPU_ARRAY, key_size=4, value_size=4792, max_entries=1, map_flags=0, inner_map_fd=0, map_name="empty_event", map_ifindex=0}, 72 <unfinished ...>
[pid 130459] <... nanosleep resumed> NULL) = 0
[pid 130459] nanosleep({tv_sec=0, tv_nsec=20000},  <unfinished ...>
[pid 130487] <... bpf resumed> )        = 3
[pid 130459] <... nanosleep resumed> NULL) = 0
[pid 130459] nanosleep({tv_sec=0, tv_nsec=20000},  <unfinished ...>
[pid 130487] bpf(BPF_BTF_LOAD, 0xc000612a90, 32) = 8
[pid 130487] bpf(BPF_MAP_CREATE, {map_type=BPF_MAP_TYPE_PERCPU_HASH, key_size=8, value_size=4792, max_entries=10240, map_flags=0, inner_map_fd=0, map_name="execs", map_ifindex=0, ...}, 72 <unfinished ...>
[pid 130459] <... nanosleep resumed> NULL) = 0
[pid 130487] <... bpf resumed> )        = -1 E2BIG (Argument list too long)   #报错位置
[pid 130459] nanosleep({tv_sec=0, tv_nsec=20000},  <unfinished ...>
[pid 130487] close(8)                   = 0
[pid 130487] close(7)                   = 0
[pid 130459] <... nanosleep resumed> NULL) = 0
[pid 130487] close(3 <unfinished ...>
[pid 130459] nanosleep({tv_sec=0, tv_nsec=20000},  <unfinished ...>
[pid 130487] <... close resumed> )      = 0
[pid 130487] write(2, "2023/07/26 22:21:38 exec.go:66: "..., 1832023/07/26 22:21:38 exec.go:66: loading objects: field TracepointSyscallsSysEnterExecve: program tracepoint__syscalls__sys_enter_execve: map execs: map create: argument list too long
 <unfinished ...>
[pid 130459] <... nanosleep resumed> NULL) = 0

查看 bpf 系统调用定义，可能返回 -E2BIG 的地方如下：

// https://elixir.bootlin.com/linux/v4.19.91/source/kernel/bpf/syscall.c#L61

/*
 * If we're handed a bigger struct than we know of, ensure all the unknown bits
 * are 0 - i.e. new user-space does not rely on any kernel feature extensions
 * we don't know about yet.
 *
 * There is a ToCToU between this function call and the following
 * copy_from_user() call. However, this is not a concern since this function is
 * meant to be a future-proofing of bits.
 */
int bpf_check_uarg_tail_zero(void __user *uaddr,
			     size_t expected_size,
			     size_t actual_size)
{
	unsigned char __user *addr;
	unsigned char __user *end;
	unsigned char val;
	int err;

	// 这个不太可能，因为 const union bpf_attr 的大小不太可能超过一个 PAGE_SIZE
	if (unlikely(actual_size > PAGE_SIZE))	/* silly large */
		return -E2BIG;

	if (unlikely(!access_ok(VERIFY_READ, uaddr, actual_size)))
		return -EFAULT;

	if (actual_size <= expected_size)
		return 0;

	addr = uaddr + expected_size;
	end  = uaddr + actual_size;

	// 大概率是这个问题，即如果 expected_size 和 actual_size 之间的内容有非 0 的情况，
	// 返回 -E2BIG.
	for (; addr < end; addr++) {
		err = get_user(val, addr);
		if (err)
			return err;
		if (val)
			return -E2BIG;
	}

	return 0;
}

// https://elixir.bootlin.com/linux/v4.19.91/source/kernel/bpf/syscall.c#L2373
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
	union bpf_attr attr = {};
	int err;

	if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN))
		return -EPERM;

	err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
	if (err)
		return err;
	size = min_t(u32, size, sizeof(attr));

	/* copy attributes from user space, may be less than sizeof(bpf_attr) */
	if (copy_from_user(&attr, uattr, size) != 0)
		return -EFAULT;

	err = security_bpf(cmd, &attr, size);
	if (err < 0)
		return err;

	switch (cmd) {
	case BPF_MAP_CREATE:
		err = map_create(&attr);
		break;
...
}

4.19.91 和 5.6.1 内核关于 union bpf_attr 中 BPF_MAP_CREATE 的定义有差别：

5.6.1 多了一个字段 __u32 btf_vmlinux_value_type_id
man 2 bpf 也说明 bpf_attr 未使用的 fields 和 padding 都必须是 zeroed，然后才能调用 bpf syscall。

// https://elixir.bootlin.com/linux/v4.19.91/source/include/uapi/linux/bpf.h#L287
union bpf_attr {
	struct { /* anonymous struct used by BPF_MAP_CREATE command */
		__u32	map_type;	/* one of enum bpf_map_type */
		__u32	key_size;	/* size of key in bytes */
		__u32	value_size;	/* size of value in bytes */
		__u32	max_entries;	/* max number of entries in a map */
		__u32	map_flags;	/* BPF_MAP_CREATE related
					 * flags defined above.
					 */
		__u32	inner_map_fd;	/* fd pointing to the inner map */
		__u32	numa_node;	/* numa node (effective only if
					 * BPF_F_NUMA_NODE is set).
					 */
		char	map_name[BPF_OBJ_NAME_LEN];
		__u32	map_ifindex;	/* ifindex of netdev to create on */
		__u32	btf_fd;		/* fd pointing to a BTF type data */
		__u32	btf_key_type_id;	/* BTF type_id of the key */
		__u32	btf_value_type_id;	/* BTF type_id of the value */
	};
	...
} __attribute__((aligned(8)));

// https://elixir.bootlin.com/linux/v5.6.1/source/include/uapi/linux/bpf.h#L394
union bpf_attr {
	struct { /* anonymous struct used by BPF_MAP_CREATE command */
		__u32	map_type;	/* one of enum bpf_map_type */
		__u32	key_size;	/* size of key in bytes */
		__u32	value_size;	/* size of value in bytes */
		__u32	max_entries;	/* max number of entries in a map */
		__u32	map_flags;	/* BPF_MAP_CREATE related
					 * flags defined above.
					 */
		__u32	inner_map_fd;	/* fd pointing to the inner map */
		__u32	numa_node;	/* numa node (effective only if
					 * BPF_F_NUMA_NODE is set).
					 */
		char	map_name[BPF_OBJ_NAME_LEN];
		__u32	map_ifindex;	/* ifindex of netdev to create on */
		__u32	btf_fd;		/* fd pointing to a BTF type data */
		__u32	btf_key_type_id;	/* BTF type_id of the key */
		__u32	btf_value_type_id;	/* BTF type_id of the value */
		__u32	btf_vmlinux_value_type_id;/* BTF type_id of a kernel-
						   * struct stored as the
						   * map value
						   */
	};

而 cilium/ebpf 的 MapCreateAttr 是根据 5.6.1 来定义的：

这就导致 go 创建的 map create attr 比 4.19 内核的多两个字段的内容，如果这两个字段内容不为 0，则前面的 bpf_check_uarg_tail_zero 失败，返回 -E2BIG.

// https://github.com/cilium/ebpf/blob/02200f5764662c8fd3779172de216e5b93b3fefc/internal/sys/types.go#L728
type MapCreateAttr struct {
	MapType               MapType 4
	KeySize               uint32
	ValueSize             uint32
	MaxEntries            uint32
	MapFlags              MapFlags 4
	InnerMapFd            uint32
	NumaNode              uint32
	MapName               ObjName 2
	MapIfindex            uint32
	BtfFd                 uint32
	BtfKeyTypeId          TypeID 4
	BtfValueTypeId        TypeID
	BtfVmlinuxValueTypeId TypeID
	MapExtra              uint64
}

cilium/ebpf 中的注解（见下面 createMap 的 switch 中的注释）：

E2BIG 表示 MapCreateAttr 中超出当前内核已知结构体末尾的部分含有非零字节，所以 kernel 太老而不支持这个 map type。

// https://github.com/cilium/ebpf/blob/b4b25b38e907c6d0963ecbf6401228be3f8fdaf3/features/map.go#L79C9-L79C10
func createMap(attr *sys.MapCreateAttr) error {
	fd, err := sys.MapCreate(attr)
	if err == nil {
		fd.Close()
		return nil
	}

	switch {
	// EINVAL occurs when attempting to create a map with an unknown type.
	// E2BIG occurs when MapCreateAttr contains non-zero bytes past the end
	// of the struct known by the running kernel, meaning the kernel is too old
	// to support the given map type.
	case errors.Is(err, unix.EINVAL), errors.Is(err, unix.E2BIG):
		return ebpf.ErrNotSupported
	}

	return err
}

最终的解决办法：

将 BPF_MAP_TYPE_PERCPU_HASH 修改为 BPF_MAP_TYPE_HASH。
或者，减少 FULL_MAX_ARGS_ARR 和 FULL_MAX_ENVS_ARR 的值。

// 17E 18*128 BAD
// 17E OK
#define TOTAL_MAX_ARGS 16
#define ARGSIZE 120


// bpfEvent is the Go-side layout of one exec event. The field sizes,
// including the two explicit 4-byte padding fields, add up to 4024 bytes.
// NOTE(review): this equals the value_size=4024 shown in the strace output,
// so it is presumably the value type of the "execs" map; confirm.
type bpfEvent struct {
	Pid       int32
	Ppid      int32
	Tgid      int32
	Uid       uint32
	CgroupId  int64
	Type      int64
	Retval    int64
	ArgsCount int64
	ArgsSize  uint32
	_         [4]byte // explicit padding so that EnvsCount starts at an 8-byte offset
	EnvsCount int64
	EnvsSize  uint32
	Comm      [16]uint8
	Filename  [96]uint8
	Args      [1920]uint8 // presumably TOTAL_MAX_ARGS * ARGSIZE = 16 * 120; confirm
	Envs      [1920]uint8
	_         [4]byte // explicit trailing padding; brings the total to 4024 bytes
}


# strace -f   ./exec process --btf ./vmlinux-4.19.91-007.btf  |& grep map_type=BPF_MAP_TYPE_PERCPU_HASH
[pid 28142] write(2, "2023/07/26 22:38:18 list.go:49: "..., 1832023/07/26 22:38:18 list.go:49: PID: 28047, Name: grep, Cmdline: grep --color=auto map_type=BPF_MAP_TYPE_PERCPU_HASH, Cgroup : /system.slice/sshd.service, Namespace: pid:[4026531836]


[pid 33870] bpf(BPF_MAP_CREATE, {map_type=BPF_MAP_TYPE_PERCPU_HASH, key_size=8, value_size=4024, max_entries=10240, map_flags=0, inner_map_fd=0, map_name="execs", map_ifindex=0, ...}, 72 <unfinished ...>
^C

21 kernel perf event buff full, losted sample count: xx 和 val: -14 报错：
#

2023/07/26 22:52:06 exec.go:178: DEBUG: index: 8, pid: 108109, ppid: 108089, tgid: 108109, type: envs, val: -14
2023/07/26 22:52:06 exec.go:178: DEBUG: index: 8, pid: 108109, ppid: 108089, tgid: 108109, type: envs, val: -14
2023/07/26 22:52:06 exec.go:156: EVENT: pid: 108109, ppid: 108089, tgid: 108109, type: started, cgroup: 4294967297, ret: 0, comm: xargs, filename: /usr/local/sbin/grep, argCount: 2, argSize: 0, envCount: 10,
2023/07/26 22:52:06 exec.go:178: DEBUG: index: 8, pid: 108155, ppid: 108138, tgid: 108155, type: envs, val: -14
2023/07/26 22:52:17 exec.go:152: kernel perf event buff full, losted sample count: 64
2023/07/26 22:52:17 exec.go:178: DEBUG: index: 8, pid: 121260, ppid: 37259, tgid: 121260, type: envs, val: -14
2023/07/26 22:52:17 exec.go:178: DEBUG: index: 8, pid: 121087, ppid: 121079, tgid: 121087, type: envs, val: -14
2023/07/26 22:52:17 exec.go:178: DEBUG: index: 8, pid: 121234, ppid: 121233, tgid: 121234, type: envs, val: -14
2023/07/26 22:52:17 exec.go:178: DEBUG: index: 8, pid: 121234, ppid: 121233, tgid: 121234, type: envs, val: -14
2023/07/26 22:52:17 exec.go:178: DEBUG: index: 8, pid: 121234, ppid: 121233, tgid: 121234, type: envs, val: -14
2023/07/26 22:52:17 exec.go:178: DEBUG: index: 8, pid: 121234, ppid: 121233, tgid: 121234, type: envs, val: -14
2023/07/26 22:52:17 exec.go:178: DEBUG: index: 8, pid: 121234, ppid: 121233, tgid: 121234, type: envs, val: -14
2023/07/26 22:52:17 exec.go:178: DEBUG: index: 8, pid: 121234, ppid: 121233, tgid: 121234, type: envs, val: -14
2023/07/26 22:52:17 exec.go:156: EVENT: pid: 0, ppid: 0, tgid: 0, type: started, cgroup: 4294967297, ret: 0, comm: awk, filename: , argCount: 0, argSize: 0, envCount: 0, envSize: 0, args: , envs:
2023/07/26 22:52:17 exec.go:178: DEBUG: index: 8, pid: 120082, ppid: 37259, tgid: 120082, type: envs, val: -14
2023/07/26 22:52:17 exec.go:178: DEBUG: index: 8, pid: 119011, ppid: 119009, tgid: 119011, type: envs, val: -14
2023/07/26 22:52:17 exec.go:178: DEBUG: index: 8, pid: 119011, ppid: 119009, tgid: 119011, type: envs, val: -14
2023/07/26 22:52:17 exec.go:178: DEBUG: index: 8, pid: 119011, ppid: 119009, tgid: 119011, type: envs, val: -14
2023/07/26 22:52:17 exec.go:178: DEBUG: index: 8, pid: 119011, ppid: 119009, tgid: 119011, type: envs, val: -14

需要进一步调大 perf buffer 的大小（从 4 个 page 调大到 20 个 page）：

perfReader, err := perf.NewReader(objs.bpfMaps.Events, 20*os.Getpagesize())

22 eBPF 不支持获得 task 的 cwd
#

To get the full path from the task_struct, you need to call the kernel function d_path (https://archive.kernel.org/oldlinux/htmldocs/filesystems/API-d-path.html). Pass task->mm->exe_file->f_path as the first parameter: char *d_path(f_path, buf, buflen). There is a bpf_d_path function in libbpf, but for some reason it does not work for my type of tracepoint program.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_core_read.h>

char LICENSE[] SEC("license") = "GPL";

SEC("tp/syscalls/sys_enter_execve")
int handle_tp(void *ctx)
{
 struct task_struct *task;
 task = (struct task_struct*)bpf_get_current_task();
 struct path p;
 p = BPF_CORE_READ(task, mm, exe_file, f_path);
 char path_exe[256];
 bpf_d_path(&p, path_exe, sizeof(path_exe));
 bpf_printk("TASK: %s", path_exe);

 return 0;
}

This solution is not feasible on even higher versions where bpf_d_path() is supported, because bpf_d_path() is not allowed from tracepoints or raw tracepoints yet! Check the allowed syscalls

bpf_d_path 只能在 BPF_PROG_TYPE_TRACING 中使用，LSM 需要看情况：

// https://github.com/torvalds/linux/blob/a92b7d26c743b9dc06d520f863d624e94978a1d9/kernel/trace/bpf_trace.c#L930
BTF_SET_START(btf_allowlist_d_path)
#ifdef CONFIG_SECURITY
BTF_ID(func, security_file_permission)
BTF_ID(func, security_inode_getattr)
BTF_ID(func, security_file_open)
#endif
#ifdef CONFIG_SECURITY_PATH
BTF_ID(func, security_path_truncate)
#endif
BTF_ID(func, vfs_truncate)
BTF_ID(func, vfs_fallocate)
BTF_ID(func, dentry_open)
BTF_ID(func, vfs_getattr)
BTF_ID(func, filp_close)
BTF_SET_END(btf_allowlist_d_path)

static bool bpf_d_path_allowed(const struct bpf_prog *prog)
{
	if (prog->type == BPF_PROG_TYPE_TRACING &&
	    prog->expected_attach_type == BPF_TRACE_ITER)
		return true;

	if (prog->type == BPF_PROG_TYPE_LSM)
		return bpf_lsm_is_sleepable_hook(prog->aux->attach_btf_id);

	return btf_id_set_contains(&btf_allowlist_d_path,
				   prog->aux->attach_btf_id);
}