跳过正文

eBPF libbpf 库解析

·11325 字
Ebpf
目录

1 bpf_helpers.h 宏定义
#

该头文件定义了各种宏:

  • __uint/__type/__array : 定义 map 时常用的宏,这些宏会把 map 的类型、key/value 类型等信息记录到 BTF 中,后续可以用 bpftool map dump 来格式化查看 map 数据。
/*
 * BTF-style map declaration: an array map named config_args with u32 keys,
 * u32 values and room for 5 entries, emitted into the ".maps" ELF section.
 */
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__type(key, u32);
	__type(value, u32);
	__uint(max_entries, 5);
} config_args SEC(".maps");

在定义 C eBPF 程序的 struct 时, 经常使用 s8/u8/__s8/__u8/uid_t/pid_t 等类型别名(typedef), 它们都是在内核头文件中定义的,可以通过 #include "vmlinux.h" 来使用:

// vmlinux-4.19.91-007.h

/* Integer aliases with the double-underscore prefix, built on the basic C types. */
typedef signed char __s8;
typedef unsigned char __u8;
typedef short int __s16;
typedef short unsigned int __u16;
typedef int __s32;
typedef unsigned int __u32;
typedef long long int __s64;
typedef long long unsigned int __u64;

/* Short aliases (s8 ... u64) that simply rename the prefixed types above. */
typedef __s8 s8;
typedef __u8 u8;
typedef __s16 s16;
typedef __u16 u16;
typedef __s32 s32;
typedef __u32 u32;
typedef __s64 s64;
typedef __u64 u64;

/* Boolean constants provided as members of an anonymous enum. */
enum {
	false = 0,
	true = 1,
};

/* bool is an alias for the built-in _Bool type. */
typedef _Bool bool;
  • SEC : 定义 eBPF 插桩类型和名称.
  • __always_inline : 内联函数.
  • __noinline : 非内联函数.
  • 使用场景:
    • 低版本内核 eBPF 不支持调用函数, 如果要定义公共函数, 则解决方案是使用 static __always_inline 来修饰;
    • 从 4.16 内核和 LLVM 6.0 开始, eBPF 支持直接调用自定义函数, 没必要使用 static __always_inline 了.
// 4.16 前内核, 自定义函数需要使用 __always_inline
static __always_inline void my_function(void *ctx, int val)

// 4.16 及以后内核,可以 eBPF 函数间可以直接调用
static __noinline int get_opcode(struct bpf_raw_tracepoint_args *ctx) {
// 或者:
static __attribute((noinline)) int get_opcode(struct bpf_raw_tracepoint_args *ctx) {
	return ctx->args[1];
}

SEC("raw_tp")
int hello(struct bpf_raw_tracepoint_args *ctx) {
	/* Obtain the opcode through get_opcode(), then log it. */
	int syscall_nr = get_opcode(ctx);

	bpf_printk("Syscall: %d", syscall_nr);
	return 0;
}

__weak :

__hidden :

// <bpf/bpf_helpers.h>

/* Avoid 'linux/stddef.h' definition of '__always_inline'. */
#undef __always_inline
#define __always_inline inline __attribute__((always_inline))

/* __noinline and __weak are only defined here when no earlier definition exists. */
#ifndef __noinline
#define __noinline __attribute__((noinline))
#endif
#ifndef __weak
#define __weak __attribute__((weak))
#endif

/*
 * Use __hidden attribute to mark a non-static BPF subprogram effectively
 * static for BPF verifier's verification algorithm purposes, allowing more
 * extensive and permissive BPF verification process, taking into account
 * subprogram's caller context.
 */
#define __hidden __attribute__((visibility("hidden")))

1.1 SEC 宏
#

SEC() 是在 bpf_helpers.h 中定义的宏.

  • 不需要 auto Attach,不需要指定 handler_id;
    • socket/sk_reuseport;
    • tc/classifier/action;
    • xdp/cgroup 等;
  • 需要 auto Attach,指定了对应的 handler_id;
    • k[ret]probe/u[ret]probe/k[ret]syscall/usdt:attach_kprobe/uprobe/ksyscall
    • tracepoint+/raw_tracepoint: attach_tp/attach_raw_tp
    • tp_btf/fentry/fexit/fmod_ret/freplace: attach_trace, 依赖 BTF 信息。

cilium 的 elf_reader.go 中的 func getProgType(sectionName string) (ProgramType, AttachType, uint32, string) 内部也包含一个类似的 SEC Name 前缀表,返回的最后一个 string 为去掉前缀后的 attachTo 字符串。

  • 同一种 Prog Type,通过 Attach Type 来区分;
// https://github.com/libbpf/libbpf/blob/a6d7530cb7dff87ac1e64a540e63b67ddde2e0f9/src/libbpf.c#L35
/*
 * Element type of the section_defs[] table: groups a section name string with
 * a program type, an expected attach type and three optional callbacks.
 * NOTE(review): the SEC_DEF macro that fills these fields is not visible here,
 * so the per-field notes below are inferred from the field names — confirm
 * against libbpf.
 */
struct bpf_sec_def {
	/* section name string */
	char *sec;
	/* BPF program type associated with the section */
	enum bpf_prog_type prog_type;
	/* attach type expected for programs in the section */
	enum bpf_attach_type expected_attach_type;
	/* opaque long value; presumably carries the SEC_* flags — TODO confirm */
	long cookie;
	/* integer handler identifier; how it is assigned is not visible here */
	int handler_id;

	/* callbacks, by name: program setup, pre-load preparation, attach */
	libbpf_prog_setup_fn_t prog_setup_fn;
	libbpf_prog_prepare_load_fn_t prog_prepare_load_fn;
	libbpf_prog_attach_fn_t prog_attach_fn;
};

// https://github.com/libbpf/libbpf/blob/a6d7530cb7dff87ac1e64a540e63b67ddde2e0f9/src/libbpf.c#L8472
static const struct bpf_sec_def section_defs[] = {
	// 不需要 attach,所以不需要指定 handler_id
	SEC_DEF("socket",		SOCKET_FILTER, 0, SEC_NONE),
	SEC_DEF("sk_reuseport/migrate",	SK_REUSEPORT, BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, SEC_ATTACHABLE),
	SEC_DEF("sk_reuseport",		SK_REUSEPORT, BPF_SK_REUSEPORT_SELECT, SEC_ATTACHABLE)

    // 下面需要 auto attach 到 kprobe 或 uprobe
	SEC_DEF("kprobe+",		KPROBE,	0, SEC_NONE, attach_kprobe),
	SEC_DEF("uprobe+",		KPROBE,	0, SEC_NONE, attach_uprobe),
	SEC_DEF("uprobe.s+",		KPROBE,	0, SEC_SLEEPABLE, attach_uprobe),
	SEC_DEF("kretprobe+",		KPROBE, 0, SEC_NONE, attach_kprobe),
	SEC_DEF("uretprobe+",		KPROBE, 0, SEC_NONE, attach_uprobe),
	SEC_DEF("uretprobe.s+",		KPROBE, 0, SEC_SLEEPABLE, attach_uprobe),
	SEC_DEF("kprobe.multi+",	KPROBE,	BPF_TRACE_KPROBE_MULTI, SEC_NONE, attach_kprobe_multi),
	SEC_DEF("kretprobe.multi+",	KPROBE,	BPF_TRACE_KPROBE_MULTI, SEC_NONE, attach_kprobe_multi),
	SEC_DEF("ksyscall+",		KPROBE,	0, SEC_NONE, attach_ksyscall),
	SEC_DEF("kretsyscall+",		KPROBE, 0, SEC_NONE, attach_ksyscall),
	SEC_DEF("usdt+",		KPROBE,	0, SEC_NONE, attach_usdt),

	// 网络:
	SEC_DEF("tc",			SCHED_CLS, 0, SEC_NONE), // 不需要 attach
	SEC_DEF("classifier",		SCHED_CLS, 0, SEC_NONE),
	SEC_DEF("action",		SCHED_ACT, 0, SEC_NONE),

    // 下面需要 auto attach 到 tracepoint
	SEC_DEF("tracepoint+",		TRACEPOINT, 0, SEC_NONE, attach_tp), // 需要 attach
	SEC_DEF("tp+",			TRACEPOINT, 0, SEC_NONE, attach_tp),
	SEC_DEF("raw_tracepoint+",	RAW_TRACEPOINT, 0, SEC_NONE, attach_raw_tp),
	SEC_DEF("raw_tp+",		RAW_TRACEPOINT, 0, SEC_NONE, attach_raw_tp),
	SEC_DEF("raw_tracepoint.w+",	RAW_TRACEPOINT_WRITABLE, 0, SEC_NONE, attach_raw_tp),
	SEC_DEF("raw_tp.w+",		RAW_TRACEPOINT_WRITABLE, 0, SEC_NONE, attach_raw_tp),

    // 下面需要 auto attach 到 tracing 而非 tracepoint,都是依赖于 BTF 信息的,使用 bpf link 来 attach
	// Program Type 都是 TRACING,但是 Attach Type 各不相同。
	SEC_DEF("tp_btf+",		TRACING, BPF_TRACE_RAW_TP, SEC_ATTACH_BTF, attach_trace),
	SEC_DEF("fentry+",		TRACING, BPF_TRACE_FENTRY, SEC_ATTACH_BTF, attach_trace),
	SEC_DEF("fmod_ret+",		TRACING, BPF_MODIFY_RETURN, SEC_ATTACH_BTF, attach_trace),
	SEC_DEF("fexit+",		TRACING, BPF_TRACE_FEXIT, SEC_ATTACH_BTF, attach_trace),
	SEC_DEF("fentry.s+",		TRACING, BPF_TRACE_FENTRY, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace),
	SEC_DEF("fmod_ret.s+",		TRACING, BPF_MODIFY_RETURN, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace),
	SEC_DEF("fexit.s+",		TRACING, BPF_TRACE_FEXIT, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace),
	SEC_DEF("freplace+",		EXT, 0, SEC_ATTACH_BTF, attach_trace),

    // 下面需要 auto attach 到 lsm,也依赖 BTF,有对应的 Attach Type。
	SEC_DEF("lsm+",			LSM, BPF_LSM_MAC, SEC_ATTACH_BTF, attach_lsm),
	SEC_DEF("lsm.s+",		LSM, BPF_LSM_MAC, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_lsm),
	SEC_DEF("lsm_cgroup+",		LSM, BPF_LSM_CGROUP, SEC_ATTACH_BTF),
	SEC_DEF("iter+",		TRACING, BPF_TRACE_ITER, SEC_ATTACH_BTF, attach_iter),
	SEC_DEF("iter.s+",		TRACING, BPF_TRACE_ITER, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_iter),

	SEC_DEF("syscall",		SYSCALL, 0, SEC_SLEEPABLE),

    // 网络:下面这些都是网络相关的,不需要 auto attach
	SEC_DEF("xdp.frags/devmap",	XDP, BPF_XDP_DEVMAP, SEC_XDP_FRAGS),
	SEC_DEF("xdp/devmap",		XDP, BPF_XDP_DEVMAP, SEC_ATTACHABLE),
	SEC_DEF("xdp.frags/cpumap",	XDP, BPF_XDP_CPUMAP, SEC_XDP_FRAGS),
	SEC_DEF("xdp/cpumap",		XDP, BPF_XDP_CPUMAP, SEC_ATTACHABLE),
	SEC_DEF("xdp.frags",		XDP, BPF_XDP, SEC_XDP_FRAGS),
	SEC_DEF("xdp",			XDP, BPF_XDP, SEC_ATTACHABLE_OPT),

    // 用户空间进程序在 attach 前,需要使用 perf_event_open 的 att 来指定采样频率和周期.
    // https://github.com/libbpf/libbpf-bootstrap/blob/master/examples/c/profile.c#L194
	SEC_DEF("perf_event",		PERF_EVENT, 0, SEC_NONE),

	// 网络:轻量级隧道
	SEC_DEF("lwt_in",		LWT_IN, 0, SEC_NONE),
	SEC_DEF("lwt_out",		LWT_OUT, 0, SEC_NONE),
	SEC_DEF("lwt_xmit",		LWT_XMIT, 0, SEC_NONE),
	SEC_DEF("lwt_seg6local",	LWT_SEG6LOCAL, 0, SEC_NONE),

	SEC_DEF("sockops",		SOCK_OPS, BPF_CGROUP_SOCK_OPS, SEC_ATTACHABLE_OPT),
	SEC_DEF("sk_skb/stream_parser",	SK_SKB, BPF_SK_SKB_STREAM_PARSER, SEC_ATTACHABLE_OPT),
	SEC_DEF("sk_skb/stream_verdict",SK_SKB, BPF_SK_SKB_STREAM_VERDICT, SEC_ATTACHABLE_OPT),
	SEC_DEF("sk_skb",		SK_SKB, 0, SEC_NONE),
	SEC_DEF("sk_msg",		SK_MSG, BPF_SK_MSG_VERDICT, SEC_ATTACHABLE_OPT),
	SEC_DEF("lirc_mode2",		LIRC_MODE2, BPF_LIRC_MODE2, SEC_ATTACHABLE_OPT),
	SEC_DEF("flow_dissector",	FLOW_DISSECTOR, BPF_FLOW_DISSECTOR, SEC_ATTACHABLE_OPT),
	SEC_DEF("cgroup_skb/ingress",	CGROUP_SKB, BPF_CGROUP_INET_INGRESS, SEC_ATTACHABLE_OPT),
	SEC_DEF("cgroup_skb/egress",	CGROUP_SKB, BPF_CGROUP_INET_EGRESS, SEC_ATTACHABLE_OPT),
	SEC_DEF("cgroup/skb",		CGROUP_SKB, 0, SEC_NONE),
	SEC_DEF("cgroup/sock_create",	CGROUP_SOCK, BPF_CGROUP_INET_SOCK_CREATE, SEC_ATTACHABLE),
	SEC_DEF("cgroup/sock_release",	CGROUP_SOCK, BPF_CGROUP_INET_SOCK_RELEASE, SEC_ATTACHABLE),
	SEC_DEF("cgroup/sock",		CGROUP_SOCK, BPF_CGROUP_INET_SOCK_CREATE, SEC_ATTACHABLE_OPT),
	SEC_DEF("cgroup/post_bind4",	CGROUP_SOCK, BPF_CGROUP_INET4_POST_BIND, SEC_ATTACHABLE),
	SEC_DEF("cgroup/post_bind6",	CGROUP_SOCK, BPF_CGROUP_INET6_POST_BIND, SEC_ATTACHABLE),
	SEC_DEF("cgroup/bind4",		CGROUP_SOCK_ADDR, BPF_CGROUP_INET4_BIND, SEC_ATTACHABLE),
	SEC_DEF("cgroup/bind6",		CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_BIND, SEC_ATTACHABLE),
	SEC_DEF("cgroup/connect4",	CGROUP_SOCK_ADDR, BPF_CGROUP_INET4_CONNECT, SEC_ATTACHABLE),
	SEC_DEF("cgroup/connect6",	CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_CONNECT, SEC_ATTACHABLE),
	SEC_DEF("cgroup/sendmsg4",	CGROUP_SOCK_ADDR, BPF_CGROUP_UDP4_SENDMSG, SEC_ATTACHABLE),
	SEC_DEF("cgroup/sendmsg6",	CGROUP_SOCK_ADDR, BPF_CGROUP_UDP6_SENDMSG, SEC_ATTACHABLE),
	SEC_DEF("cgroup/recvmsg4",	CGROUP_SOCK_ADDR, BPF_CGROUP_UDP4_RECVMSG, SEC_ATTACHABLE),
	SEC_DEF("cgroup/recvmsg6",	CGROUP_SOCK_ADDR, BPF_CGROUP_UDP6_RECVMSG, SEC_ATTACHABLE),
	SEC_DEF("cgroup/getpeername4",	CGROUP_SOCK_ADDR, BPF_CGROUP_INET4_GETPEERNAME, SEC_ATTACHABLE),
	SEC_DEF("cgroup/getpeername6",	CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_GETPEERNAME, SEC_ATTACHABLE),
	SEC_DEF("cgroup/getsockname4",	CGROUP_SOCK_ADDR, BPF_CGROUP_INET4_GETSOCKNAME, SEC_ATTACHABLE),
	SEC_DEF("cgroup/getsockname6",	CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_GETSOCKNAME, SEC_ATTACHABLE),
	SEC_DEF("cgroup/sysctl",	CGROUP_SYSCTL, BPF_CGROUP_SYSCTL, SEC_ATTACHABLE),
	SEC_DEF("cgroup/getsockopt",	CGROUP_SOCKOPT, BPF_CGROUP_GETSOCKOPT, SEC_ATTACHABLE),
	SEC_DEF("cgroup/setsockopt",	CGROUP_SOCKOPT, BPF_CGROUP_SETSOCKOPT, SEC_ATTACHABLE),
	SEC_DEF("cgroup/dev",		CGROUP_DEVICE, BPF_CGROUP_DEVICE, SEC_ATTACHABLE_OPT),

	SEC_DEF("struct_ops+",		STRUCT_OPS, 0, SEC_NONE),
	SEC_DEF("sk_lookup",		SK_LOOKUP, BPF_SK_LOOKUP, SEC_ATTACHABLE),
};

1.2 libbpf 支持的 Section
#

参考:Program Types and ELF Sections

The table below lists the program types, their attach types where relevant and the ELF section names supported by libbpf for them. The ELF section names follow these rules:

  • type is an exact match, e.g. SEC(“socket”)
  • type+ means it can be either exact SEC("type") or well-formed SEC("type/extras") with a ‘/’ separator between type and extras.

When extras are specified, they provide details of how to auto-attach the BPF program. The format of extras depends on the program type, e.g. SEC("tracepoint/<category>/<name>") for tracepoints or SEC("usdt/<path>:<provider>:<name>") for USDT probes. The extras are described in more detail in the footnotes.

https://raw.githubusercontent.com/libbpf/libbpf/master/docs/program_types.rst

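| Program Type | Attach Type | ELF Section Name | Sleepable |
|---|---|---|---|
| ``BPF_PROG_TYPE_CGROUP_DEVICE`` | ``BPF_CGROUP_DEVICE`` | ``cgroup/dev`` | |
| ``BPF_PROG_TYPE_CGROUP_SKB`` | | ``cgroup/skb`` | |
| ``BPF_PROG_TYPE_CGROUP_SKB`` | ``BPF_CGROUP_INET_EGRESS`` | ``cgroup_skb/egress`` | |
| ``BPF_PROG_TYPE_CGROUP_SKB`` | ``BPF_CGROUP_INET_INGRESS`` | ``cgroup_skb/ingress`` | |
| ``BPF_PROG_TYPE_CGROUP_SOCKOPT`` | ``BPF_CGROUP_GETSOCKOPT`` | ``cgroup/getsockopt`` | |
| ``BPF_PROG_TYPE_CGROUP_SOCKOPT`` | ``BPF_CGROUP_SETSOCKOPT`` | ``cgroup/setsockopt`` | |
| ``BPF_PROG_TYPE_CGROUP_SOCK_ADDR`` | ``BPF_CGROUP_INET4_BIND`` | ``cgroup/bind4`` | |
| ``BPF_PROG_TYPE_CGROUP_SOCK_ADDR`` | ``BPF_CGROUP_INET4_CONNECT`` | ``cgroup/connect4`` | |
| ``BPF_PROG_TYPE_CGROUP_SOCK_ADDR`` | ``BPF_CGROUP_INET4_GETPEERNAME`` | ``cgroup/getpeername4`` | |
| ``BPF_PROG_TYPE_CGROUP_SOCK_ADDR`` | ``BPF_CGROUP_INET4_GETSOCKNAME`` | ``cgroup/getsockname4`` | |
| ``BPF_PROG_TYPE_CGROUP_SOCK_ADDR`` | ``BPF_CGROUP_INET6_BIND`` | ``cgroup/bind6`` | |
| ``BPF_PROG_TYPE_CGROUP_SOCK_ADDR`` | ``BPF_CGROUP_INET6_CONNECT`` | ``cgroup/connect6`` | |
| ``BPF_PROG_TYPE_CGROUP_SOCK_ADDR`` | ``BPF_CGROUP_INET6_GETPEERNAME`` | ``cgroup/getpeername6`` | |
| ``BPF_PROG_TYPE_CGROUP_SOCK_ADDR`` | ``BPF_CGROUP_INET6_GETSOCKNAME`` | ``cgroup/getsockname6`` | |
| ``BPF_PROG_TYPE_CGROUP_SOCK_ADDR`` | ``BPF_CGROUP_UDP4_RECVMSG`` | ``cgroup/recvmsg4`` | |
| ``BPF_PROG_TYPE_CGROUP_SOCK_ADDR`` | ``BPF_CGROUP_UDP4_SENDMSG`` | ``cgroup/sendmsg4`` | |
| ``BPF_PROG_TYPE_CGROUP_SOCK_ADDR`` | ``BPF_CGROUP_UDP6_RECVMSG`` | ``cgroup/recvmsg6`` | |
| ``BPF_PROG_TYPE_CGROUP_SOCK_ADDR`` | ``BPF_CGROUP_UDP6_SENDMSG`` | ``cgroup/sendmsg6`` | |
| ``BPF_PROG_TYPE_CGROUP_SOCK`` | ``BPF_CGROUP_INET4_POST_BIND`` | ``cgroup/post_bind4`` | |
| ``BPF_PROG_TYPE_CGROUP_SOCK`` | ``BPF_CGROUP_INET6_POST_BIND`` | ``cgroup/post_bind6`` | |
| ``BPF_PROG_TYPE_CGROUP_SOCK`` | ``BPF_CGROUP_INET_SOCK_CREATE`` | ``cgroup/sock_create`` | |
| ``BPF_PROG_TYPE_CGROUP_SOCK`` | ``BPF_CGROUP_INET_SOCK_CREATE`` | ``cgroup/sock`` | |
| ``BPF_PROG_TYPE_CGROUP_SOCK`` | ``BPF_CGROUP_INET_SOCK_RELEASE`` | ``cgroup/sock_release`` | |
| ``BPF_PROG_TYPE_CGROUP_SYSCTL`` | ``BPF_CGROUP_SYSCTL`` | ``cgroup/sysctl`` | |
| ``BPF_PROG_TYPE_EXT`` | | ``freplace+`` [#fentry]_ | |
| ``BPF_PROG_TYPE_FLOW_DISSECTOR`` | ``BPF_FLOW_DISSECTOR`` | ``flow_dissector`` | |
| ``BPF_PROG_TYPE_KPROBE`` | | ``kprobe+`` [#kprobe]_ | |
| ``BPF_PROG_TYPE_KPROBE`` | | ``kretprobe+`` [#kprobe]_ | |
| ``BPF_PROG_TYPE_KPROBE`` | | ``ksyscall+`` [#ksyscall]_ | |
| ``BPF_PROG_TYPE_KPROBE`` | | ``kretsyscall+`` [#ksyscall]_ | |
| ``BPF_PROG_TYPE_KPROBE`` | | ``uprobe+`` [#uprobe]_ | |
| ``BPF_PROG_TYPE_KPROBE`` | | ``uprobe.s+`` [#uprobe]_ | Yes |
| ``BPF_PROG_TYPE_KPROBE`` | | ``uretprobe+`` [#uprobe]_ | |
| ``BPF_PROG_TYPE_KPROBE`` | | ``uretprobe.s+`` [#uprobe]_ | Yes |
| ``BPF_PROG_TYPE_KPROBE`` | | ``usdt+`` [#usdt]_ | |
| ``BPF_PROG_TYPE_KPROBE`` | ``BPF_TRACE_KPROBE_MULTI`` | ``kprobe.multi+`` [#kpmulti]_ | |
| ``BPF_PROG_TYPE_KPROBE`` | ``BPF_TRACE_KPROBE_MULTI`` | ``kretprobe.multi+`` [#kpmulti]_ | |
| ``BPF_PROG_TYPE_LIRC_MODE2`` | ``BPF_LIRC_MODE2`` | ``lirc_mode2`` | |
| ``BPF_PROG_TYPE_LSM`` | ``BPF_LSM_CGROUP`` | ``lsm_cgroup+`` | |
| ``BPF_PROG_TYPE_LSM`` | ``BPF_LSM_MAC`` | ``lsm+`` [#lsm]_ | |
| ``BPF_PROG_TYPE_LSM`` | ``BPF_LSM_MAC`` | ``lsm.s+`` [#lsm]_ | Yes |
| ``BPF_PROG_TYPE_LWT_IN`` | | ``lwt_in`` | |
| ``BPF_PROG_TYPE_LWT_OUT`` | | ``lwt_out`` | |
| ``BPF_PROG_TYPE_LWT_SEG6LOCAL`` | | ``lwt_seg6local`` | |
| ``BPF_PROG_TYPE_LWT_XMIT`` | | ``lwt_xmit`` | |
| ``BPF_PROG_TYPE_PERF_EVENT`` | | ``perf_event`` | |
| ``BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE`` | | ``raw_tp.w+`` [#rawtp]_ | |
| ``BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE`` | | ``raw_tracepoint.w+`` | |
| ``BPF_PROG_TYPE_RAW_TRACEPOINT`` | | ``raw_tp+`` [#rawtp]_ | |
| ``BPF_PROG_TYPE_RAW_TRACEPOINT`` | | ``raw_tracepoint+`` | |
| ``BPF_PROG_TYPE_SCHED_ACT`` | | ``action`` | |
| ``BPF_PROG_TYPE_SCHED_CLS`` | | ``classifier`` | |
| ``BPF_PROG_TYPE_SCHED_CLS`` | | ``tc`` | |
| ``BPF_PROG_TYPE_SK_LOOKUP`` | ``BPF_SK_LOOKUP`` | ``sk_lookup`` | |
| ``BPF_PROG_TYPE_SK_MSG`` | ``BPF_SK_MSG_VERDICT`` | ``sk_msg`` | |
| ``BPF_PROG_TYPE_SK_REUSEPORT`` | ``BPF_SK_REUSEPORT_SELECT_OR_MIGRATE`` | ``sk_reuseport/migrate`` | |
| ``BPF_PROG_TYPE_SK_REUSEPORT`` | ``BPF_SK_REUSEPORT_SELECT`` | ``sk_reuseport`` | |
| ``BPF_PROG_TYPE_SK_SKB`` | | ``sk_skb`` | |
| ``BPF_PROG_TYPE_SK_SKB`` | ``BPF_SK_SKB_STREAM_PARSER`` | ``sk_skb/stream_parser`` | |
| ``BPF_PROG_TYPE_SK_SKB`` | ``BPF_SK_SKB_STREAM_VERDICT`` | ``sk_skb/stream_verdict`` | |
| ``BPF_PROG_TYPE_SOCKET_FILTER`` | | ``socket`` | |
| ``BPF_PROG_TYPE_SOCK_OPS`` | ``BPF_CGROUP_SOCK_OPS`` | ``sockops`` | |
| ``BPF_PROG_TYPE_STRUCT_OPS`` | | ``struct_ops+`` | |
| ``BPF_PROG_TYPE_SYSCALL`` | | ``syscall`` | Yes |
| ``BPF_PROG_TYPE_TRACEPOINT`` | | ``tp+`` [#tp]_ | |
| ``BPF_PROG_TYPE_TRACEPOINT`` | | ``tracepoint+`` [#tp]_ | |
| ``BPF_PROG_TYPE_TRACING`` | ``BPF_MODIFY_RETURN`` | ``fmod_ret+`` [#fentry]_ | |
| ``BPF_PROG_TYPE_TRACING`` | ``BPF_MODIFY_RETURN`` | ``fmod_ret.s+`` [#fentry]_ | Yes |
| ``BPF_PROG_TYPE_TRACING`` | ``BPF_TRACE_FENTRY`` | ``fentry+`` [#fentry]_ | |
| ``BPF_PROG_TYPE_TRACING`` | ``BPF_TRACE_FENTRY`` | ``fentry.s+`` [#fentry]_ | Yes |
| ``BPF_PROG_TYPE_TRACING`` | ``BPF_TRACE_FEXIT`` | ``fexit+`` [#fentry]_ | |
| ``BPF_PROG_TYPE_TRACING`` | ``BPF_TRACE_FEXIT`` | ``fexit.s+`` [#fentry]_ | Yes |
| ``BPF_PROG_TYPE_TRACING`` | ``BPF_TRACE_ITER`` | ``iter+`` [#iter]_ | |
| ``BPF_PROG_TYPE_TRACING`` | ``BPF_TRACE_ITER`` | ``iter.s+`` [#iter]_ | Yes |
| ``BPF_PROG_TYPE_TRACING`` | ``BPF_TRACE_RAW_TP`` | ``tp_btf+`` [#fentry]_ | |
| ``BPF_PROG_TYPE_XDP`` | ``BPF_XDP_CPUMAP`` | ``xdp.frags/cpumap`` | |
| ``BPF_PROG_TYPE_XDP`` | ``BPF_XDP_CPUMAP`` | ``xdp/cpumap`` | |
| ``BPF_PROG_TYPE_XDP`` | ``BPF_XDP_DEVMAP`` | ``xdp.frags/devmap`` | |
| ``BPF_PROG_TYPE_XDP`` | ``BPF_XDP_DEVMAP`` | ``xdp/devmap`` | |
| ``BPF_PROG_TYPE_XDP`` | ``BPF_XDP`` | ``xdp.frags`` | |
| ``BPF_PROG_TYPE_XDP`` | ``BPF_XDP`` | ``xdp`` | |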

下面是各种在 load 时自动 Attach event 的格式

  • [1] (1, 2, 3, 4, 5, 6, 7, 8) The fentry attach format is fentry[.s]/<function>
  • [2] (1, 2) The kprobe attach format is kprobe/<function>[+<offset>]. Valid characters for function are a-zA-Z0-9_. and offset must be a valid non-negative integer.
  • [3] (1, 2) The ksyscall attach format is ksyscall/<syscall>.
  • [4] (1, 2, 3, 4) The uprobe attach format is uprobe[.s]/<path>:<function>[+<offset>].
  • [5] The usdt attach format is usdt/<path>:<provider>:<name>.
  • [6] (1, 2) The kprobe.multi attach format is kprobe.multi/<pattern> where pattern supports * and ? wildcards. Valid characters for pattern are a-zA-Z0-9_.*?.
  • [7] (1, 2) The lsm attachment format is lsm[.s]/<hook>.
  • [8] (1, 2) The raw_tp attach format is raw_tracepoint[.w]/<tracepoint>.
  • [9] (1, 2) The tracepoint attach format is tracepoint/<category>/<name>.
  • [10] (1, 2) The iter attach format is iter[.s]/<struct-name>.

2 bpf/bpf_core_read.h 内存读取
#

eBPF Program tracing 存取内存是有限制的,需要使用 bpf_probe_read_*() helper func 或 libbpf 的封装函数,这是由于 eBPF verifier 一般不允许直接通过指针来读取内存内容。

对于传递给 eBPF event handler 的参数:

  1. struct 中 pointer field: 则需要先创建一个 eBPF 本地同类型指针变量,使用 bpf_probe_read 将 pointer 读取到该变量,然后再使用 bpf_probe_read 读取本地指针变量指向地址的内容;
  2. struct 中 array field: 则可以直接使用 bpf_probe_read 读取 array 包含的内容。

例如 tracepoint event handler 的参数类型是 struct trace_event_raw_XX *ctx,它是 vmlinux.h 提供的 struct 定义,其中绝大部分 field 是 array 类型。内核在调用该 handler 之前,已经对 struct trace_event_raw_XX *ctx 的各 field 赋值(这也是 tracepoint 不如 raw_tracepoint 性能高的原因),所以如果是 array 类型 field,则各字段可以直接读取,例如下面的 ctx->args[0] 和 ctx->args[1],甚至可以直接 memcpy array 字段。

struct bpf_raw_tracepoint_args {
    __u64 args[0];  // variable-length trailing array (declared with length 0)
};

SEC("raw_tracepoint/sys_enter")
int raw_tracepoint__sys_enter(struct bpf_raw_tracepoint_args *ctx)
{
	/*
	 * The layout of ctx->args follows TP_PROTO(struct pt_regs *regs, long id)
	 * of TRACE_EVENT_FN(sys_enter, ...):
	 *   args[0] - struct pt_regs * holding the syscall arguments
	 *   args[1] - syscall id
	 * Elements of the trailing array are read directly.
	 */
	unsigned long nr = ctx->args[1];

	/* Handle fchmodat only (syscall id 268); every other id falls through. */
	if (nr == 268) {
		struct pt_regs *regs = (struct pt_regs *) ctx->args[0];
		char pathname[256];

		/*
		 * PT_REGS_PARMn counts from 1: the second syscall argument is the
		 * path pointer, which is copied into the local buffer.
		 */
		char *user_path = (char *) PT_REGS_PARM2_CORE(regs);
		bpf_core_read_user_str(&pathname, sizeof(pathname), user_path);

		/* The third syscall argument is the mode. */
		u32 mode = (u32) PT_REGS_PARM3_CORE(regs);

		char fmt[] = "fchmodat %s %d\n";
		bpf_trace_printk(fmt, sizeof(fmt), &pathname, mode);
	}

	return 0;
}

但是如果读取的是 struct 中的 ptr 成员指向的内容,则一般使用如下 pattern:

  1. 先定义一个 eBPF 函数内的指针变量 Ptr;
  2. 使用 bpf_probe_read() 将待读取的指针保存到 Ptr;
  3. 再使用 bpf_probe_read() 从 Ptr 指向的地址读取数据。
// task is a kernel data structure.
struct task_struct *task = (struct task_struct *)bpf_get_current_task();
// Reading from a kernel address also has to go through bpf_probe_read; BPF_CORE_READ is a chained wrapper around it.
pid_t ppid = (pid_t)BPF_CORE_READ(task, real_parent, tgid);


// Most fields of the tracepoint ctx structs provided by vmlinux.h are arrays. The kernel fills in
// the fields of the tracepoint ctx struct before it calls the handler, so array-typed fields can
// be read directly, e.g. ctx->args[0] and ctx->args[1] below.
struct trace_event_raw_sys_enter {
	struct trace_entry ent;
	long int id;
	long unsigned int args[6];
	char __data[0];
};

SEC("tracepoint/syscalls/sys_enter_execve")
int tracepoint__syscalls__sys_enter_execve(struct trace_event_raw_sys_enter *ctx)
{
	//  execve 系统调用参数格式:
	// int execve(const char *filename, char *const argv[], char *const envp[]);

	// ctx->args 是 long unsigned int args[6]; 类型,故可以直接读取成员值。
	ret = bpf_probe_read(event->filename, TASK_FILENAME_LEN,
			     (const char *)ctx->args[0]);

	// 下面的 ctx->args[1] 指向命令行参数指针数组,数组以 NULL 结尾。
	// 所以,需要先用 bpf_probe_read 读取命令行参数指针,存入 argp,
	// 然后再使用 bpf_probe_read 从 argp 读取参数字符串。

	const char **args = (const char **)(ctx->args[1]);
#pragma unroll
	for (int i = 1; i < 10; i++) {
		const char *argp = NULL;
		// 1. 读取 args[i] 字符串地址,存入 argp。
		ret = bpf_probe_read(&argp, sizeof(argp), &args[i]);

		// 向 event->args 数组写入前,需要做边界检查,否则 eBPF verifier 失败。
		if (event->args_size > LAST_ARG) {
			goto output;
		}

		// 2. 从 argp 读取字符串。
		ret = bpf_probe_read(&event->args[event->args_size],
				     ARGSIZE - 2, argp);
		event->args_size += ARGSIZE - 2;

		// 在 read 的数据块末尾添加两个特殊的字符串标记 '!!', 用于后续用户空间切割。
		//
		// 边界检查的 index 必须是变量,不能是表达式(如 event->args_size + 1),如果需要表达式,则需
                // 要先对 index 值计算,然后再用它做检查和设置。
		//
		// 1. 先计算要写的数组 index;
		event->args_size += 1;
		// 2. 写之前对 index 进行边界检查;
		if (event->args_size > FULL_MAX_ARGS_ARR) {
			goto output;
		}
		// 3. 实际写数组。
		event->args[event->args_size] = '!';
// 。。。
	}

Q:如何区分要读取的指针是 user space 还是 kernel space?

A:当对 syscall 内核函数进行插桩时,函数的指针参数可能指向 user space 内容,具体可以根据 syscall 或函数签名的指针参数部分 是否包含 __user 限定符来判断

  • 系统调用都是使用 SYSCALL_DEFINEX 宏函数来定义(X 表示系统调用参数的数量),可以作为快速过滤条件;
// fs/exec.c
/*
 * execve defined through SYSCALL_DEFINE3 (three parameters). All three pointer
 * parameters carry the __user qualifier. The body converts filename with
 * getname() and hands everything to do_execve().
 */
SYSCALL_DEFINE3(execve,
		const char __user *, filename,
		const char __user *const __user *, argv,
		const char __user *const __user *, envp)
{
	return do_execve(getname(filename), argv, envp);
}

// fs/exec.c
/*
 * Wraps the raw __argv/__envp pointers (both still __user-qualified) into
 * struct user_arg_ptr values and forwards to do_execveat_common() with
 * AT_FDCWD and flags 0. filename is a struct filename * without __user.
 */
int do_execve(struct filename *filename,
	const char __user *const __user *__argv,
	const char __user *const __user *__envp)
{
	struct user_arg_ptr argv = { .ptr.native = __argv };
	struct user_arg_ptr envp = { .ptr.native = __envp };
	return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
}

2.1 内核 eBPF bpf_probe_read 系列 helper func
#

5.5 内核版本之前,内核 eBPF 只提供了两个通用的 bpf_probe_read 和 bpf_probe_read_str helper func,可以同时用来读取内核和用户地址空间的内容。

从 5.5 内核版本开始,内核 eBPF 开始区分 user 和 kernel 地址空间的内容,并提供了两类专用函数:

  • bpf_probe_read_user/bpf_probe_read_user_str: 只读取用户空间地址内容,可能会 page fault,返回 -EFAULT;
  • bpf_probe_read_kernel/bpf_probe_read_kernel_str: 只读取内核空间地址内容;

libbpf 会做兼容,即 C Kernel 程序中如果使用了 bpf_probe_read_user* 和 bpf_probe_read_kernel* 函数,在老内核情况下,会自动转换为 bpf_probe_read 或 bpf_probe_read_str helper func。

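  • FIXME!!! 这个是编译时还是运行时转换?如果是编译时转换,可能在高版本内核上开发编译,在低版本内核运行不起来的问题?(注:libbpf 是在加载阶段做转换的——load 时先探测目标内核是否支持 bpf_probe_read_kernel,不支持时才把指令中的 helper 改写为 bpf_probe_read/bpf_probe_read_str,因此同一份编译产物可以在低版本内核上运行。)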

4.19 内核的 eBPF bpf_probe helper 函数:

  • bpf_probe_read_str 有 BUG 会导致 CPU Core soft lockup,系统卡死,实际只能使用 bpf_probe_read;
# zgrep bpf_probe /proc/kallsyms
ffffffff811bb360 T bpf_probe_read
ffffffff811bb3a0 T bpf_probe_write_user
ffffffff811bb3f0 T bpf_probe_read_str
ffffffff811bc600 T bpf_probe_register
ffffffff811bc630 T bpf_probe_unregister
ffffffff81e457c0 r bpf_probe_read_str_proto
ffffffff81e45980 r bpf_probe_write_user_proto
ffffffff81e459c0 r bpf_probe_read_proto

5.15 内核的 eBPF bpf_probe helper 函数,对于 user 和 kernel,分别有对应的读取函数:

  • user:bpf_probe_read_user/bpf_probe_read_user_str
  • kernel: bpf_probe_read_kernel/bpf_probe_read_kernel_str
root@lima-ebpf-dev:/hello-ebpf/process# zgrep bpf_probe /proc/kallsyms
ffffffff9f42abd0 T bpf_probe_read_user
ffffffff9f42ac20 T bpf_probe_read_user_str
ffffffff9f42ac70 T bpf_probe_read_kernel
ffffffff9f42acc0 T bpf_probe_read_kernel_str
ffffffff9f42ad10 T bpf_probe_write_user
ffffffff9f42bbd0 T bpf_probe_read_compat_str
ffffffff9f42bc70 T bpf_probe_read_compat
ffffffff9f42d900 T bpf_probe_register
ffffffff9f42d950 T bpf_probe_unregister
ffffffffa0434040 d bpf_probe_write_user_proto
ffffffffa04340a0 d bpf_probe_read_compat_str_proto
ffffffffa0434100 d bpf_probe_read_compat_proto
ffffffffa0434160 D bpf_probe_read_kernel_str_proto
ffffffffa04341c0 D bpf_probe_read_kernel_proto
ffffffffa0434220 D bpf_probe_read_user_str_proto
ffffffffa0434280 D bpf_probe_read_user_proto

2.2 CO-RE bpf_core_read 系列和 BPF_CORE_READ/BPF_PROBE_READ
#

bpf/bpf_core_read.h 头文件提供了 CO-RE 方式访问内存的函数,是 libbpf 对 eBPF bpf_probe_read 系列 helper func 的封装。主要优势是能根据 target kernel 的 BTF 类型和编译时的 original local BTF 类型,来做 offset relocation。这样解决了 bpf_probe_read_XX 依赖于内核版本实现的问题,更通用。

这些 CO-RE 函数都以 bpf_core_read 开头,区别于内核的 bpf_probe_read 开头的 helper func。

读取内核空间指针地址内容:

  • bpf_core_read:底层调用 bpf_probe_read_kernel
  • bpf_core_read_str:底层调用 bpf_probe_read_kernel_str

读取用户空间指针地址内容:

  • bpf_core_read_user:底层调用 bpf_probe_read_user
  • bpf_core_read_user_str:底层调用 bpf_probe_read_user_str
// https://github.com/libbpf/libbpf/blob/a6d7530cb7dff87ac1e64a540e63b67ddde2e0f9/src/bpf_core_read.h#L230

/*
 * bpf_core_read() abstracts away bpf_probe_read_kernel() call and captures offset relocation for source
 * address using __builtin_preserve_access_index() built-in, provided by Clang.
 *
 * __builtin_preserve_access_index() takes as an argument an expression of taking an address of a field within
 * struct/union. It makes compiler emit a relocation, which records BTF type ID describing root struct/union
 * and an accessor string which describes exact embedded field that was used to take an address. See detailed
 * description of this relocation format and semantics in comments to struct bpf_field_reloc in
 * libbpf_internal.h.
 *
 * This relocation allows libbpf to adjust BPF instruction to use correct actual field offset, based on target
 * kernel BTF type that matches original (local) BTF, used to record relocation.
 */
#define bpf_core_read(dst, sz, src)					    \
	bpf_probe_read_kernel(dst, sz, (const void *)__builtin_preserve_access_index(src))

/* NOTE: see comments for BPF_CORE_READ_USER() about the proper types use. */
#define bpf_core_read_user(dst, sz, src)				    \
	bpf_probe_read_user(dst, sz, (const void *)__builtin_preserve_access_index(src))
/*
 * bpf_core_read_str() is a thin wrapper around bpf_probe_read_str()
 * additionally emitting BPF CO-RE field relocation for specified source
 * argument.
 */
#define bpf_core_read_str(dst, sz, src)					    \
	bpf_probe_read_kernel_str(dst, sz, (const void *)__builtin_preserve_access_index(src))

/* NOTE: see comments for BPF_CORE_READ_USER() about the proper types use. */
#define bpf_core_read_user_str(dst, sz, src)				    \
	bpf_probe_read_user_str(dst, sz, (const void *)__builtin_preserve_access_index(src))

使用举例:

struct task_struct *task = (void *)bpf_get_current_task();
struct task_struct *parent_task;
int err;

err = bpf_core_read(&parent_task, sizeof(void *), &task->parent);
if (err) {
    /* handle error */
}

/* parent_task contains the value of task->parent pointer */


struct my_kernel_type {
    const char *name;
    char type[32];
};

struct my_kernel_type *t = ...;
const char *p;
char str[32];

// 对于 struct 中的指针 field,需要先 read 到本地指针,然后再使用本地指针 read 实际内容。
/* get string pointer, CO-RE-relocatable */
bpf_core_read(&p, sizeof(p), &t->name);
/* read the string, non-CO-RE-relocatable, pointer is valid regardless */
bpf_probe_read_kernel_str(str, sizeof(str), p);

// 对于 struct 中的数组 field,由于数组内容直接内嵌在 struct 中,可以直接对该 field 的地址使用 bpf_core_read_str 读取字符串。
/* read string as CO-RE-relocatable */
bpf_core_read_str(str, sizeof(str), &t->type);

为了方便链式调用,bpf/bpf_core_read.h 头文件还提供了 BPF_CORE_READ 和 BPF_PROBE_READ 开头的各种函数宏:

  • BPF_CORE_READ*:
  • BPF_CORE_READ_USER*
  • BPF_PROBE_READ*:
  • BPF_PROBE_READ_USER*:
// https://github.com/libbpf/libbpf/blob/a6d7530cb7dff87ac1e64a540e63b67ddde2e0f9/src/bpf_core_read.h#L346
     70:#define BPF_CORE_READ_BITFIELD_PROBED(s, field) ({			      \
     88:#define BPF_CORE_READ_BITFIELD(s, field) ({				      \

    442:#define BPF_CORE_READ(src, a, ...) ({					    \
    350:#define BPF_CORE_READ_INTO(dst, src, a, ...) ({				    \
    386:#define BPF_CORE_READ_STR_INTO(dst, src, a, ...) ({			    \

    458:#define BPF_CORE_READ_USER(src, a, ...) ({				    \
    360:#define BPF_CORE_READ_USER_INTO(dst, src, a, ...) ({			    \
    396:#define BPF_CORE_READ_USER_STR_INTO(dst, src, a, ...) ({		    \

    465:#define BPF_PROBE_READ(src, a, ...) ({					    \
    366:#define BPF_PROBE_READ_INTO(dst, src, a, ...) ({			    \
    402:#define BPF_PROBE_READ_STR_INTO(dst, src, a, ...) ({			    \

    376:#define BPF_PROBE_READ_USER_INTO(dst, src, a, ...) ({			    \
    413:#define BPF_PROBE_READ_USER_STR_INTO(dst, src, a, ...) ({		    \
    477:#define BPF_PROBE_READ_USER(src, a, ...) ({				    \

参考:

  1. https://nakryiko.com/posts/bpf-core-reference-guide/#bpf-core-read

3 bpf/bpf_tracing.h PT_REGS_XXX 和 BPF_KPROBE 函数宏
#

k[ret]probe/u[ret]probe 类型的 eBPF Program Type 用于对内核或用户函数进行插桩, 内核或用户函数的参数是保存在 struct pt_regs 中的,这些类型的 event handler func 参数固定为 struct pt_regs *ctx,可以使用 <bpf/bpf_tracing.h> 提供的 PT_REGS_PARM[1-5] 函数宏来从 ctx 中提取内核或用户函数的参数,可能是指针或数值,具体取决于函数的签名.

PT_REGS_PARM[1-5], 也即从 1 开始计数,对应内核函数的第 1 个参数。

3.1 操作 struct pt_regs 的各种 PT_REGS_XXX 宏函数
#

bpf/bpf_tracing.h 头文件定义了 操作 struct pt_regs 的各种通用 PT_REGS_XXX 宏函数

  • 使用 PT_REGS_XXX 宏函数, 可以屏蔽各种体系结构的差异

struct pt_regs* 主要在 k[ret]probe/u[ret]probe 场景使用,它们的 event handler func 的 ctx 是 struct pt_regs* 类型。

struct pt_regs 是体系结构相关的 例如 x86_64 的定义:arch/x86/include/asm/ptrace.h

// arch/x86/include/asm/ptrace.h

struct pt_regs {
/*
 * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
 * unless syscall needs a complete, fully filled "struct pt_regs".
 */
	unsigned long r15;
	unsigned long r14;
	unsigned long r13;
	unsigned long r12;
	unsigned long bp;
	unsigned long bx;
/* These regs are callee-clobbered. Always saved on kernel entry. */
	unsigned long r11;
	unsigned long r10;
	unsigned long r9;
	unsigned long r8;
	unsigned long ax;
	unsigned long cx;
	unsigned long dx;
	unsigned long si;
	unsigned long di;
/*
 * On syscall entry, this is syscall#. On CPU exception, this is error code.
 * On hw interrupt, it's IRQ number:
 */
	unsigned long orig_ax;
/* Return frame for iretq */
	unsigned long ip;
	unsigned long cs;
	unsigned long flags;
	unsigned long sp;
	unsigned long ss;
/* top of stack page */
};

libbpf 中的 bpf_tracing.h (对应 <bpf/bpf_tracing.h>) 定义了各体系结构对应的寄存器名称宏,如:

  • __PT_PARM1_REG、__PT_PARM2_REG …… __PT_PARM5_REG:表示保存函数第 1-5 个参数的寄存器名称(x86_64 系统调用的第 4 个参数使用 r10,见下面的 PT_REGS_PARM4_SYSCALL)。
// libbpf/src/bpf_tracing.h, 对应 C 中的 <bpf/bpf_tracing.h> 头文件

// x86 和 x86_64 都对应 bpf_target_x86
/* Fall back to what the compiler says */
#if defined(__x86_64__)
	#define bpf_target_x86
	#define bpf_target_defined



#if defined(bpf_target_x86)

#if defined(__KERNEL__) || defined(__VMLINUX_H__) // 内核代码部分

#define __PT_PARM1_REG di
#define __PT_PARM2_REG si
#define __PT_PARM3_REG dx
#define __PT_PARM4_REG cx
#define __PT_PARM5_REG r8
#define __PT_RET_REG sp
#define __PT_FP_REG bp
#define __PT_RC_REG ax
#define __PT_SP_REG sp
#define __PT_IP_REG ip
/* syscall uses r10 for PARM4 */
#define PT_REGS_PARM4_SYSCALL(x) ((x)->r10)
#define PT_REGS_PARM4_CORE_SYSCALL(x) BPF_CORE_READ(x, r10)

#else // 用户空间部分

#ifdef __i386__  // x86

#define __PT_PARM1_REG eax
#define __PT_PARM2_REG edx
#define __PT_PARM3_REG ecx
/* i386 kernel is built with -mregparm=3 */
#define __PT_PARM4_REG __unsupported__
#define __PT_PARM5_REG __unsupported__
#define __PT_RET_REG esp
#define __PT_FP_REG ebp
#define __PT_RC_REG eax
#define __PT_SP_REG esp
#define __PT_IP_REG eip

#else /* __i386__ */   // x86_64 的如下:

#define __PT_PARM1_REG rdi
#define __PT_PARM2_REG rsi
#define __PT_PARM3_REG rdx
#define __PT_PARM4_REG rcx
#define __PT_PARM5_REG r8
#define __PT_RET_REG rsp
#define __PT_FP_REG rbp
#define __PT_RC_REG rax
#define __PT_SP_REG rsp
#define __PT_IP_REG rip
/* syscall uses r10 for PARM4 */
#define PT_REGS_PARM4_SYSCALL(x) ((x)->r10)
#define PT_REGS_PARM4_CORE_SYSCALL(x) BPF_CORE_READ(x, r10)

#endif /* __i386__ */

#endif /* __KERNEL__ || __VMLINUX_H__ */

然后基于上面体系结构相关的寄存器名称宏,定义 通用的对 struct pt_regs * ctx 操作的寄存器宏函数

根据传入的 struct pt_regs * ctx 的来源不同,提供两组 PT_REGS_PARM* 宏函数:

  1. 第一组:适用于 struct pt_regs *ctx 是内核对 event handler 直接填充的,例如 k[ret]probe/u[ret]probe 等 event handler func 的 ctx 直接是 struct pt_regs * ctx 的情况:

  2. PT_REGS_PARM1-5: 提取 struct pt_regs * ctx 第 1-5 函数参数值。

  3. PT_REGS_PARM[1-5]_SYSCALL(ctx): 默认等效为 PT_REGS_PARM[1-5],个别体系结构会覆盖(如 x86_64 系统调用的第 4 个参数使用 r10 而不是 cx);

  4. 第二组:适用于 struct pt_regs * ctx 不是内核对 event handler 直接填充的,而是从 event handler 其他类型 ctx 中间接获取的情况 ,如 raw tracepoint 场景:

  5. PT_REGS_PARM[1-5]_CORE(ctx): 提取 struct pt_regs * ctx 第 1-5 函数参数值。

  6. PT_REGS_PARM[1-5]_CORE_SYSCALL(ctx): 默认等效为 PT_REGS_PARM[1-5]_CORE,个别体系结构会覆盖(如 x86_64 系统调用的第 4 个参数使用 r10 而不是 cx);

  7. https://lore.kernel.org/bpf/[email protected]/

// libbpf/src/bpf_tracing.h, 对应 C 中的 <bpf/bpf_tracing.h> 头文件

#if defined(bpf_target_defined)

struct pt_regs;  // 参考前面内核中的定义,如 arch/x86/include/asm/ptrace.h

/* allow some architecutres to override `struct pt_regs` */
#ifndef __PT_REGS_CAST
#define __PT_REGS_CAST(x) (x)
#endif

// 直接从 struct pt_regs 中读取寄存器值,结果为参数值(可能是指针或数值)
#define PT_REGS_PARM1(x) (__PT_REGS_CAST(x)->__PT_PARM1_REG)
#define PT_REGS_PARM2(x) (__PT_REGS_CAST(x)->__PT_PARM2_REG)
#define PT_REGS_PARM3(x) (__PT_REGS_CAST(x)->__PT_PARM3_REG)
#define PT_REGS_PARM4(x) (__PT_REGS_CAST(x)->__PT_PARM4_REG)
#define PT_REGS_PARM5(x) (__PT_REGS_CAST(x)->__PT_PARM5_REG)
#define PT_REGS_RET(x) (__PT_REGS_CAST(x)->__PT_RET_REG)
#define PT_REGS_FP(x) (__PT_REGS_CAST(x)->__PT_FP_REG)
#define PT_REGS_RC(x) (__PT_REGS_CAST(x)->__PT_RC_REG)
#define PT_REGS_SP(x) (__PT_REGS_CAST(x)->__PT_SP_REG)
#define PT_REGS_IP(x) (__PT_REGS_CAST(x)->__PT_IP_REG)

// 通过 BPF_CORE_READ(bpf_probe_read_kernel + CO-RE 重定位)从 struct pt_regs 中读取寄存器值,适用于不能直接解引用的 struct pt_regs 指针。
#define PT_REGS_PARM1_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM1_REG)
#define PT_REGS_PARM2_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM2_REG)
#define PT_REGS_PARM3_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM3_REG)
#define PT_REGS_PARM4_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM4_REG)
#define PT_REGS_PARM5_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM5_REG)
#define PT_REGS_RET_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_RET_REG)
#define PT_REGS_FP_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_FP_REG)
#define PT_REGS_RC_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_RC_REG)
#define PT_REGS_SP_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_SP_REG)
#define PT_REGS_IP_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_IP_REG)


// 以下是系统调用的参数对应的函数宏。
#ifndef PT_REGS_PARM1_SYSCALL
#define PT_REGS_PARM1_SYSCALL(x) PT_REGS_PARM1(x)
#endif
#define PT_REGS_PARM2_SYSCALL(x) PT_REGS_PARM2(x)
#define PT_REGS_PARM3_SYSCALL(x) PT_REGS_PARM3(x)
#ifndef PT_REGS_PARM4_SYSCALL
#define PT_REGS_PARM4_SYSCALL(x) PT_REGS_PARM4(x)
#endif
#define PT_REGS_PARM5_SYSCALL(x) PT_REGS_PARM5(x)

#ifndef PT_REGS_PARM1_CORE_SYSCALL
#define PT_REGS_PARM1_CORE_SYSCALL(x) PT_REGS_PARM1_CORE(x)
#endif
#define PT_REGS_PARM2_CORE_SYSCALL(x) PT_REGS_PARM2_CORE(x)
#define PT_REGS_PARM3_CORE_SYSCALL(x) PT_REGS_PARM3_CORE(x)
#ifndef PT_REGS_PARM4_CORE_SYSCALL
#define PT_REGS_PARM4_CORE_SYSCALL(x) PT_REGS_PARM4_CORE(x)
#endif
#define PT_REGS_PARM5_CORE_SYSCALL(x) PT_REGS_PARM5_CORE(x)

另外, 在使用 cilium/ebpf 的 go generate 来生成 eBPF go 代码文件时, 如果使用了体系结构实现相关的 PT_REGS_PARM* 宏, 需要在 go generate 中明确指定 -target 参数值为对应的 GOARCH 环境变量, 否则编译时报错: “the eBPF is using target specific macros, please provide -target that is not bpf, bpfel or bpfeb”

//go:generate go run github.com/cilium/ebpf/cmd/bpf2go -cc $BPF_CLANG -cflags "-O2 -g" --target=$GOARCH bpf nettuple.bpf.c -- -I ./ -I../headers -g

如果没有使用这些体系结构宏, 则可以不指定 -target, 这样同时生成两个 bpfel 和 bpfeb 的 go 文件.

从 v4.17 版本开始,内核默认开启了 CONFIG_ARCH_HAS_SYSCALL_WRAPPER , 则系统调用宏函数 SYSCALL_DEFINEx 将使用文件 asm/syscall_wrapper.h 中定义的 SYSCALL_DEFINE0() 和 __SYSCALL_DEFINEx() 实现:

// arch/x86/include/asm/syscall_wrapper.h

// __x64_sys_*(const struct pt_regs *regs) -> __se_sys_*(type1 name1,type2 name2) -> __do_sys_*(type1 name1,type2 name2)

/*
 * Instead of the generic __SYSCALL_DEFINEx() definition, this macro takes
 * struct pt_regs *regs as the only argument of the syscall stub named
 * __x64_sys_*(). It decodes just the registers it needs and passes them on to
 * the __se_sys_*() wrapper performing sign extension and then to the
 * __do_sys_*() function doing the actual job. These wrappers and functions
 * are inlined (at least in very most cases), meaning that the assembly looks
 * as follows (slightly re-ordered for better readability):
 *
 * <__x64_sys_recv>:		<-- syscall with 4 parameters
 *	callq	<__fentry__>
 *
 *	mov	0x70(%rdi),%rdi	<-- decode regs->di
 *	mov	0x68(%rdi),%rsi	<-- decode regs->si
 *	mov	0x60(%rdi),%rdx	<-- decode regs->dx
 *	mov	0x38(%rdi),%rcx	<-- decode regs->r10
 *
 *	xor	%r9d,%r9d	<-- clear %r9
 *	xor	%r8d,%r8d	<-- clear %r8
 *
 *	callq	__sys_recvfrom	<-- do the actual work in __sys_recvfrom()
 *				    which takes 6 arguments
 *
 *	cltq			<-- extend return value to 64-bit
 *	retq			<-- return
 *
 * This approach avoids leaking random user-provided register content down
 * the call chain.
 *
 * If IA32_EMULATION is enabled, this macro generates an additional wrapper
 * named __ia32_sys_*() which decodes the struct pt_regs *regs according
 * to the i386 calling convention (bx, cx, dx, si, di, bp).
 */
#define __SYSCALL_DEFINEx(x, name, ...)					\
	asmlinkage long __x64_sys##name(const struct pt_regs *regs);	\
	ALLOW_ERROR_INJECTION(__x64_sys##name, ERRNO);			\
	static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__));	\
	static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\
	asmlinkage long __x64_sys##name(const struct pt_regs *regs)	\
	{								\
		return __se_sys##name(SC_X86_64_REGS_TO_ARGS(x,__VA_ARGS__));\
	}								\
	__IA32_SYS_STUBx(x, name, __VA_ARGS__)				\
	static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__))	\
	{								\
		long ret = __do_sys##name(__MAP(x,__SC_CAST,__VA_ARGS__));\
		__MAP(x,__SC_TEST,__VA_ARGS__);				\
		__PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__));	\
		return ret;						\
	}								\
	static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))
// __do_sys##name 只有一个函数头,函数体是后面使用类似于 SYSCALL_DEFINE5 实际定义的系统调用函数 body。


// include/linux/syscalls.h
#ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER
/*
 * It may be useful for an architecture to override the definitions of the
 * SYSCALL_DEFINE0() and __SYSCALL_DEFINEx() macros, in particular to use a
 * different calling convention for syscalls. To allow for that, the prototypes
 * for the sys_*() functions below will *not* be included if
 * CONFIG_ARCH_HAS_SYSCALL_WRAPPER is enabled.
 */
#include <asm/syscall_wrapper.h>
#endif /* CONFIG_ARCH_HAS_SYSCALL_WRAPPER */

...

#define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE4(name, ...) SYSCALL_DEFINEx(4, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE5(name, ...) SYSCALL_DEFINEx(5, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE6(name, ...) SYSCALL_DEFINEx(6, _##name, __VA_ARGS__)

#define SYSCALL_DEFINE_MAXARGS	6

#define SYSCALL_DEFINEx(x, sname, ...)				\
	SYSCALL_METADATA(sname, x, __VA_ARGS__)			\
	__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)

以 execveat 系统调用为例,它的 SYSCALL_DEFINEx 定义如下:

// fs/exec.c
// 使用上面 __SYSCALL_DEFINEx 定义的 wrapper 实现:
SYSCALL_DEFINE5(execveat,
		int, fd, const char __user *, filename,
		const char __user *const __user *, argv,
		const char __user *const __user *, envp,
		int, flags)
{
	int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;

	return do_execveat(fd,
			   getname_flags(filename, lookup_flags, NULL),
			   argv, envp, flags);
}

int do_execveat(int fd, struct filename *filename,
		const char __user *const __user *__argv,
		const char __user *const __user *__envp,
		int flags)
{
	struct user_arg_ptr argv = { .ptr.native = __argv };
	struct user_arg_ptr envp = { .ptr.native = __envp };

	return do_execveat_common(fd, filename, argv, envp, flags);
}

static int do_execveat_common(int fd, struct filename *filename,
			      struct user_arg_ptr argv,
			      struct user_arg_ptr envp,
			      int flags)
{
	return __do_execve_file(fd, filename, argv, envp, flags, NULL);
}

static int __do_execve_file(int fd, struct filename *filename,
			    struct user_arg_ptr argv,
			    struct user_arg_ptr envp,
			    int flags, struct file *file)
{
	char *pathbuf = NULL;
	struct linux_binprm *bprm;
	struct files_struct *displaced;
	int retval;

	if (IS_ERR(filename))
		return PTR_ERR(filename);
...
}

SYSCALL_DEFINE5(execveat, ...) 函数宏产生的效果:

  • 对外暴露的系统调用函数为 long __x64_sys_execveat(const struct pt_regs *regs);
    • 内层还有两个函数, 都是不对外暴露的 static 类型:
      • static long __se_sys_execveat(long fd, long filename, …); 参数统一为 long 类型(__SC_LONG),用于做符号扩展;
      • static inline long __do_sys_execveat(int fd, const char __user *filename, …); 参数即系统调用的原始参数列表(__SC_DECL),包含用户空间指针;
  • 内部只对外暴露一层函数:
    • int do_execveat(int fd, struct filename *filename, xxx);
// 下面这三个函数都是 SYSCALL_DEFINE5 内部生成的
asmlinkage long __x64_sys_execveat(const struct pt_regs *regs) // 暴露
// ->
static long __se_sys_execveat(long fd, long filename, long argv, long envp, long flags)
// ->
static inline long __do_sys_execveat(int fd, const char __user *filename,
		const char __user *const __user *argv,
		const char __user *const __user *envp,
		int flags)

// 下面这些函数是 SYSCALL_DEFINE5 内部调用的
// ->
int do_execveat(int fd, struct filename *filename,            // 暴露
		const char __user *const __user *__argv,
		const char __user *const __user *__envp,
		int flags)
// ->
static int do_execveat_common(int fd, struct filename *filename,
			      struct user_arg_ptr argv,
			      struct user_arg_ptr envp,
			      int flags)
// ->
static int __do_execve_file(int fd, struct filename *filename,
			    struct user_arg_ptr argv,
			    struct user_arg_ptr envp,
			    int flags, struct file *file)

execveat 系统调用在 x86_64 的 syscall_64.tbl 列表中暴露,入口是生成的 __x64_sys_execveat 函数:

  • __x64_sys 是固定的前缀。
// arch/x86/entry/syscalls/syscall_64.tbl
#
# 64-bit system call numbers and entry vectors
#
# The format is:
# <number> <abi> <name> <entry point>
#
# The __x64_sys_*() stubs are created on-the-fly for sys_*() system calls
#
# The abi is "common", "64" or "x32" for this file.
#
0	common	read			__x64_sys_read
1	common	write			__x64_sys_write
...
322	64	execveat		__x64_sys_execveat/ptregs
545	x32	execveat		__x32_compat_sys_execveat/ptregs

最后,可以被 kprobe/tracepoint 捕获的函数如下:

  • kprobe: __ia32_compat_sys_execveat,__ia32_sys_execveat,__x64_sys_execveat,do_execveat
  • tracepoint: syscalls:sys_enter_execveat,syscalls:sys_exit_execveat
[[email protected] /root]
#bpftrace -l "*execveat*" |grep -E 'kprobe|trace'
tracepoint:syscalls:sys_enter_execveat
tracepoint:syscalls:sys_exit_execveat
kprobe:__ia32_compat_sys_execveat
kprobe:__ia32_sys_execveat
kprobe:__x64_sys_execveat
kprobe:do_execveat

总结:

  1. 内核从 4.17 默认开启了 CONFIG_ARCH_HAS_SYSCALL_WRAPPER 后,导致直接对外暴露的系统调用的名称都 以体系结构名称前缀开始且只有一个 const struct pt_regs *regs 参数 ,例如:
    • __ia32_compat_sys_execveat(const struct pt_regs *regs)
    • __ia32_sys_execveat(const struct pt_regs *regs)
    • __x64_sys_execveat(const struct pt_regs *regs)
    • __arm64_sys_execveat(const struct pt_regs *regs)
  2. 按惯例,SYSCALL_DEFINEx 内部会调用名为 do_XX() 的对外暴露函数,作为 XX 系统调用的具体实现,例如: do_execveat(), 这些函数的参数是系统调用的函数参数列表:
# bpftrace -l "kprobe:do*" |grep 'execve'
kprobe:do_execve_file
kprobe:do_execve
kprobe:do_execveat

BCC 如何处理 CONFIG_ARCH_HAS_SYSCALL_WRAPPER

// handler 函数必须以 syscall__ 开头,后面的名称可以随意自定义(不要求一定 syscall name 相同)。
int syscall__recvfrom_entry(struct pt_regs* ctx)
{
    u64 tgid = bpf_get_current_pid_tgid();
    u32 pid = tgid >> 32;
    FILTER_PID
    void* buf = (void*) PT_REGS_PARM2(ctx);
    bpf_trace_printk("buffer addr %llx", buf);
    recv_map.update(&tgid, &buf);
    return 0;
}
...
// The kernel symbol of a syscall is ARCH specific, so it must be resolved with
// bpf.get_syscall_fnname("recvfrom") instead of being hard-coded.
// The second argument is the name of the BPF handler function and has to match
// its definition exactly, including the mandatory "syscall__" prefix.
attach_res = bpf.attach_kprobe(bpf.get_syscall_fnname("recvfrom"), "syscall__recvfrom_entry");
if(!attach_res.ok()) {
        LOG_ERROR("attach_kprobe recvfrom_entry failed, error: %s", attach_res.msg().c_str());
        return;
}

libbpf 如何处理 CONFIG_ARCH_HAS_SYSCALL_WRAPPER

  • SEC Name 使用 ksyscall/ 前缀,指定 ARCH 无关的系统调用名称;
  • handler func 使用宏函数:BPF_KSYSCALL/BPF_KPROBE_SYSCALL(两者互为别名,相互等价)
    • 自动感知和处理 CONFIG_ARCH_HAS_SYSCALL_WRAPPER;
// 使用 kprobe 插桩系统调用函数时,函数名称是 ARCH 相关的,不具有通用性。
SEC("kprobe/sys_execve") // 未开启 CONFIG_ARCH_HAS_SYSCALL_WRAPPER
SEC("kprobe/__ia32_sys_execve") // 开启 CONFIG_ARCH_HAS_SYSCALL_WRAPPER
SEC("kprobe/__arm64_sys_execve")
SEC("kprobe/__x64_sys_execve")

// 使用 ksyscall 前缀,可以指定 ARCH 无关的系统调用名称。
SEC("ksyscall/execve")
// 使用 BPF_KPROBE_SYSCALL 函数宏,收益:
// 1. 可以直接按照 syscall 参数列表来指定各参数类型和名称;
// 2. BPF_KPROBE_SYSCALL 内部自动兼容 CONFIG_ARCH_HAS_SYSCALL_WRAPPER 开启与否;
//    + 未开启:直接从 struct pt_regs *ctx 中读取各系统调用参数;
//    + 开启:先使用 (struct pt_regs *)PT_REGS_PARM1(ctx) 来获取 regs,然后再使用
//    PT_REGS_PARM*_CORE_SYSCALL(regs) 从 regs 中读取各系统调用参数。
int BPF_KPROBE_SYSCALL(hello, const char *pathname)
{
   struct data_t data = {};
   struct user_msg_t *p;

   data.pid = bpf_get_current_pid_tgid() >> 32;
   data.uid = bpf_get_current_uid_gid() & 0xFFFFFFFF;

   bpf_get_current_comm(&data.command, sizeof(data.command));
   bpf_probe_read_user_str(&data.path, sizeof(data.path), pathname);

   p = bpf_map_lookup_elem(&my_config, &data.uid);
   if (p != 0) {
      bpf_probe_read_kernel_str(&data.message, sizeof(data.message), p->message);
   } else {
      bpf_probe_read_kernel_str(&data.message, sizeof(data.message), message);
   }

   // 还可以继续使用 ctx,类型为原始的 struct pt_regs *.
   bpf_perf_event_output(ctx, &output, BPF_F_CURRENT_CPU, &data, sizeof(data));
   return 0;
}

// libbpf ksyscall 是在使用 libbpf 来做 attach 时自动处理的。(所以 cilium 不支持)
// https://github.com/libbpf/libbpf/blob/1728e3e4bef0e138ea95ffe62163eb9a6ac6fa32/src/libbpf.c#L10379
struct bpf_link *bpf_program__attach_ksyscall(const struct bpf_program *prog,
					      const char *syscall_name,
					      const struct bpf_ksyscall_opts *opts)
{
	LIBBPF_OPTS(bpf_kprobe_opts, kprobe_opts);
	char func_name[128];

	if (!OPTS_VALID(opts, bpf_ksyscall_opts))
		return libbpf_err_ptr(-EINVAL);

	// 内核支持 FEAT_SYSCALL_WRAPPER 的情况下,自动给 syscall_name 添加 ARCH 前缀,如 __x64, __arm64,
	// 从而 attach 到 ARCH 相关的系统调用入口函数(如 __x64_sys_execve)。
	if (kernel_supports(prog->obj, FEAT_SYSCALL_WRAPPER)) {
		/* arch_specific_syscall_pfx() should never return NULL here
		 * because it is guarded by kernel_supports(). However, since
		 * compiler does not know that we have an explicit conditional
		 * as well.
		 */
		snprintf(func_name, sizeof(func_name), "__%s_sys_%s",
			 arch_specific_syscall_pfx() ? : "", syscall_name);
	} else {
		snprintf(func_name, sizeof(func_name), "__se_sys_%s", syscall_name);
	}

	kprobe_opts.retprobe = OPTS_GET(opts, retprobe, false);
	kprobe_opts.bpf_cookie = OPTS_GET(opts, bpf_cookie, 0);

	return bpf_program__attach_kprobe_opts(prog, func_name, &kprobe_opts);
}

cilium/ebpf 如何处理 CONFIG_ARCH_HAS_SYSCALL_WRAPPER

  • cilium/ebpf 没有专门的 ksyscall/ SEC Name 前缀,所以 不支持 SEC("ksyscall/execve") ;
  • 但是 cilium/ebpf 的 Kprobe link 类型支持自动为系统调用名称添加 __x64_ 前缀,所以 syscall 名称一般需要 以kprobe/sys_ 开头
      // https://github.com/cilium/ebpf/blob/02200f5764662c8fd3779172de216e5b93b3fefc/examples/ringbuffer/ringbuffer.c#L20
      SEC("kprobe/__x64_sys_execve") // ARCH 相关的系统调用名称开头,《=== 不建议
      SEC("kprobe/sys_execve") // 以  sys_<syscall name> 开头的名称,会自动添加 ARCH 相关的前缀。 《=== 建议
      int kprobe_execve(struct pt_regs *ctx) {
          // 先从 ctx 中获得保存系统调用各参数的 struct pt_regs * __ctx
          struct pt_regs *__ctx = (struct pt_regs *)PT_REGS_PARM1_CORE(ctx);
          // 也可以使用 libbpf 提供的宏函数 PT_REGS_SYSCALL_REGS 来一步提取:
          // struct pt_regs *__ctx = PT_REGS_SYSCALL_REGS(ctx);
    
          // 然后再使用 PT_REGS_PARM*_CORE 来从 __ctx 提取各系统调用参数。
          read_args.fd = (int)PT_REGS_PARM1_CORE(__ctx);
          char *read_buf = (char *)PT_REGS_PARM2_CORE(__ctx);
      }
    
    
      // With libbpf's BPF_KPROBE_SYSCALL (alias: BPF_KSYSCALL) macro the syscall's
      // parameter list can be declared directly in the handler signature.
      // Note: unlike the BCC version, the first parameter is not struct pt_regs *ctx,
      // but ctx can still be used inside the function body.
      SEC("kprobe/sys_execve")
      int BPF_KPROBE_SYSCALL(execve, int fd, char * read_buf) {
          read_args.fd = fd;
          // read_buf is already the extracted syscall argument; use it directly
          // instead of redeclaring a local with the same name.
          // NOTE(review): assumes read_args has a `buf` member — confirm against its declaration.
          read_args.buf = read_buf;
          return 0;
      }
    

libbpf 宏函数 BPF_PROG/BPF_KPROBE/BPF_KRETPROBE/BPF_KSYSCALL/BPF_KPROBE_SYSCALL 等是宏定义,内部使用 PT_REGS_PARM* 来按需从 struct pt_regs *ctx 中提取各内核函数的参数。这个能力是在 C 预处理器阶段就展开的,所以对于 libbpf/cilium/bcc 而言都是可用的。

// cilium/ebpf 使用 BPF_PROG,参数列表中直接指定内核函数的参数列表。

// https://github.com/cilium/ebpf/blob/02200f5764662c8fd3779172de216e5b93b3fefc/examples/fentry/fentry.c#L77
SEC("fentry/tcp_connect")
int BPF_PROG(tcp_connect, struct sock *sk) {
	if (sk->__sk_common.skc_family != AF_INET) {
		return 0;
	}

	struct event *tcp_info;
	tcp_info = bpf_ringbuf_reserve(&events, sizeof(struct event), 0);
	if (!tcp_info) {
		return 0;
	}

	tcp_info->saddr = sk->__sk_common.skc_rcv_saddr; // 对于 fentry,可以直接解引用指针。
	tcp_info->daddr = sk->__sk_common.skc_daddr;
	tcp_info->dport = sk->__sk_common.skc_dport;
	tcp_info->sport = bpf_htons(sk->__sk_common.skc_num);

	bpf_get_current_comm(&tcp_info->comm, TASK_COMM_LEN);

	bpf_ringbuf_submit(tcp_info, 0);

	return 0;
}

SEC(“prefix/name”) 是与使用的 libbpf/cilium/bcc 等有关系的,有些 prefix 是某个实现专用的。

  1. 例如 libbpf 支持 ksyscall/ 前缀,用于指定 ARCH 无关的系统调用名称, 但是 bcc/cilium 都不支持该前缀.
  2. cilium ebpf 支持的前缀: https://github.com/cilium/ebpf/blob/02200f5764662c8fd3779172de216e5b93b3fefc/elf_reader.go#L1202
// https://github.com/cilium/ebpf/issues/1016#issuecomment-1523843050
SEC("kprobe/sys_read") // 对于 cilium/ebpf 而言,系统调用函数必须以 kprobe/sys_ 开头,
int syscall__probe_entry_read(struct pt_regs *ctx)
{
    u64 id = bpf_get_current_pid_tgid();
    u32 pid = id >> 32;
    if (pid != A_PID)
        return 0;

    //
    struct pt_regs *__ctx = (struct pt_regs *)PT_REGS_PARM1_CORE(ctx);
    if (!__ctx)
    {
        bpf_printk("[sys_read_entry]:failed to load original ctx");
        return 0;
    }

    bpf_printk("[sys_read_entry]:called for [PID:%lu]\n", pid);

    // Stash arguments
    struct data_args_t read_args = {};

    read_args.fd = (int)PT_REGS_PARM1_CORE(__ctx);
    char *read_buf = (char *)PT_REGS_PARM2_CORE(__ctx);

    if (!read_buf)
    {
        bpf_printk("[sys_read_entry]:read buf is null");
        return 0;
    }

    read_args.buf = read_buf;
    bpf_map_update_elem(&active_read_args_map, &id, &read_args, BPF_ANY);
    return 0;
}


// https://github.com/time-river/Linux-eBPF-Learning/commits/63e6cebb9fb25f0aff94486bef888b766b21fe30/2-openat/openat3_kern.c
SEC("kprobe/sys_openat")
int hello(struct pt_regs *ctx) {
	char fmt[] = "@dirfd='%d' @pathname='%s'";
	struct pt_regs *real_regs = (struct pt_regs *)PT_REGS_PARM1(ctx);
	int dirfd = PT_REGS_PARM1_CORE(real_regs);
	char *pathname = (char *)PT_REGS_PARM2_CORE(real_regs);

	bpf_trace_printk(fmt, sizeof(fmt), dirfd, pathname);

	return 0;
}

// https://blog.cloudflare.com/zh-cn/live-patch-security-vulnerabilities-with-ebpf-lsm-zh-cn/
SEC("lsm/cred_prepare")
int BPF_PROG(handle_cred_prepare, struct cred *new, const struct cred *old,
             gfp_t gfp, int ret)
{
    struct pt_regs *regs;
    struct task_struct *task;
    kernel_cap_t caps;
    int syscall;
    unsigned long flags;

    // If previous hooks already denied, go ahead and deny this one
    if (ret) {
        return ret;
    }

    task = bpf_get_current_task_btf();
    regs = (struct pt_regs *) bpf_task_pt_regs(task); // 也是非直接 struct pt_regs *
    // In x86_64 orig_ax has the syscall interrupt stored here
    syscall = regs->orig_ax;
    caps = task->cred->cap_effective;

    // Only process UNSHARE syscall, ignore all others
    if (syscall != UNSHARE_SYSCALL) {
        return 0;
    }

    // PT_REGS_PARM1_CORE pulls the first parameter passed into the unshare syscall
    flags = PT_REGS_PARM1_CORE(regs); // 所以需要使用 PT_REGS_PARM1_CORE 来从 regs 中获取第一参数值。

    // Ignore any unshare that does not have CLONE_NEWUSER
    if (!(flags & CLONE_NEWUSER)) {
        return 0;
    }

    // Allow tasks with CAP_SYS_ADMIN to unshare (already root)
    if (caps.cap[CAP_TO_INDEX(CAP_SYS_ADMIN)] & CAP_TO_MASK(CAP_SYS_ADMIN)) {
        return 0;
    }

    return -EPERM;
}

<bpf/bpf_tracing.h> 头文件中提供的 PT_REGS_SYSCALL_REGS(ctx) 宏函数可以一步完成从系统调用的 kprobe 的 struct pt_regs *ctx 中提取保存系统调用各参数的 struct pt_regs *ctx:

// libbpf/src/bpf_tracing.h

/*
 * When invoked from a syscall handler kprobe, returns a pointer to a
 * struct pt_regs containing syscall arguments and suitable for passing to
 * PT_REGS_PARMn_SYSCALL() and PT_REGS_PARMn_CORE_SYSCALL().
 */
#ifndef PT_REGS_SYSCALL_REGS
/* By default, assume that the arch selects ARCH_HAS_SYSCALL_WRAPPER. */
#define PT_REGS_SYSCALL_REGS(ctx) ((struct pt_regs *)PT_REGS_PARM1(ctx))
#endif

示例二: raw_tracepoint 场景,struct pt_regs * 是 event handler func 参数类型 struct bpf_raw_tracepoint_args *ctx 的 ctx->args[0] 指向,即 struct pt_regs * regs = (struct pt_regs *) ctx->args[0];, 后续对 regs 中各内核参数的提取就必须使用 PT_REGS_PARM[1-5]_CORE(regs) 或 PT_REGS_PARM[1-5]_CORE_SYSCALL(regs)。

// https://www.joyk.com/dig/detail/1653814776687666

/* TP_PROTO(struct pt_regs *regs, long id) 定义了可以通过 bpf_raw_tracepoint_args 的 args 拿到的信息。 id 是系统 */
/* 调用的 id, regs 中包含了对应的系统调用的参数。 可以通过 id 过滤只处理 fchmodat 的系统调用事件(通过命令 */
/* ausyscall fchmodat 找到对应的系统调用 id), */

TRACE_EVENT_FN(sys_enter,
    TP_PROTO(struct pt_regs *regs, long id),
    TP_ARGS(regs, id),
    TP_STRUCT__entry(
        __field(    long,           id              )
        __array(    unsigned long,  args,   6       )
    ),
    TP_fast_assign(
        __entry->id = id;
        syscall_get_arguments(current, regs, __entry->args);
    ),
    TP_printk("NR %ld (%lx, %lx, %lx, %lx, %lx, %lx)",
          __entry->id,
          __entry->args[0], __entry->args[1], __entry->args[2],
          __entry->args[3], __entry->args[4], __entry->args[5]),
    syscall_regfunc, syscall_unregfunc
);

struct bpf_raw_tracepoint_args {
    __u64 args[0];
};

/* fchmodat 这个系统调用的函数定义如下: */
/* : int fchmodat(int dirfd, const char *pathname, mode_t mode, int flags); */

/* 因为 regs 是 pt_regs 类型,所以我们可以通过 PT_REGS_PARM1_CORE(regs) 获取第一个参数的值, */
/* PT_REGS_PARM2_CORE(regs) 获取第二个参数的值, PT_REGS_PARM3_CORE(regs) 获取第三个参数的值,以此类推, 可以通过 */
/* PT_REGS_PARM4_CORE 和 PT_REGS_PARM5_CORE 分别获取 regs 中第四个和第五个参数的值。 */

SEC("raw_tracepoint/sys_enter")
int raw_tracepoint__sys_enter(struct bpf_raw_tracepoint_args *ctx)
{
    unsigned long syscall_id = ctx->args[1];
    if(syscall_id != 268)    // 过滤系统调用 id,只处理 fchmodat 系统调用
        return 0;

    struct pt_regs * regs = (struct pt_regs *) ctx->args[0];

    char pathname[256];
    u32 mode;

    // 读取第二个参数的值
    char *pathname_ptr = (char *) PT_REGS_PARM2_CORE(regs);
    bpf_core_read_user_str(&pathname, sizeof(pathname), pathname_ptr);

    // 读取第三个参数的值
    mode = (u32) PT_REGS_PARM3_CORE(regs);

    char fmt[] = "fchmodat %s %d\n";
    bpf_trace_printk(fmt, sizeof(fmt), &pathname, mode);
    return 0;
}

例子二:

// Initial tail call entry from sys_enter.
// Purpose is to save the syscall info (id, arguments, timestamp) of relevant
// syscalls through the task_info map. Can move to one of:
// 1. sys_enter_submit, general event submit logic from sys_enter
// 2. directly to syscall tail handler in sys_enter_tails
//
// ctx->args[0] is used as the struct pt_regs * of the syscall and ctx->args[1]
// as the syscall id. Always returns 0; a successful bpf_tail_call does not return.
SEC("raw_tracepoint/sys_enter_init")
int sys_enter_init(struct bpf_raw_tracepoint_args *ctx)
{
    struct task_struct *task = (struct task_struct *) bpf_get_current_task();

    u64 pid_tgid = bpf_get_current_pid_tgid();
    u32 tid = pid_tgid; // lower 32 bits
    task_info_t *task_info = bpf_map_lookup_elem(&task_info_map, &tid);
    if (unlikely(task_info == NULL)) {
        // First time this task is seen: create its entry.
        u32 pid = pid_tgid >> 32;
        task_info = init_task_info(tid, pid, NULL);
        if (unlikely(task_info == NULL)) {
            return 0;
        }
    }

    syscall_data_t *sys = &(task_info->syscall_data);
    sys->id = ctx->args[1];

    if (get_kconfig(ARCH_HAS_SYSCALL_WRAPPER)) {
        struct pt_regs *regs = (struct pt_regs *) ctx->args[0];

        if (is_x86_compat(task)) {
#if defined(bpf_target_x86)
            // regs is not the program's own context pointer, so every field
            // has to be fetched with a CO-RE read instead of a direct dereference.
            sys->args.args[0] = BPF_CORE_READ(regs, bx);
            sys->args.args[1] = BPF_CORE_READ(regs, cx);
            sys->args.args[2] = BPF_CORE_READ(regs, dx);
            sys->args.args[3] = BPF_CORE_READ(regs, si);
            sys->args.args[4] = BPF_CORE_READ(regs, di);
            sys->args.args[5] = BPF_CORE_READ(regs, bp);
#endif // bpf_target_x86
        } else {
            sys->args.args[0] = PT_REGS_PARM1_CORE_SYSCALL(regs);
            sys->args.args[1] = PT_REGS_PARM2_CORE_SYSCALL(regs);
            sys->args.args[2] = PT_REGS_PARM3_CORE_SYSCALL(regs);
            sys->args.args[3] = PT_REGS_PARM4_CORE_SYSCALL(regs);
            sys->args.args[4] = PT_REGS_PARM5_CORE_SYSCALL(regs);
            sys->args.args[5] = PT_REGS_PARM6_CORE_SYSCALL(regs);
        }
    } else {
        // Copy all six argument slots. The previous size expression,
        // sizeof(6 * sizeof(u64)), is the size of a size_t and therefore
        // copied only the first slot.
        // NOTE(review): the source is still ctx->args as in the original — confirm
        // it really holds six syscall arguments on kernels without the wrapper.
        bpf_probe_read(sys->args.args, 6 * sizeof(u64), (void *) ctx->args);
    }

    if (is_compat(task)) {
        // Translate 32bit syscalls to 64bit syscalls, so we can send to the correct handler
        u32 *id_64 = bpf_map_lookup_elem(&sys_32_to_64_map, &sys->id);
        if (id_64 == 0)
            return 0;

        sys->id = *id_64;
    }

    // exit, exit_group and rt_sigreturn syscalls don't return
    if (sys->id != SYSCALL_EXIT && sys->id != SYSCALL_EXIT_GROUP &&
        sys->id != SYSCALL_RT_SIGRETURN) {
        sys->ts = bpf_ktime_get_ns();
        task_info->syscall_traced = true;
    }

    // if id is irrelevant continue to next tail call
    bpf_tail_call(ctx, &sys_enter_submit_tail, sys->id);

    // call syscall handler, if exists
    bpf_tail_call(ctx, &sys_enter_tails, sys->id);
    return 0;
}

4 基于 btf 信息的 CO-RE event handler 函数宏
#

PR: https://patchwork.ozlabs.org/project/netdev/patch/[email protected]/

该文件定义了基于内核 btf 信息(可以是外部加载提供)的 CO-RE event handler 函数宏:

  • BPF_K[RET]PROBE(name, args…):

    • 适用于任意内核函数,包括系统调用函数。
    • libbpf/cilium 一致:SEC("kprobe/XX"), SEC("kretprobe/XX");
      • 系统调用:SEC("kprobe/__x64_sys_execve"), 后面的名称必须是以 ARCH 为前缀的系统调用函数名称。
  • BPF_KPROBE_SYSCALL 或 BPF_KSYSCALL(name, args…):

    • 只适用于系统调用函数。
    • libbpf:SEC("ksyscall/execve"),只需要系统调用名,不需要 ARCH 相关的前缀;
    • cilium: SEC("kprobe/sys_execve"),即系统调用名称前必须有 sys_ 前缀;
  • BPF_PROG(name, args…):name(unsigned long long *ctx)

    • 适用于 tp_btf/fentry/fexit/lsm 等以 u64 数组(unsigned long long *ctx)作为上下文的程序类型。

这些宏函数用于替代使用 struct pt_regs *ctx 的 k[ret]probe/u[ret]probe/syscall 的 event handler,主要的优势是可以直接指定内核函数或系统调用的 参数列表 ,而不需要自己再从 struct pt_regs * ctx 中提取。

  • 对于系统调用,自己提取会复杂些,因为要根据是否开启 CONFIG_ARCH_HAS_SYSCALL_WRAPPER 来做不同的提取方式:
    • 未开启:直接使用 PT_REGS_PARM[1-5]* 来从 ctx 提取;
    • 开启:需要先使用 PT_REGS_PARM1(ctx) 来获得新的 struct pt_regs * ctx,再从它中提取各内核参数。
  • 虽然宏函数参数列表中没有原始的 struct pt_regs *ctx, 但是在实际函数内部还是可以使用变量 ctx。

宏函数定义:

// https://github.com/libbpf/libbpf/blob/1728e3e4bef0e138ea95ffe62163eb9a6ac6fa32/src/bpf_tracing.h#L666

/*
 * BPF_PROG is a convenience wrapper for generic tp_btf/fentry/fexit and
 * similar kinds of BPF programs, that accept input arguments as a single
 * pointer to untyped u64 array, where each u64 can actually be a typed
 * pointer or integer of different size. Instead of requiring user to write
 * manual casts and work with array elements by index, BPF_PROG macro
 * allows user to declare a list of named and typed input arguments in the
 * same syntax as for normal C function. All the casting is hidden and
 * performed transparently, while user code can just assume working with
 * function arguments of specified type and name.
 *
 * Original raw context argument is preserved as well as 'ctx' argument.
 * This is useful when using BPF helpers that expect original context
 * as one of the parameters (e.g., for bpf_perf_event_output()).
 *
 * Expansion outline: forward-declares name(ctx), declares the always-inline
 * worker ____name(ctx, args...), defines name(ctx) so that it calls the
 * worker with ___bpf_ctx_cast(args) while -Wint-conversion is suppressed,
 * and finally opens the worker's definition, so the `{ ... }` body written
 * after the macro invocation becomes the body of ____name.
 */
#define BPF_PROG(name, args...)						    \
name(unsigned long long *ctx);						    \
static __always_inline typeof(name(0))					    \
____##name(unsigned long long *ctx, ##args);				    \
typeof(name(0)) name(unsigned long long *ctx)				    \
{									    \
	_Pragma("GCC diagnostic push")					    \
	_Pragma("GCC diagnostic ignored \"-Wint-conversion\"")		    \
	return ____##name(___bpf_ctx_cast(args));			    \
	_Pragma("GCC diagnostic pop")					    \
}									    \
static __always_inline typeof(name(0))					    \
____##name(unsigned long long *ctx, ##args)

/*
 * BPF_KPROBE serves the same purpose for kprobes as BPF_PROG for
 * tp_btf/fentry/fexit BPF programs. It hides the underlying platform-specific
 * low-level way of getting kprobe input arguments from struct pt_regs, and
 * provides a familiar typed and named function arguments syntax and
 * semantics of accessing kprobe input parameters.
 *
 * Original struct pt_regs* context is preserved as 'ctx' argument. This might
 * be necessary when using BPF helpers like bpf_perf_event_output().
 *
 * Expansion outline: name(struct pt_regs *ctx) is the entry point; it calls
 * the always-inline worker ____name(ctx, args...) with the argument list
 * produced by ___bpf_kprobe_args(args), with -Wint-conversion suppressed.
 * The macro ends on the worker's signature, so the body written after the
 * macro invocation becomes the body of ____name.
 */
#define BPF_KPROBE(name, args...)					    \
name(struct pt_regs *ctx);						    \
static __always_inline typeof(name(0))					    \
____##name(struct pt_regs *ctx, ##args);				    \
typeof(name(0)) name(struct pt_regs *ctx)				    \
{									    \
	_Pragma("GCC diagnostic push")					    \
	_Pragma("GCC diagnostic ignored \"-Wint-conversion\"")		    \
	return ____##name(___bpf_kprobe_args(args));			    \
	_Pragma("GCC diagnostic pop")					    \
}									    \
static __always_inline typeof(name(0))					    \
____##name(struct pt_regs *ctx, ##args)


/*
 * BPF_KRETPROBE is similar to BPF_KPROBE, except, it only provides optional
 * return value (in addition to `struct pt_regs *ctx`), but no input
 * arguments, because they will be clobbered by the time probed function
 * returns.
 *
 * The expansion has the same shape as BPF_KPROBE; the only difference visible
 * here is that the worker ____name is called with ___bpf_kretprobe_args(args).
 */
#define BPF_KRETPROBE(name, args...)					    \
name(struct pt_regs *ctx);						    \
static __always_inline typeof(name(0))					    \
____##name(struct pt_regs *ctx, ##args);				    \
typeof(name(0)) name(struct pt_regs *ctx)				    \
{									    \
	_Pragma("GCC diagnostic push")					    \
	_Pragma("GCC diagnostic ignored \"-Wint-conversion\"")		    \
	return ____##name(___bpf_kretprobe_args(args));			    \
	_Pragma("GCC diagnostic pop")					    \
}									    \
static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args)


/*
 * BPF_KSYSCALL is a variant of BPF_KPROBE, which is intended for
 * tracing syscall functions, like __x64_sys_close. It hides the underlying
 * platform-specific low-level way of getting syscall input arguments from
 * struct pt_regs, and provides a familiar typed and named function arguments
 * syntax and semantics of accessing syscall input parameters.
 *
 * Original struct pt_regs * context is preserved as 'ctx' argument. This might
 * be necessary when using BPF helpers like bpf_perf_event_output().
 *
 * At the moment BPF_KSYSCALL does not transparently handle all the calling
 * convention quirks for the following syscalls:
 *
 * - mmap(): __ARCH_WANT_SYS_OLD_MMAP.
 * - clone(): CONFIG_CLONE_BACKWARDS, CONFIG_CLONE_BACKWARDS2 and
 *            CONFIG_CLONE_BACKWARDS3.
 * - socket-related syscalls: __ARCH_WANT_SYS_SOCKETCALL.
 * - compat syscalls.
 *
 * This may or may not change in the future. User needs to take extra measures
 * to handle such quirks explicitly, if necessary.
 *
 * This macro relies on BPF CO-RE support and virtual __kconfig externs.
 *
 * Expansion outline: declares the extern LINUX_HAS_SYSCALL_WRAPPER __kconfig
 * flag. In the generated entry point, a local `regs` is set to
 * (struct pt_regs *)PT_REGS_PARM1(ctx) when the flag is true and to ctx
 * otherwise; the worker ____name is then called with ___bpf_syswrap_args(args)
 * or ___bpf_syscall_args(args) respectively. `regs` is not referenced directly
 * in the text below -- presumably the two argument helper macros consume it;
 * confirm against their definitions.
 */
#define BPF_KSYSCALL(name, args...)					    \
name(struct pt_regs *ctx);						    \
extern _Bool LINUX_HAS_SYSCALL_WRAPPER __kconfig;			    \
static __always_inline typeof(name(0))					    \
____##name(struct pt_regs *ctx, ##args);				    \
typeof(name(0)) name(struct pt_regs *ctx)				    \
{									    \
	struct pt_regs *regs = LINUX_HAS_SYSCALL_WRAPPER		    \
			       ? (struct pt_regs *)PT_REGS_PARM1(ctx)	    \
			       : ctx;					    \
	_Pragma("GCC diagnostic push")					    \
	_Pragma("GCC diagnostic ignored \"-Wint-conversion\"")		    \
	if (LINUX_HAS_SYSCALL_WRAPPER)					    \
		return ____##name(___bpf_syswrap_args(args));		    \
	else								    \
		return ____##name(___bpf_syscall_args(args));		    \
	_Pragma("GCC diagnostic pop")					    \
}									    \
static __always_inline typeof(name(0))					    \
____##name(struct pt_regs *ctx, ##args)

/* BPF_KPROBE_SYSCALL is an alias that expands to BPF_KSYSCALL. */
#define BPF_KPROBE_SYSCALL BPF_KSYSCALL

/* BPF_UPROBE and BPF_URETPROBE are identical to BPF_KPROBE and BPF_KRETPROBE,
 * but are named way less confusingly for SEC("uprobe") and SEC("uretprobe")
 * use cases. Each one simply forwards name and args to its kprobe counterpart.
 */
#define BPF_UPROBE(name, args...)  BPF_KPROBE(name, ##args)
#define BPF_URETPROBE(name, args...)  BPF_KRETPROBE(name, ##args)

示例:

int do_execve(
	struct filename *filename,
	const char __user *const __user *__argv,
	const char __user *const __user *__envp)
// do_execve 为内核函数名称(非系统调用名称)。
SEC("kprobe/do_execve")
int BPF_KPROBE(kprobe_do_execve, struct filename *filename) { // 内核函数参数列表
}
// kretprobe handler: the only declared argument is the probed function's return value.
SEC("kretprobe/do_unlinkat")
int BPF_KRETPROBE(do_unlinkat_exit, long ret) {
	// Handler logic goes here; an int-returning BPF program must return a value.
	return 0;
}

// __x64_sys_close is the syscall function name; the handler is declared with
// the BPF_KPROBE_SYSCALL / BPF_KSYSCALL macro.
SEC("kprobe/__x64_sys_close")
int BPF_KPROBE_SYSCALL(do_sys_close, int fd) // parameter list of the syscall
{
	// Handler logic goes here; an int-returning BPF program must return a value.
	return 0;
}

// libbpf flavour: "execve" is the bare syscall name.
SEC("ksyscall/execve") // with cilium this has to be SEC("kprobe/sys_execve")
int BPF_KPROBE_SYSCALL(hello, const char *pathname) // equivalent to: int BPF_KSYSCALL(hello, const char *pathname)
{
   struct data_t evt = {};
   struct user_msg_t *custom;

   // uid is the low 32 bits of uid_gid, pid the high 32 bits of pid_tgid.
   evt.uid = bpf_get_current_uid_gid() & 0xFFFFFFFF;
   evt.pid = bpf_get_current_pid_tgid() >> 32;

   bpf_probe_read_user_str(&evt.path, sizeof(evt.path), pathname);
   bpf_get_current_comm(&evt.command, sizeof(evt.command));

   // Use the per-uid message from my_config when one exists, otherwise
   // fall back to the global default message.
   custom = bpf_map_lookup_elem(&my_config, &evt.uid);
   if (custom == 0) {
      bpf_probe_read_kernel_str(&evt.message, sizeof(evt.message), message);
   } else {
      bpf_probe_read_kernel_str(&evt.message, sizeof(evt.message), custom->message);
   }

   // ctx is not declared in the macro's argument list, yet it is still
   // available inside the handler body.
   bpf_perf_event_output(ctx, &output, BPF_F_CURRENT_CPU, &evt, sizeof(evt));
   return 0;
}

// BPF_PROG is meant for fentry/fexit/lsm style programs, i.e. cases that rely
// on the kernel's built-in BTF support (stated by the original note to be
// available from kernel 5.2).
SEC("fentry/do_execve")
int BPF_PROG(fentry_execve, struct filename *filename) {
	// Handler logic goes here; an int-returning BPF program must return a value.
	return 0;
}
// fexit handler: declares the probed function's arguments followed by its return value.
SEC("fexit/do_unlinkat")
int BPF_PROG(do_unlinkat_exit, int dfd, struct filename *name, long ret) {
	// Handler logic goes here; an int-returning BPF program must return a value.
	return 0;
}
// LSM hook handler for path_chmod: logs the inline name of the dentry behind
// `path` and returns 0.
SEC("lsm/path_chmod")
int BPF_PROG(path_chmod, const struct path *path, umode_t mode) {
	const struct dentry *target = path->dentry;

	bpf_printk("Change mode of file name %s\n", target->d_iname);
	return 0;
}

// https://docs.kernel.org/bpf/prog_lsm.html
// Signature of the file_mprotect LSM hook, shown for reference.
int file_mprotect(struct vm_area_struct *vma, unsigned long reqprot, unsigned long prot);
SEC("lsm/file_mprotect")
int BPF_PROG(mprotect_audit, struct vm_area_struct *vma,
             unsigned long reqprot, unsigned long prot, int ret)
{
        /* ret is the return value from the previous BPF program
         * or 0 if it's the first hook.
         */
        if (ret != 0)
                return ret;

        int is_heap;

        /* LSM programs can dereference these pointers directly. */
        is_heap = (vma->vm_start >= vma->vm_mm->start_brk &&
                   vma->vm_end <= vma->vm_mm->brk);

        /* Return an -EPERM or write information to the perf events buffer
         * for auditing
         */
        if (is_heap)
                return -EPERM;

        /* Not a heap mapping: return 0 explicitly instead of falling off
         * the end of a non-void function.
         */
        return 0;
}

具体例子参考:

  1. https://github.com/aquasecurity/tracee/blob/main/pkg/ebpf/c/tracee.bpf.c
  2. https://github.com/lizrice/learning-ebpf/blob/main/chapter7/hello.bpf.c

相关文章

libbpf skeleton 用户空间程序分析
·15745 字
Ebpf
libbpf skeleton 用户空间程序分析
perf_event_open() 系统调用分析
·2476 字
Ebpf
perf_event_open() 系统调用分析
Linux 内核追踪和 eBPF 介绍
··8393 字
Ebpf Ebpf

eBPF 是当今热门的底层技术,在网络、安全、可观测性、云原生等场景得到广泛应用。

本文档先介绍 Linux 内核的各种追踪技术,让大家对于各种事件源、内核各种追踪框架、用户工具等有个初步了解,然后介绍 eBPF 的发展历程、开发和执行流程、开发框架选择和 Demo 示例,希望对于想了解 Linux 内核追踪和 eBPF 技术的同学有所帮助。

不安全:unsafe
··824 字
Rust
Rust