kprobe/uprobe/tracepoint/perf_event 以及 eBPF 的 PERF_EVENT_ARRAY Maps 均使用了 perf_event_open 系统调用。但是:
- raw_tracepoint/raw_tp: 使用 bpf 系统调用 BPF_RAW_TRACEPOINT_OPEN 命令来创建一个 pfd;
- trace 类, 如 tp_btf/fentry/fexit/fmod_ret/freplace: 使用 bpf 系统调用 BPF_LINK_CREATE 命令来创建一个 pfd,如果内核不支持(如 4.19)则 fallback 到 BPF_RAW_TRACEPOINT_OPEN 命令。
perf_event_open 通过 type 来区分各种类型:
- PERF_TYPE_HARDWARE: SEC(“perf_event”) CPU/Core 采样使用的类型(见后文);
- PERF_TYPE_SOFTWARE: 后续用户空间读取 eBPF perf_event map 时打开的 perf event buff 使用的类型(见后文)
- PERF_TYPE_TRACEPOINT: tracepoint 使用的 type;
- 其他:kprobe:6,uprobe:7(动态 PMU 类型,以 /sys/bus/event_source/devices/<pmu>/type 的值为准)
- 注意:raw_tracepoint/raw_tp, 以及各种 trace 类不使用 perf event 机制,所以没有对应的 type。
// https://github.com/libbpf/libbpf/blob/f7eb43b90f4c8882edf6354f8585094f8f3aade0/include/uapi/linux/perf_event.h#L29
/*
* attr.type
*
* Built-in perf event types placed in the `type` field of the attr passed to
* perf_event_open. The explicit numeric values run from 0 to 5; PERF_TYPE_MAX
* is marked non-ABI and only follows the last built-in type.
* NOTE(review): PMU types such as kprobe/uprobe are not listed here; their
* numeric type is reported under /sys/bus/event_source/devices/<pmu>/type.
*/
enum perf_type_id {
PERF_TYPE_HARDWARE = 0,
PERF_TYPE_SOFTWARE = 1,
PERF_TYPE_TRACEPOINT = 2,
PERF_TYPE_HW_CACHE = 3,
PERF_TYPE_RAW = 4,
PERF_TYPE_BREAKPOINT = 5,
PERF_TYPE_MAX, /* non-ABI */
};
查看:
#head /sys/bus/event_source/devices/*/type
==> /sys/bus/event_source/devices/breakpoint/type <==
5
==> /sys/bus/event_source/devices/cpu/type <==
4
==> /sys/bus/event_source/devices/kprobe/type <== // kprobe
6
==> /sys/bus/event_source/devices/msr/type <==
9
==> /sys/bus/event_source/devices/power/type <==
10
==> /sys/bus/event_source/devices/software/type <==
1
==> /sys/bus/event_source/devices/tracepoint/type <== // tracepoint
2
==> /sys/bus/event_source/devices/uprobe/type <== // uprobe
7
...
perf_event_open 返回的 pfd,一般需要在 attach 到 bpf program 后内核才开始处理指定 event:
- 一般是 libbpf 各种 attach_XX() 中的 bpf_program__attach_perf_event_opts() 来实现的;
- 先使用 ioctl(pfd, PERF_EVENT_IOC_SET_BPF, prog_fd) 将 bpf program 关联到该 perf event(内核支持 perf link 时则改用 bpf_link_create());
- 再使用 ioctl(pfd, PERF_EVENT_IOC_ENABLE, 0) 来启用该 perf event,开始实际处理 event;
link = bpf_program__attach_perf_event_opts(prog, pfd, &pe_opts);
// https://github.com/libbpf/libbpf/blob/f7eb43b90f4c8882edf6354f8585094f8f3aade0/src/libbpf.c#L9985
/*
 * Attach a loaded BPF program to an already opened perf event fd and enable
 * the event.
 *
 * prog: BPF program to attach; it must have a valid fd (i.e. be loaded)
 * pfd:  perf event fd; it is stored in the link as perf_event_fd
 * opts: optional settings read via OPTS_GET (force_ioctl_attach, bpf_cookie)
 *
 * Returns a pointer to the embedded struct bpf_link on success. Returns
 * libbpf_err_ptr(-EINVAL) when the program has no fd and
 * libbpf_err_ptr(-ENOMEM) when the link allocation fails. All other failures
 * jump to err_out, which is elided from this excerpt.
 */
struct bpf_link *bpf_program__attach_perf_event_opts(const struct bpf_program *prog, int pfd,
const struct bpf_perf_event_opts *opts)
{
char errmsg[STRERR_BUFSIZE];
struct bpf_link_perf *link;
int prog_fd, link_fd = -1, err;
bool force_ioctl_attach;
// ... (elided in this excerpt)
// Fetch the fd of the BPF program; a negative value means it is unusable
prog_fd = bpf_program__fd(prog);
if (prog_fd < 0) {
pr_warn("prog '%s': can't attach BPF program w/o FD (did you load it?)\n",
prog->name);
return libbpf_err_ptr(-EINVAL);
}
link = calloc(1, sizeof(*link));
if (!link)
return libbpf_err_ptr(-ENOMEM);
link->link.detach = &bpf_link_perf_detach;
link->link.dealloc = &bpf_link_perf_dealloc;
link->perf_event_fd = pfd;
force_ioctl_attach = OPTS_GET(opts, force_ioctl_attach, false);
if (kernel_supports(prog->obj, FEAT_PERF_LINK) && !force_ioctl_attach) {
// Perf link path. Older kernels (e.g. 4.19.91) do not support PERF_LINK
// and take the ioctl-based branch below instead.
DECLARE_LIBBPF_OPTS(bpf_link_create_opts, link_opts,
.perf_event.bpf_cookie = OPTS_GET(opts, bpf_cookie, 0));
link_fd = bpf_link_create(prog_fd, pfd, BPF_PERF_EVENT, &link_opts);
if (link_fd < 0) {
err = -errno;
pr_warn("prog '%s': failed to create BPF link for perf_event FD %d: %d (%s)\n",
prog->name, pfd,
err, libbpf_strerror_r(err, errmsg, sizeof(errmsg)));
goto err_out;
}
link->link.fd = link_fd;
} else {
// A non-zero bpf_cookie is rejected on the ioctl path
if (OPTS_GET(opts, bpf_cookie, 0)) {
pr_warn("prog '%s': user context value is not supported\n", prog->name);
err = -EOPNOTSUPP;
goto err_out;
}
// Attach the BPF program to the perf event through the legacy ioctl
if (ioctl(pfd, PERF_EVENT_IOC_SET_BPF, prog_fd) < 0) {
err = -errno;
pr_warn("prog '%s': failed to attach to perf_event FD %d: %s\n",
prog->name, pfd, libbpf_strerror_r(err, errmsg, sizeof(errmsg)));
if (err == -EPROTO)
pr_warn("prog '%s': try add PERF_SAMPLE_CALLCHAIN to or remove exclude_callchain_[kernel|user] from pfd %d\n",
prog->name, pfd);
goto err_out;
}
// Without a separate link fd, the perf event fd doubles as the link fd
link->link.fd = pfd;
}
// Enable the perf event (done on both the link and the ioctl path)
if (ioctl(pfd, PERF_EVENT_IOC_ENABLE, 0) < 0) {
err = -errno;
pr_warn("prog '%s': failed to enable perf_event FD %d: %s\n",
prog->name, pfd, libbpf_strerror_r(err, errmsg, sizeof(errmsg)));
goto err_out;
}
return &link->link;
// ... (remainder elided in this excerpt, including the err_out label)
}
用户空间程序再 pull 各种 Maps 数据:
- Array/Hash 等 Map 类型:
- Perf Buffer 等 Map 类型:
对于 Perf Buffer Maps 用户空间需要创建一个 perf buffer 来 pull 数据。
// Create a user-space perf buffer on top of the `output` perf event array map;
// handle_event / lost_event are the callbacks handed to libbpf.
pb = perf_buffer__new(bpf_map__fd(skel->maps.output), 8, handle_event, lost_event, NULL, NULL);
if (!pb) {
	err = -1;
	// The failing call is perf_buffer__new(), so report a perf buffer
	// (the message previously said "ring buffer").
	fprintf(stderr, "Failed to create perf buffer\n");
	hello_verifier_bpf__destroy(skel);
	return 1;
}

// Poll the perf buffer until interrupted or until polling fails.
while (true) {
	err = perf_buffer__poll(pb, 100 /* timeout, ms */);
	// Ctrl-C gives -EINTR; treat it as a clean exit
	if (err == -EINTR) {
		err = 0;
		break;
	}
	if (err < 0) {
		printf("Error polling perf buffer: %d\n", err);
		break;
	}
}
1 kprobe/uprobe/tracepoint #
uprobe/kprobe 用户空间进程均使用 perf_event_open_probe,tracepoint/tp 用户空间进程均使用 perf_event_open_tracepoint, 这两个函数内部调用的是 perf_event_open,但是 type,attr 和 config 均不相同
perf_event_open_probe:
- type:6: kprobe, 7: uprobe;
- config1 和 config2:config1 传递 kprobe 的 func name 或 uprobe 的 binary path,config2 传递 kprobe 的 addr 或 uprobe 的 offset
perf_event_open_tracepoint:
- type: 固定为 tracepoint 2;
- config:传递 tracepoint 的 category/name 对应的 ID (来源于 /path/to/tracefs/events/<category>/<name>/id 文件)
tracefs 的两种类型:
- debugfs: /sys/kernel/debug/tracing
- tracefs:/sys/kernel/tracing
注:系统如果没有 mount tracefs,需要手动挂载:mount -t tracefs nodev /sys/kernel/tracing
# 对于 4.19 内核而言,两个文件系统均存在,内容一致:
#ls /sys/kernel/tracing/events/
alarmtimer dma_fence ftrace irq mei page_isolation regmap smbus tlb xhci-hcd
block drm header_event irq_matrix migrate pagemap resctrl sock ucsi
bridge enable header_page irq_vectors module percpu rpcrdma sunrpc udp
cgroup exceptions huge_memory jbd2 mpx power rpm swiotlb vmscan
clk ext4 hyperv kmem msr printk rseq syscalls vsyscall
cma fib i2c kvm napi qdisc rtc target workqueue
compaction fib6 initcall kvmmmu net random sched task writeback
context_tracking filelock iocost libata nmi ras scsi tcp x86_fpu
cpuhp filemap iommu mce nvme raw_syscalls signal thermal xdp
devlink fs_dax io_uring mdio oom rcu skb timer xen
#ls /sys/kernel/debug/tracing/events/
alarmtimer dma_fence ftrace irq mei page_isolation regmap smbus tlb xhci-hcd
block drm header_event irq_matrix migrate pagemap resctrl sock ucsi
bridge enable header_page irq_vectors module percpu rpcrdma sunrpc udp
cgroup exceptions huge_memory jbd2 mpx power rpm swiotlb vmscan
clk ext4 hyperv kmem msr printk rseq syscalls vsyscall
cma fib i2c kvm napi qdisc rtc target workqueue
compaction fib6 initcall kvmmmu net random sched task writeback
context_tracking filelock iocost libata nmi ras scsi tcp x86_fpu
cpuhp filemap iommu mce nvme raw_syscalls signal thermal xdp
devlink fs_dax io_uring mdio oom rcu skb timer xen
# 查看 tracepoint syscalls/sys_enter_bind 的 ID
#cat /sys/kernel/tracing/events/syscalls/sys_enter_bind/id
1181
perf_event_open_probe 和 perf_event_open_tracepoint:
// https://github.com/libbpf/libbpf/blob/f7eb43b90f4c8882edf6354f8585094f8f3aade0/src/libbpf.c#L10128
/*
 * Open a perf event for a kprobe or uprobe through the perf_event_open syscall.
 *
 * uprobe:      true to open a uprobe, false to open a kprobe
 * retprobe:    true to set the PMU-specific retprobe bit in attr.config
 * name:        kprobe function name or uprobe binary path (goes to attr.config1)
 * offset:      kprobe address or uprobe offset (goes to attr.config2)
 * pid:         target pid; any negative value is passed to the syscall as -1,
 *              and pid == -1 selects cpu 0 instead of cpu -1
 * ref_ctr_off: reference counter offset, encoded into the upper bits of
 *              attr.config; rejected with -EINVAL if it does not fit
 *
 * Returns the perf event fd on success or a negative error code on failure.
 */
static int perf_event_open_probe(bool uprobe, bool retprobe, const char *name,
				 uint64_t offset, int pid, size_t ref_ctr_off)
{
	const size_t attr_sz = sizeof(struct perf_event_attr);
	struct perf_event_attr attr;
	char errmsg[STRERR_BUFSIZE];
	int type, pfd;

	if (ref_ctr_off >= (1ULL << PERF_UPROBE_REF_CTR_OFFSET_BITS))
		return -EINVAL;

	memset(&attr, 0, attr_sz);

	type = uprobe ? determine_uprobe_perf_type()
		      : determine_kprobe_perf_type();
	if (type < 0) {
		pr_warn("failed to determine %s perf type: %s\n",
			uprobe ? "uprobe" : "kprobe",
			libbpf_strerror_r(type, errmsg, sizeof(errmsg)));
		return type;
	}
	if (retprobe) {
		int bit = uprobe ? determine_uprobe_retprobe_bit()
				 : determine_kprobe_retprobe_bit();

		if (bit < 0) {
			pr_warn("failed to determine %s retprobe bit: %s\n",
				uprobe ? "uprobe" : "kprobe",
				libbpf_strerror_r(bit, errmsg, sizeof(errmsg)));
			return bit;
		}
		/*
		 * Shift in 64 bits: attr.config is 64-bit wide, and shifting a
		 * plain int by 31 or more is undefined / sign-extends when
		 * OR-ed into the wider field.
		 */
		attr.config |= (__u64)1 << bit;
	}
	attr.size = attr_sz;
	/*
	 * PMU type returned by determine_{k,u}probe_perf_type(), e.g. 6 for
	 * kprobe and 7 for uprobe in the sysfs listing shown in this document.
	 */
	attr.type = type;
	/* config carries the ref counter offset (plus the retprobe bit above) */
	attr.config |= (__u64)ref_ctr_off << PERF_UPROBE_REF_CTR_OFFSET_SHIFT;
	/* config1/config2 identify what to probe */
	attr.config1 = ptr_to_u64(name); /* kprobe_func or uprobe_path */
	attr.config2 = offset;		 /* kprobe_addr or probe_offset */

	/* finally issue the perf_event_open syscall */
	/* pid filter is meaningful only for uprobes */
	pfd = syscall(__NR_perf_event_open, &attr,
		      pid < 0 ? -1 : pid /* pid */,
		      pid == -1 ? 0 : -1 /* cpu */,
		      -1 /* group_fd */, PERF_FLAG_FD_CLOEXEC);
	return pfd >= 0 ? pfd : -errno;
}
// https://github.com/libbpf/libbpf/blob/f7eb43b90f4c8882edf6354f8585094f8f3aade0/src/libbpf.c#L11440
/*
 * Open a perf event for the tracepoint <tp_category>/<tp_name>.
 *
 * The tracepoint ID obtained from determine_tracepoint_id() is passed in
 * attr.config, and attr.type is always PERF_TYPE_TRACEPOINT.
 *
 * Returns the perf event fd on success or a negative error code on failure.
 */
static int perf_event_open_tracepoint(const char *tp_category,
				      const char *tp_name)
{
	const size_t attr_sz = sizeof(struct perf_event_attr);
	struct perf_event_attr attr;
	char errmsg[STRERR_BUFSIZE];
	int tp_id, pfd, err;

	/* Resolve category/name to the numeric tracepoint ID first */
	tp_id = determine_tracepoint_id(tp_category, tp_name);
	if (tp_id < 0) {
		pr_warn("failed to determine tracepoint '%s/%s' perf event ID: %s\n",
			tp_category, tp_name,
			libbpf_strerror_r(tp_id, errmsg, sizeof(errmsg)));
		return tp_id;
	}

	memset(&attr, 0, attr_sz);
	attr.size = attr_sz;
	/* the type is fixed: PERF_TYPE_TRACEPOINT */
	attr.type = PERF_TYPE_TRACEPOINT;
	/* config carries the ID resolved for category/name */
	attr.config = tp_id;

	pfd = syscall(__NR_perf_event_open, &attr, -1 /* pid */, 0 /* cpu */,
		      -1 /* group_fd */, PERF_FLAG_FD_CLOEXEC);
	if (pfd >= 0)
		return pfd;

	/* Save errno before logging so the reported code is the syscall's */
	err = -errno;
	pr_warn("tracepoint '%s/%s' perf_event_open() failed: %s\n",
		tp_category, tp_name,
		libbpf_strerror_r(err, errmsg, sizeof(errmsg)));
	return err;
}
2 SEC(“perf_event”) #
kprobe/uprobe/tracepoint/trace 是记录每次调用特定 func 或 tracepoint 的 event(通过 SEC name 来表示, 或者后续attach 来指定), 也是使用 perf event 机制(调用 syscall perf_event_open) ,但是不需要采样,而是每次执行到该func 或 tracepoint 时自动产生一个event,进而执行 SEC 修饰的 event handler 函数。
而 perf_event 是在每一次函数调用都会执行的, 可以获取每一次 CPU 执行函数的寄存器信息, 进而可以获取任意进程调用stack 的信息, 不管是 kernel stack 还是 user space stack, 所以可以用于实现任意内核函数或用户空间代码的profiling, 生成火焰图.
获得每一次函数调用的 event 的数据量和性能影响较大, 所以一般用户空间在 perf_event_open() 时的 attr 中要指定采样频率和周期, 这样按照指定采样频率来调用 SEC(“perf_event”) 修饰的 event handler 函数。
用户空间代码调用 perf_event_open 传递的 attr 参数如下:
- attr.type: 指定为 PERF_TYPE_HARDWARE,表示 CPU/Core 采样;
- attr.freq = 1 表示按频率采样,此时由 attr.sample_freq 指定采样频率(attr.freq = 0 时则由 attr.sample_period 指定采样周期);
/*
 * Open one hardware CPU-cycles sampling event per CPU and attach the
 * `profile` BPF program to each of them, then poll the ring buffer.
 * This is a fragment: `cleanup`, `online_mask`, `pefds`, `links` and
 * `ring_buf` are defined outside the excerpt.
 */
memset(&attr, 0, sizeof(attr));
attr.type = PERF_TYPE_HARDWARE; // performance monitoring on a CPU/Core
attr.size = sizeof(attr);
attr.config = PERF_COUNT_HW_CPU_CYCLES;
attr.sample_freq = freq; // sampling frequency
attr.freq = 1; // flag: sample_freq is a frequency, not a sample period (see perf_event_open(2))
for (cpu = 0; cpu < num_cpus; cpu++) {
/* skip offline/not present CPUs */
if (cpu >= num_online_cpus || !online_mask[cpu])
continue;
/* Set up performance monitoring on a CPU/Core */
pefd = perf_event_open(&attr, pid, cpu, -1, PERF_FLAG_FD_CLOEXEC);
if (pefd < 0) {
fprintf(stderr, "Fail to set up performance monitor on a CPU/Core\n");
err = -1;
goto cleanup;
}
/* Remember the fd per CPU (presumably so cleanup can close it; verify) */
pefds[cpu] = pefd;
/* Attach a BPF program on a CPU */
links[cpu] = bpf_program__attach_perf_event(skel->progs.profile, pefd);
if (!links[cpu]) {
err = -1;
goto cleanup;
}
}
/* Wait and receive stack traces */
/* Loops until ring_buffer__poll() returns a negative value; -1 is the timeout argument */
while (ring_buffer__poll(ring_buf, -1) >= 0) {
}
具体代码参考:
- libbpf/libbpf-bootstrap 的 profile.c
- kernel bpf example sampleip: sampleip_user.c
3 读取 PERF_EVENT_ARRAY Map #
PERF_EVENT_ARRAY Map 是一个 per-CPU 的 ring buffer 类型, load eBPF Program 时,libbpf 创建一个 perf buffer event;
在 libbpf load kernel Program 时,调用 ebpf 的 BPF_MAP_CREATE 来创建 Maps,Maps 不需要 Attach,但是后续需要在用户空间为该 maps 打开一个 perf buffer,然后进行 pull 读取数据。
eBPF kernel Program 使用 bpf_perf_event_output() 来发送 event,用户空间使用 perf_event_open() 系统调用来创建一个读取该 maps event 的 perf buff,attr 要求如下:
- 构造后续创建 perf buffer 的 perf_event_open 系统调用的 attr(注意与前面各种 attach_XX() 中的区别):
- attr.config = PERF_COUNT_SW_BPF_OUTPUT;
- attr.type = PERF_TYPE_SOFTWARE;
- attr.sample_type = PERF_SAMPLE_RAW;