一、Native crash发生
当程序出现以下情况时,会触发native crash:
1)程序自己调用 abort() 函数触发,用于表示出现严重的错误或异常情况,需要终止程序执行
2)内存对齐错误或非法地址访问
3)零除错误(除数为零),浮点溢出或下溢出等
4)使用了非法的机器指令或指令参数不当而导致
5)进程试图访问不允许访问的内存地址,例如访问已释放的内存,或者栈溢出等
6)协处理器栈错误或FPU错误
7)进程试图执行未定义或不支持的系统调用
8)程序执行到设置了断点的特定位置(会触发SIGTRAP信号)
当程序出现以上情况时,Linux kernel会向该进程发送对应的异常信号,由进程注册的信号处理程序进行处理。
二、Native crash日志收集
Native异常发生时,CPU通过中断的形式触发异常处理流程。Linux kernel处理该中断后,统一转换成信号发送给进程,由应用进程注册的信号处理函数进行处理。所有的so都需要通过linker加载,而Android用户空间的linker程序会默认注册信号处理函数。
2.1 注册异常信号
// bionic/linker/linker_main.cpp
// Excerpt of linker_main (omitted parts are marked "......").
// The portion quoted here shows only one step: installing the debuggerd
// signal handler by calling linker_debuggerd_init().
static ElfW(Addr) linker_main(KernelArgumentBlock& args, const char* exe_to_load) {
......
// Register the debuggerd signal handler.
linker_debuggerd_init();
......
}
// bionic/linker/linker_debuggerd_android.cpp
// Builds the debuggerd callback table and passes it to debuggerd_init().
void linker_debuggerd_init() {
// There may be a version mismatch between the bootstrap linker and the crash_dump in the APEX,
// so don't pass in any process info from the bootstrap linker.
debuggerd_callbacks_t callbacks = {
// The process-info and GWP-ASan callbacks are only provided when __ANDROID_APEX__ is defined.
#if defined(__ANDROID_APEX__)
.get_process_info = get_process_info,
.get_gwp_asan_callbacks = get_gwp_asan_callbacks,
#endif
// The post-dump callback is always provided.
.post_dump = notify_gdb_of_libraries,
};
// Call debuggerd_init() (according to the article, provided by the
// libdebuggerd_handler_fallback library) with the callback table.
debuggerd_init(&callbacks);
}
// Prepares the crash-handling machinery for this process:
//   1. remembers the optional callbacks,
//   2. allocates a dedicated stack (stored in pseudothread_stack),
//   3. builds a sigaction that routes to debuggerd_signal_handler and registers it
//      through debuggerd_register_handlers().
//
// callbacks: optional; when non-null its contents are copied into g_callbacks.
void debuggerd_init(debuggerd_callbacks_t* callbacks) {
if (callbacks) {
g_callbacks = *callbacks;
}
size_t thread_stack_pages = 8;
// Reserve thread_stack_pages + 2 pages with no access permissions at all.
void* thread_stack_allocation = mmap(nullptr, PAGE_SIZE * (thread_stack_pages + 2), PROT_NONE,
MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
if (thread_stack_allocation == MAP_FAILED) {
fatal_errno("failed to allocate debuggerd thread stack");
}
// Skip the first page and make only the middle thread_stack_pages pages read/write;
// the first and last page of the mapping therefore stay PROT_NONE.
char* stack = static_cast<char*>(thread_stack_allocation) + PAGE_SIZE;
if (mprotect(stack, PAGE_SIZE * thread_stack_pages, PROT_READ | PROT_WRITE) != 0) {
fatal_errno("failed to mprotect debuggerd thread stack");
}
// Stack grows negatively, set it to the last byte in the page...
stack = (stack + thread_stack_pages * PAGE_SIZE - 1);
// and align it.
// (last byte - 15 == end of the writable region - 16)
stack -= 15;
// Saved for later use as the stack argument of clone() in debuggerd_signal_handler.
pseudothread_stack = stack;
struct sigaction action;
memset(&action, 0, sizeof(action));
// Fill the handler's signal mask with every signal.
sigfillset(&action.sa_mask);
action.sa_sigaction = debuggerd_signal_handler; // the handler function to register
action.sa_flags = SA_RESTART | SA_SIGINFO;
// Use the alternate signal stack if available so we can catch stack overflows.
action.sa_flags |= SA_ONSTACK;
#define SA_EXPOSE_TAGBITS 0x00000800
// Request that the kernel set tag bits in the fault address. This is necessary for diagnosing MTE
// faults.
action.sa_flags |= SA_EXPOSE_TAGBITS;
// Register the fatal signals with debuggerd_signal_handler as their handler.
debuggerd_register_handlers(&action);
}
// system/core/debuggerd/include/debuggerd/handler.h
// Installs `action` as the disposition for the signals handled by debuggerd.
//
// The eight fatal signals (SIGABRT, SIGBUS, SIGFPE, SIGILL, SIGSEGV, SIGSTKFLT,
// SIGSYS, SIGTRAP) are hooked unless, in an ANDROID_DEBUGGABLE build, the system
// property "debug.debuggerd.disable" is set to "1". BIONIC_SIGNAL_DEBUGGER is
// hooked unconditionally.
//
// action: the fully populated sigaction to install for every signal.
static void __attribute__((__unused__)) debuggerd_register_handlers(struct sigaction* action) {
  bool hook_fatal_signals = true;
#if ANDROID_DEBUGGABLE
  // Debuggable builds can opt out of the fatal-signal handlers through a property.
  char prop_value[PROP_VALUE_MAX] = "";
  const bool opted_out = __system_property_get("debug.debuggerd.disable", prop_value) > 0 &&
                         strcmp(prop_value, "1") == 0;
  hook_fatal_signals = !opted_out;
#endif

  if (hook_fatal_signals) {
    // Same signals, same registration order as before.
    constexpr int kFatalSignals[] = {
        SIGABRT, SIGBUS, SIGFPE, SIGILL, SIGSEGV, SIGSTKFLT, SIGSYS, SIGTRAP,
    };
    for (const int signo : kFatalSignals) {
      sigaction(signo, action, nullptr);
    }
  }

  // The debugger signal is registered regardless of the opt-out above.
  sigaction(BIONIC_SIGNAL_DEBUGGER, action, nullptr);
}
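通过sigaction方法注册接收的信号有:SIGABRT、SIGBUS、SIGFPE、SIGILL、SIGSEGV、SIGSTKFLT、SIGSYS、SIGTRAP、BIONIC_SIGNAL_DEBUGGER(旧版本代码中名为DEBUGGER_SIGNAL),共计9个信号。其中前8个信号仅在enabled为true时注册(在ANDROID_DEBUGGABLE构建中,若属性debug.debuggerd.disable的值为"1",则不注册),BIONIC_SIGNAL_DEBUGGER则始终注册。当进程收到这些信号时,回调debuggerd_signal_handler方法。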
| 序号 | 异常信号 | 说明 |
| --- | --- | --- |
| 1 | SIGABRT | 当程序调用abort函数时产生的异常 |
| 2 | SIGBUS | 总线错误异常,如内存对齐错误或非法地址访问 |
| 3 | SIGFPE | 浮点异常 |
| 4 | SIGILL | 非法指令异常,如使用了非法的机器指令或指令参数 |
| 5 | SIGSEGV | 内存段错误,如进程试图访问不允许访问的内存地址,例如访问已释放的内存,或者栈溢出等 |
| 6 | SIGSTKFLT | 协处理器堆栈错误,如协处理器栈错误或FPU错误 |
| 7 | SIGSYS | 非法系统调用异常,如进程试图执行未定义或不支持的系统调用 |
| 8 | SIGTRAP | 断点跟踪异常,如程序的特定位置设置断点时会触发SIGTRAP信号 |
| 9 | BIONIC_SIGNAL_DEBUGGER(旧版本代码中名为DEBUGGER_SIGNAL) | 在Android Bionic运行时使用的调试器相关信号 |
2.2 异常信号处理
// Signal handler installed by debuggerd_init() (excerpt; omitted parts are marked ".......").
// The portion quoted here logs a one-line summary of the signal and then clones a
// pseudothread that runs debuggerd_dispatch_pseudothread().
//
// signal_number: the signal being delivered.
// info:          siginfo for the signal; passed to log_signal_summary().
// context:       the third sigaction-handler argument; its use is not visible in this excerpt.
static void debuggerd_signal_handler(int signal_number, siginfo_t* info, void* context) {
// Make sure we don't change the value of errno, in case a signal comes in between the process
// making a syscall and checking errno.
ErrnoRestorer restorer;
.......
// Log the "Fatal signal" line, e.g.
// "F libc : Fatal signal 11 (SIGSEGV), code 1 (SEGV_MAPERR), fault addr 0x74 in tid 2992"
log_signal_summary(info);
.......
// Essentially pthread_create without CLONE_FILES, so we still work during file descriptor
// exhaustion.
// Clone a pseudothread (CLONE_THREAD | CLONE_VM: it belongs to the same thread group and
// shares the address space of the crashing process) that runs
// debuggerd_dispatch_pseudothread on the stack prepared in debuggerd_init().
pid_t child_pid =
clone(debuggerd_dispatch_pseudothread, pseudothread_stack,
CLONE_THREAD | CLONE_SIGHAND | CLONE_VM | CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID,
&thread_info, nullptr, nullptr, &thread_info.pseudothread_tid);
.......
}
debuggerd_dispatch_pseudothread方法主要是执行execle启动crash_dump64进程。
// Entry point of the pseudothread cloned by debuggerd_signal_handler (excerpt; omitted
// parts are marked "......"). The portion quoted here exec's the crash_dump helper.
//
// arg: pointer to the debugger_thread_info passed through clone().
static int debuggerd_dispatch_pseudothread(void* arg) {
debugger_thread_info* thread_info = static_cast<debugger_thread_info*>(arg);
......
// Replace the current process image with the crash_dump helper, passing the main tid,
// the pseudothread tid and the dump type as arguments and an empty environment.
execle(CRASH_DUMP_PATH, CRASH_DUMP_NAME, main_tid, pseudothread_tid, debuggerd_dump_type,
nullptr, nullptr);
// Only reached if execle() failed, because a successful exec does not return.
async_safe_format_log(ANDROID_LOG_FATAL, "libc", "failed to exec crash_dump helper: %s",
strerror(errno));
return 1;
}
// NOTE(review): the brace above closes a block that is opened in the omitted part of the
// excerpt; the brace below closes the function.
......
}
crash_dump64进程主要作用:
1)解析传入的参数,包括tid、目标进程名,并调用getProcessTids方法获取目标进程的所有线程id集合。
2)通过ptrace读取crash进程中所有线程的寄存器信息,最终汇总所有的异常信息,包括机型版本、abi、信号、寄存器、backtrace等,输出到log中。
crash_dump进程会fork出一个新进程,父进程通过read去等待子进程的结果,而子进程继续执行crash_dump的任务。
// crash_dump entry point (heavily abridged excerpt; omitted parts are marked "......").
// The quoted portions show: forking, argument parsing, interrupting each thread with
// ptrace and collecting its state, and finally handing the result to tombstoned and to
// the activity manager.
// NOTE(review): the `continue` statements below belong to a per-thread loop whose header
// is not part of the excerpt.
int main(int argc, char** argv) {
......
// 1. Fork a child process.
pid_t forkpid = fork();
// Parse the command-line arguments.
ParseArgs(argc, argv, &pseudothread_tid, &dump_type);
......
// 2. Use ptrace to interrupt each thread of the target process in turn.
if (!ptrace_interrupt(thread, &info.signo)) {
PLOG(WARNING) << "failed to ptrace interrupt thread " << thread;
ptrace(PTRACE_DETACH, thread, 0, 0);
continue;
}
// Read the tagged-address control word; -1 marks it as unavailable.
struct iovec tagged_addr_iov = {
&info.tagged_addr_ctrl,
sizeof(info.tagged_addr_ctrl),
};
if (ptrace(PTRACE_GETREGSET, thread, NT_ARM_TAGGED_ADDR_CTRL,
reinterpret_cast<void*>(&tagged_addr_iov)) == -1) {
info.tagged_addr_ctrl = -1;
}
// Read the enabled pointer-authentication keys; -1 marks them as unavailable.
struct iovec pac_enabled_keys_iov = {
&info.pac_enabled_keys,
sizeof(info.pac_enabled_keys),
};
if (ptrace(PTRACE_GETREGSET, thread, NT_ARM_PAC_ENABLED_KEYS,
reinterpret_cast<void*>(&pac_enabled_keys_iov)) == -1) {
info.pac_enabled_keys = -1;
}
// 3. Collect registers and the other crash information for this thread.
if (thread == g_target_thread) {
// Read the thread's registers along with the rest of the crash info out of the pipe.
ReadCrashInfo(input_pipe, &siginfo, &info.registers, &process_info,
&recoverable_gwp_asan_crash);
info.siginfo = &siginfo;
info.signo = info.siginfo->si_signo;
info.command_line = get_command_line(g_target_thread);
} else {
// Any other thread: fetch its registers remotely; skip the thread if that fails.
info.registers.reset(unwindstack::Regs::RemoteGet(thread));
if (!info.registers) {
PLOG(WARNING) << "failed to fetch registers for thread " << thread;
ptrace(PTRACE_DETACH, thread, 0, 0);
continue;
}
}
// Store the collected information in thread_info, indexed by thread id.
thread_info[thread] = std::move(info);
.....
// Connect to the tombstoned process, which (according to the article) saves the dump
// under the /data/tombstones directory.
g_tombstoned_connected = connect_tombstone_server(g_target_thread, &g_tombstoned_socket,
&g_output_fd, &g_proto_fd, dump_type);
// Notify the system_server process, which (according to the article) saves the report
// under the /data/system/dropbox directory.
if (fatal_signal) {
// Don't try to notify ActivityManager if it just crashed, or we might hang until timeout.
if (thread_info[target_process].thread_name != "system_server") {
activity_manager_notify(target_process, signo, amfd_data, recoverable_gwp_asan_crash);
}
}
}
通过Socket通知tombstoned进程(系统常驻进程),传输异常信息,由tombstoned进程将所有异常信息输出到/data/tombstones/tombstone_xx文件中。
通过Socket通知system_server进程,传输异常信息,由system_server进程的dropbox服务将所有异常信息输出到/data/system/dropbox目录下的文件中。
三、Native crash问题分析
分析流程:
1)从tombstone日志中查询出现异常signal类型及调用栈函数名
2)如果堆栈中看不到带函数名的调用信息,则需要找到对应版本带符号表的so,用addr2line工具解析调用栈中出错的pc地址,定位出错的函数;或者从logcat日志中查找是否打印了出错的函数调用栈
3)定位到具体的异常函数后,结合源码分析crash的原因
tombstone日志:文章来源:https://www.toymoban.com/news/detail-791837.html
文章来源地址https://www.toymoban.com/news/detail-791837.html
到了这里,关于Android Native crash触发原理及处理的文章就介绍完了。如果您还想了解更多内容,请在右上角搜索TOY模板网以前的文章或继续浏览下面的相关文章,希望大家以后多多支持TOY模板网!