Android Native crash触发原理及处理

这篇具有很好参考价值的文章主要介绍了Android Native crash触发原理及处理。希望对大家有所帮助。如果存在错误或未考虑完全的地方，请大家不吝赐教，您也可以点击"举报违法"按钮提交疑问。

一、Native crash发生

当程序执行以下操作，会触发native crash：

1）程序自己调用 abort() 函数触发，用于表示出现严重的错误或异常情况，需要终止程序执行

2）内存对齐错误或非法地址访问

3）除零错误（除数为零）、浮点溢出或下溢等算术运算错误

4）使用了非法的机器指令或指令参数不当而导致

5）进程试图访问不允许访问的内存地址，例如访问已释放的内存，或者栈溢出等

6）协处理器栈错误或FPU错误

7）进程试图执行未定义或不支持的系统调用

8）程序的特定位置设置断点时会触发SIGTRAP信号

当程序发生以上情况时，Linux kernel会向该进程发送相应的异常信号，由进程中已注册的信号处理函数进行处理。

二、Native crash日志收集

Native异常发生的时候，CPU通过中断的形式触发异常处理流程。Linux kernel处理该中断，并将其统一转换成信号发送给对应进程，由进程中已注册的信号处理函数进行处理。所有的so都需要通过linker加载，Android用户空间的linker程序会默认注册信号处理函数。

Android Native crash触发原理及处理,stability,android

2.1 注册异常信号

// bionic/linker/linker_main.cpp
// Excerpt of the linker's main routine; "......" marks code elided by the article.
// The only step shown is the call that installs the debuggerd crash signal handler.
static ElfW(Addr) linker_main(KernelArgumentBlock& args, const char* exe_to_load) {
        ......
        // Register the debuggerd signal handler.
        linker_debuggerd_init();
        ......
}

// bionic/linker/linker_debuggerd_android.cpp
// Builds the debuggerd callback table and hands it to debuggerd_init().
// post_dump is always set; get_process_info and get_gwp_asan_callbacks are only
// filled in when __ANDROID_APEX__ is defined.
void linker_debuggerd_init() {
  // There may be a version mismatch between the bootstrap linker and the crash_dump in the APEX,
  // so don't pass in any process info from the bootstrap linker.
  debuggerd_callbacks_t callbacks = {
#if defined(__ANDROID_APEX__)
    .get_process_info = get_process_info,
    .get_gwp_asan_callbacks = get_gwp_asan_callbacks,
#endif
    .post_dump = notify_gdb_of_libraries,
  };
  // Call debuggerd_init() from the libdebuggerd_handler_fallback library.
  debuggerd_init(&callbacks);
}

// Prepares crash handling for the process:
//   1. copies the optional callback table into g_callbacks,
//   2. allocates the stack later used by the cloned pseudothread,
//   3. fills in a sigaction that points at debuggerd_signal_handler and installs it
//      through debuggerd_register_handlers().
// callbacks may be null, in which case g_callbacks is left untouched.
void debuggerd_init(debuggerd_callbacks_t* callbacks) {
  if (callbacks) {
    g_callbacks = *callbacks;
  }

  // Reserve thread_stack_pages + 2 pages with no access rights; only the middle
  // thread_stack_pages pages are made usable below, so one inaccessible page
  // remains on each side of the stack.
  size_t thread_stack_pages = 8;
  void* thread_stack_allocation = mmap(nullptr, PAGE_SIZE * (thread_stack_pages + 2), PROT_NONE,
                                       MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
  if (thread_stack_allocation == MAP_FAILED) {
    fatal_errno("failed to allocate debuggerd thread stack");
  }

  // Skip the first (inaccessible) page and make the following pages readable/writable.
  char* stack = static_cast<char*>(thread_stack_allocation) + PAGE_SIZE;
  if (mprotect(stack, PAGE_SIZE * thread_stack_pages, PROT_READ | PROT_WRITE) != 0) {
    fatal_errno("failed to mprotect debuggerd thread stack");
  }

  // Stack grows negatively, set it to the last byte in the page...
  stack = (stack + thread_stack_pages * PAGE_SIZE - 1);
  // and align it.
  // (-1 above plus -15 here leaves the pointer 16 bytes below the end of the writable region.)
  stack -= 15;
  pseudothread_stack = stack;

  struct sigaction action;
  memset(&action, 0, sizeof(action));
  // Fill sa_mask with every signal, so they are all blocked while the handler runs.
  sigfillset(&action.sa_mask);
  action.sa_sigaction = debuggerd_signal_handler;    // The handler function to be registered.
  action.sa_flags = SA_RESTART | SA_SIGINFO;

  // Use the alternate signal stack if available so we can catch stack overflows.
  action.sa_flags |= SA_ONSTACK;

#define SA_EXPOSE_TAGBITS 0x00000800
  // Request that the kernel set tag bits in the fault address. This is necessary for diagnosing MTE
  // faults.
  action.sa_flags |= SA_EXPOSE_TAGBITS;
  // Install debuggerd_signal_handler (via `action`) for the crash signals.
  debuggerd_register_handlers(&action);
}

// system/core/debuggerd/include/debuggerd/handler.h
// Installs `action` as the disposition for the crash-related signals.
//
// The eight fatal signals (SIGABRT, SIGBUS, SIGFPE, SIGILL, SIGSEGV, SIGSTKFLT, SIGSYS,
// SIGTRAP) are registered only when `enabled` is true. When ANDROID_DEBUGGABLE is non-zero,
// `enabled` becomes false if the system property "debug.debuggerd.disable" is set to "1".
// BIONIC_SIGNAL_DEBUGGER is registered unconditionally.
//
// The return values of the sigaction() calls are not checked.
static void __attribute__((__unused__)) debuggerd_register_handlers(struct sigaction* action) {
  bool enabled = true;
#if ANDROID_DEBUGGABLE
  char value[PROP_VALUE_MAX] = "";
  // Disabled only when the property exists (length > 0) and its value is exactly "1".
  enabled = !(__system_property_get("debug.debuggerd.disable", value) > 0 && !strcmp(value, "1"));
#endif
  if (enabled) {
    sigaction(SIGABRT, action, nullptr);
    sigaction(SIGBUS, action, nullptr);
    sigaction(SIGFPE, action, nullptr);
    sigaction(SIGILL, action, nullptr);
    sigaction(SIGSEGV, action, nullptr);
    sigaction(SIGSTKFLT, action, nullptr);
    sigaction(SIGSYS, action, nullptr);
    sigaction(SIGTRAP, action, nullptr);
  }

  // Registered regardless of `enabled`.
  sigaction(BIONIC_SIGNAL_DEBUGGER, action, nullptr);
}

通过sigaction方法注册接收的信号共9个：SIGABRT、SIGBUS、SIGFPE、SIGILL、SIGSEGV、SIGSTKFLT、SIGSYS、SIGTRAP（这8个仅在enabled为true时注册）以及BIONIC_SIGNAL_DEBUGGER（始终注册）。当kernel向进程发出这些信号时，会回调debuggerd_signal_handler方法。

	异常信号	说明
1	SIGABRT	当程序调用abort函数时产生的异常
2	SIGBUS	总线错误异常，如内存对齐错误或非法地址访问
3	SIGFPE	算术运算异常，如除零错误、浮点溢出或下溢等
4	SIGILL	非法指令异常，如使用了非法的机器指令或指令参数
5	SIGSEGV	内存段错误，如进程试图访问不允许访问的内存地址，例如访问已释放的内存，或者栈溢出等
6	SIGSTKFLT	协处理器栈错误，如FPU栈错误
7	SIGSYS	非法系统调用异常，如进程试图执行未定义或不支持的系统调用
8	SIGTRAP	断点跟踪异常，如程序的特定位置设置断点时会触发SIGTRAP信号
9	BIONIC_SIGNAL_DEBUGGER	在Android Bionic运行时使用的调试器相关信号（即上文代码中注册的BIONIC_SIGNAL_DEBUGGER）

2.2 异常信号处理

// Signal handler installed by debuggerd_init() (excerpt; "......." marks elided code).
// The visible steps are: preserve errno, log a one-line summary of the signal, then
// clone a pseudothread that runs debuggerd_dispatch_pseudothread on pseudothread_stack.
static void debuggerd_signal_handler(int signal_number, siginfo_t* info, void* context) {
  // Make sure we don't change the value of errno, in case a signal comes in between the process
  // making a syscall and checking errno.
  ErrnoRestorer restorer;

    .......
  // Log the "Fatal signal" summary line, e.g.
  // "F libc : Fatal signal 11 (SIGSEGV), code 1 (SEGV_MAPERR), fault addr 0x74 in tid 2992"
  log_signal_summary(info);

    .......

  // Essentially pthread_create without CLONE_FILES, so we still work during file descriptor
  // exhaustion.
  // Because of CLONE_THREAD | CLONE_VM this creates a thread inside the crashing process
  // (not a separate child process); it starts in debuggerd_dispatch_pseudothread with
  // &thread_info as its argument.
  pid_t child_pid =
    clone(debuggerd_dispatch_pseudothread, pseudothread_stack,
          CLONE_THREAD | CLONE_SIGHAND | CLONE_VM | CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID,
          &thread_info, nullptr, nullptr, &thread_info.pseudothread_tid);
  .......
}

debuggerd_dispatch_pseudothread方法主要是执行execle启动crash_dump64进程。

// Entry point of the pseudothread cloned by debuggerd_signal_handler (excerpt; "......"
// marks elided code). `arg` is the debugger_thread_info passed to clone().
// The visible part execs the crash_dump helper with the main tid, the pseudothread tid
// and the dump type as arguments.
static int debuggerd_dispatch_pseudothread(void* arg) {
  debugger_thread_info* thread_info = static_cast<debugger_thread_info*>(arg);

  ......

    // NOTE(review): the block that encloses these lines opens in the elided code above;
    // presumably it is the child side of a fork — confirm against the full source.
    execle(CRASH_DUMP_PATH, CRASH_DUMP_NAME, main_tid, pseudothread_tid, debuggerd_dump_type,
           nullptr, nullptr);
    // execle() only returns on failure, so reaching this point means the exec failed.
    async_safe_format_log(ANDROID_LOG_FATAL, "libc", "failed to exec crash_dump helper: %s",
                          strerror(errno));
    return 1;
  }

 ......
}

crash_dump64进程主要作用：

1）解析传入的参数，包括目标线程tid（main_tid）、pseudothread_tid以及dump类型（dump_type），并调用getProcessTids方法获取目标进程的所有线程id集合。

2）通过ptrace读取crash进程中所有线程的寄存器信息，最终汇总所有的异常信息，包括机型版本、abi、信号、寄存器、backtrace等，输出到log中。

crash_dump进程会fork出一个新进程，父进程通过read去等待子进程的结果，而子进程继续执行crash_dump的任务。

// Entry point of the crash_dump helper (heavily abridged excerpt; "......" marks elided
// code, including the loop header that iterates over `thread`).
// Visible flow: fork, parse arguments, ptrace each thread and gather its register state,
// then hand the result to tombstoned and, for fatal signals, to ActivityManager.
int main(int argc, char** argv) {
   ......
   // 1. Fork a child process.
   pid_t forkpid = fork();
    // Parse the command-line arguments into pseudothread_tid and dump_type.
  ParseArgs(argc, argv, &pseudothread_tid, &dump_type);

......
        // 2. For each thread of the target process: interrupt it with ptrace.
        //    On failure, detach and move on to the next thread.
      if (!ptrace_interrupt(thread, &info.signo)) {
        PLOG(WARNING) << "failed to ptrace interrupt thread " << thread;
        ptrace(PTRACE_DETACH, thread, 0, 0);
        continue;
      }

      // Read the NT_ARM_TAGGED_ADDR_CTRL register set; fall back to -1 if the request fails.
      struct iovec tagged_addr_iov = {
          &info.tagged_addr_ctrl,
          sizeof(info.tagged_addr_ctrl),
      };
      if (ptrace(PTRACE_GETREGSET, thread, NT_ARM_TAGGED_ADDR_CTRL,
                 reinterpret_cast<void*>(&tagged_addr_iov)) == -1) {
        info.tagged_addr_ctrl = -1;
      }

      // Read the NT_ARM_PAC_ENABLED_KEYS register set; fall back to -1 if the request fails.
      struct iovec pac_enabled_keys_iov = {
          &info.pac_enabled_keys,
          sizeof(info.pac_enabled_keys),
      };
      if (ptrace(PTRACE_GETREGSET, thread, NT_ARM_PAC_ENABLED_KEYS,
                 reinterpret_cast<void*>(&pac_enabled_keys_iov)) == -1) {
        info.pac_enabled_keys = -1;
      }
        // 3. Collect registers and related crash info. The target (crashing) thread's data
        //    is read from the pipe; other threads' registers are fetched remotely.
      if (thread == g_target_thread) {
        // Read the thread's registers along with the rest of the crash info out of the pipe.
        ReadCrashInfo(input_pipe, &siginfo, &info.registers, &process_info,
                      &recoverable_gwp_asan_crash);
        info.siginfo = &siginfo;
        info.signo = info.siginfo->si_signo;

        info.command_line = get_command_line(g_target_thread);
      } else {
        info.registers.reset(unwindstack::Regs::RemoteGet(thread));
        if (!info.registers) {
          PLOG(WARNING) << "failed to fetch registers for thread " << thread;
          ptrace(PTRACE_DETACH, thread, 0, 0);
          continue;
        }
      }
      // Store the collected info in thread_info, indexed by thread.
      thread_info[thread] = std::move(info);
      .....
      // Connect to the tombstoned process, which saves the dump under /data/tombstones.
      g_tombstoned_connected = connect_tombstone_server(g_target_thread, &g_tombstoned_socket,
                                                      &g_output_fd, &g_proto_fd, dump_type);
      
    // Notify system_server (ActivityManager), which saves the report under /data/system/dropbox.
    if (fatal_signal) {
        // Don't try to notify ActivityManager if it just crashed, or we might hang until timeout.
        if (thread_info[target_process].thread_name != "system_server") {
          activity_manager_notify(target_process, signo, amfd_data, recoverable_gwp_asan_crash);
        }
    }
}

通过Socket通知tombstoned进程（系统常驻进程），传输异常信息，由tombstoned进程将所有异常信息输出到/data/tombstones/tombstone_xx文件中。

通过Socket通知system_server进程，传输异常信息，由system_server进程的dropbox服务将所有异常信息输出到/data/system/dropbox目录下的文件中。