COMMAND

    kernel

SYSTEMS AFFECTED

    Linux 2.2.1

PROBLEM

    Richard Kail seems to found  anoter /proc bug in the  2.2.1  linux
    kernel.  An example:

	modprobe coda.o     # will load coda.o as dynamically module
			    # the kernel will create a /proc/fs/coda directory
			    # on the fly.

	cd /proc/fs/coda    # make shell chdir("/proc/fs/coda")

	rmmod coda.o        # works ! (shouldn't)

	ls -l               # generate a oops.

    after this, the  coda.o module seems  not to work  at all.   Maybe
    this is a systematic problem of more than one driver.  A user  can
    trigger a  "modprobe module"  command using  the kerneld  (oh, its
    called kmod now...)  mechanism.  A  user can chdir  /proc/fs/coda.
    If  the  system  has  implemented  the  cron  job  as suggested in
    linux/Documentation/kmod.txt  the  user  has  just  to  wait  with
    wd=/proc/fs/code to crash the system.

SOLUTION

    Here the bugfix against 2.2.1:

    Index: array.c
    ===================================================================
    RCS file: /var/cvs/linux/fs/proc/array.c,v
    retrieving revision 1.1.1.5
    diff -u -r1.1.1.5 array.c
    --- array.c 1999/01/29 14:50:53     1.1.1.5
    +++ linux/fs/proc/array.c   1999/02/04 13:59:26
    @@ -386,21 +386,57 @@
	    return sprintf(buffer, "%s\n", saved_command_line);
     }

    -static unsigned long get_phys_addr(struct task_struct * p, unsigned long ptr)
    +/*
    + * Caller must release_mm the mm_struct later.
    + * You don't get any access to init_mm.
    + */
    +static struct mm_struct * grab_mm(int pid)
    +{
    +       struct mm_struct * mm;
    +       struct task_struct * tsk;
    +
    +       if (current->pid == pid)
    +               return current->mm;
    +
    +       mm = NULL;
    +       read_lock(&tasklist_lock);
    +       tsk = find_task_by_pid(pid);
    +       /*
    +        * NOTE: this doesn't race because we are protected by the
    +        * big kernel lock. -arca
    +        */
    +       if (tsk && tsk->mm && tsk->mm != &init_mm)
    +               mmget(mm = tsk->mm);
    +       read_unlock(&tasklist_lock);
    +       if (mm)
    +               down(&mm->mmap_sem);
    +       return mm;
    +}
    +
    +static void release_mm(struct mm_struct *mm)
     {
    +       if (current->mm != mm)
    +       {
    +               up(&mm->mmap_sem);
    +               mmput(mm);
    +       }
    +}
    +     
    +static unsigned long get_phys_addr(struct mm_struct *mm, unsigned long ptr)
    +{
	    pgd_t *page_dir;
	    pmd_t *page_middle;
	    pte_t pte;

    -       if (!p || !p->mm || ptr >= TASK_SIZE)
    +       if (ptr >= TASK_SIZE)
		    return 0;
	    /* Check for NULL pgd .. shouldn't happen! */
    -       if (!p->mm->pgd) {
    -               printk("get_phys_addr: pid %d has NULL pgd!\n", p->pid);
    +       if (!mm->pgd) {
    +               printk(KERN_DEBUG "missing pgd for mm %p\n", mm);
		    return 0;
	    }

    -       page_dir = pgd_offset(p->mm,ptr);
    +       page_dir = pgd_offset(mm,ptr);
	    if (pgd_none(*page_dir))
		    return 0;
	    if (pgd_bad(*page_dir)) {
    @@ -422,7 +458,7 @@
	    return pte_page(pte) + (ptr & ~PAGE_MASK);
     }

    -static int get_array(struct task_struct *p, unsigned long start, unsigned long end, char * buffer)
    +static int get_array(struct mm_struct *mm, unsigned long start, unsigned long end, char * buffer)
     {
	    unsigned long addr;
	    int size = 0, result = 0;
    @@ -431,7 +467,7 @@
	    if (start >= end)
		    return result;
	    for (;;) {
    -               addr = get_phys_addr(p, start);
    +               addr = get_phys_addr(mm, start);
		    if (!addr)
			    return result;
		    do {
    @@ -453,27 +489,28 @@

     static int get_env(int pid, char * buffer)
     {
    -       struct task_struct *p;
    -
    -       read_lock(&tasklist_lock);
    -       p = find_task_by_pid(pid);
    -       read_unlock(&tasklist_lock);    /* FIXME!! This should be done after the last use */
    +       struct mm_struct *mm;
    +       int res = 0;

    -       if (!p || !p->mm)
    -               return 0;
    -       return get_array(p, p->mm->env_start, p->mm->env_end, buffer);
    +       mm = grab_mm(pid);
    +       if (mm) {
    +               res = get_array(mm, mm->env_start, mm->env_end, buffer);
    +               release_mm(mm);
    +       }
    +       return res;
     }

     static int get_arg(int pid, char * buffer)
     {
    -       struct task_struct *p;
    +       struct mm_struct *mm;
    +       int res = 0;

    -       read_lock(&tasklist_lock);
    -       p = find_task_by_pid(pid);
    -       read_unlock(&tasklist_lock);    /* FIXME!! This should be done after the last use */
    -       if (!p || !p->mm)
    -               return 0;
    -       return get_array(p, p->mm->arg_start, p->mm->arg_end, buffer);
    +       mm = grab_mm(pid);
    +       if (mm) {
    +               res = get_array(mm, mm->arg_start, mm->arg_end, buffer);
    +               release_mm(mm);
    +       }
    +       return res;
     }

     /*
    @@ -722,12 +759,10 @@
	    return buffer;
     }

    -static inline char * task_mem(struct task_struct *p, char *buffer)
    +static inline char * task_mem(struct mm_struct * mm, char *buffer)
     {
    -       struct mm_struct * mm = p->mm;
    -
    -       if (mm && mm != &init_mm) {
    -               struct vm_area_struct * vma = mm->mmap;
    +       if (mm) {
    +               struct vm_area_struct * vma;
		    unsigned long data = 0, stack = 0;
		    unsigned long exec = 0, lib = 0;

    @@ -817,47 +852,96 @@
				cap_t(p->cap_effective));
     }

    +static struct task_struct *grab_task(int pid, struct mm_struct ** mm)
    +{
    +       struct task_struct *tsk = current;
    +
    +       if (current->pid == pid)
    +       {
    +               *mm = current->mm;
    +               return current;
    +       }
    +
    +       *mm = NULL;
    +       read_lock(&tasklist_lock);
    +       tsk = find_task_by_pid(pid);
    +       if (tsk)
    +       {
    +               struct mm_struct * __mm;
    +               struct page * page = mem_map + MAP_NR(tsk);
    +               atomic_inc(&page->count);
    +               /*
    +                * NOTE: this doesn't race because we are protected
    +                * by the big kernel lock. -arca
    +                */
    +               __mm = tsk->mm;
    +               if (__mm && __mm != &init_mm)
    +               {
    +                       mmget(__mm);
    +                       *mm = __mm;
    +               }
    +       }
    +       read_unlock(&tasklist_lock);
    +       if (*mm)
    +               down(&(*mm)->mmap_sem);
    +
    +       return tsk;
    +}
    +
    +static void release_task(struct task_struct *tsk, struct mm_struct * mm)
    +{
    +       if (current != tsk)
    +       {
    +               if (mm)
    +               {
    +                       up(&mm->mmap_sem);
    +                       mmput(mm);
    +               }
    +               free_pages((unsigned long) tsk, 1);
    +       }
    +}

     static int get_status(int pid, char * buffer)
     {
	    char * orig = buffer;
	    struct task_struct *tsk;
    -
    -       read_lock(&tasklist_lock);
    -       tsk = find_task_by_pid(pid);
    -       read_unlock(&tasklist_lock);    /* FIXME!! This should be done after the last use */
    +       struct mm_struct * mm;
    +
    +       tsk = grab_task(pid, &mm);
	    if (!tsk)
		    return 0;
	    buffer = task_name(tsk, buffer);
	    buffer = task_state(tsk, buffer);
    -       buffer = task_mem(tsk, buffer);
    +       buffer = task_mem(mm, buffer);
	    buffer = task_sig(tsk, buffer);
	    buffer = task_cap(tsk, buffer);
    +       release_task(tsk, mm);
	    return buffer - orig;
     }

     static int get_stat(int pid, char * buffer)
     {
	    struct task_struct *tsk;
    +       struct mm_struct * mm;
	    unsigned long vsize, eip, esp, wchan;
	    long priority, nice;
	    int tty_pgrp;
	    sigset_t sigign, sigcatch;
	    char state;
    +       int res;

    -       read_lock(&tasklist_lock);
    -       tsk = find_task_by_pid(pid);
    -       read_unlock(&tasklist_lock);    /* FIXME!! This should be done after the last use */
    +       tsk = grab_task(pid, &mm);
	    if (!tsk)
		    return 0;
	    state = *get_task_state(tsk);
	    vsize = eip = esp = 0;
    -       if (tsk->mm && tsk->mm != &init_mm) {
    -               struct vm_area_struct *vma = tsk->mm->mmap;
    -               while (vma) {
    +       if (mm) {
    +               struct vm_area_struct *vma;
    +
    +               for (vma = mm->mmap; vma; vma = vma->vm_next) {
			    vsize += vma->vm_end - vma->vm_start;
    -                       vma = vma->vm_next;
		    }
    +
		    eip = KSTK_EIP(tsk);
		    esp = KSTK_ESP(tsk);
	    }
    @@ -878,7 +962,7 @@
	    nice = tsk->priority;
	    nice = 20 - (nice * 20 + DEF_PRIORITY / 2) / DEF_PRIORITY;

    -       return sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \
    +       res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \
     %lu %lu %lu %lu %lu %ld %ld %ld %ld %ld %ld %lu %lu %ld %lu %lu %lu %lu %lu \
     %lu %lu %lu %lu %lu %lu %lu %lu %d\n",
		    pid,
    @@ -904,11 +988,11 @@
		    tsk->it_real_value,
		    tsk->start_time,
		    vsize,
    -               tsk->mm ? tsk->mm->rss : 0, /* you might want to shift this left 3 */
    +               mm ? mm->rss : 0, /* you might want to shift this left 3 */
		    tsk->rlim ? tsk->rlim[RLIMIT_RSS].rlim_cur : 0,
    -               tsk->mm ? tsk->mm->start_code : 0,
    -               tsk->mm ? tsk->mm->end_code : 0,
    -               tsk->mm ? tsk->mm->start_stack : 0,
    +               mm ? mm->start_code : 0,
    +               mm ? mm->end_code : 0,
    +               mm ? mm->start_stack : 0,
		    esp,
		    eip,  
		    /* The signal information here is obsolete.
    @@ -923,6 +1007,9 @@
		    tsk->nswap,
		    tsk->cnswap,
		    tsk->exit_signal);
    +
    +       release_task(tsk, mm);
    +       return res;
     }

     static inline void statm_pte_range(pmd_t * pmd, unsigned long address, unsigned long size,
    @@ -1000,19 +1087,15 @@

     static int get_statm(int pid, char * buffer)
     {
    -       struct task_struct *tsk;
	    int size=0, resident=0, share=0, trs=0, lrs=0, drs=0, dt=0;
    +       struct mm_struct *mm;

    -       read_lock(&tasklist_lock);
    -       tsk = find_task_by_pid(pid);
    -       read_unlock(&tasklist_lock);    /* FIXME!! This should be done after the last use */
    -       if (!tsk)
    -               return 0;
    -       if (tsk->mm && tsk->mm != &init_mm) {
    -               struct vm_area_struct * vma = tsk->mm->mmap;
    +       mm = grab_mm(pid);
    +       if (mm) {
    +               struct vm_area_struct * vma = mm->mmap;

		    while (vma) {
    -                       pgd_t *pgd = pgd_offset(tsk->mm, vma->vm_start);
    +                       pgd_t *pgd = pgd_offset(mm, vma->vm_start);
			    int pages = 0, shared = 0, dirty = 0, total = 0;

			    statm_pgd_range(pgd, vma->vm_start, vma->vm_end, &pages, &shared, &dirty, &total);
    @@ -1030,6 +1113,7 @@
				    drs += pages;
			    vma = vma->vm_next;
		    }
    +               release_mm(mm);
	    }
	    return sprintf(buffer,"%d %d %d %d %d %d %d\n",
			   size, resident, share, trs, lrs, drs, dt);
    @@ -1067,16 +1151,15 @@

     #define MAPS_LINE_MAX  MAPS_LINE_MAX8

    -
     static ssize_t read_maps (int pid, struct file * file, char * buf,
			      size_t count, loff_t *ppos)
     {
    -       struct task_struct *p;
    +       struct task_struct * p;
    +       struct mm_struct * mm;
	    struct vm_area_struct * map, * next;
	    char * destptr = buf, * buffer;
	    loff_t lineno;
	    ssize_t column, i;
    -       int volatile_task;
	    long retval;

	    /*
    @@ -1088,24 +1171,19 @@
		    goto out;

	    retval = -EINVAL;
    -       read_lock(&tasklist_lock);
    -       p = find_task_by_pid(pid);
    -       read_unlock(&tasklist_lock);    /* FIXME!! This should be done after the last use */
    +       p = grab_task(pid, &mm);
	    if (!p)
		    goto freepage_out;

    -       if (!p->mm || p->mm == &init_mm || count == 0)
    +       if (!mm || count == 0)
		    goto getlen_out;

    -       /* Check whether the mmaps could change if we sleep */
    -       volatile_task = (p != current || atomic_read(&p->mm->count) > 1);
    -
	    /* decode f_pos */
	    lineno = *ppos >> MAPS_LINE_SHIFT;
	    column = *ppos & (MAPS_LINE_LENGTH-1);

	    /* quickly go to line lineno */
    -       for (map = p->mm->mmap, i = 0; map && (i < lineno); map = map->vm_next, i++)
    +       for (map = mm->mmap, i = 0; map && (i < lineno); map = map->vm_next, i++)
		    continue;

	    for ( ; map ; map = next ) {
    @@ -1176,18 +1254,13 @@
		    /* done? */
		    if (count == 0)
			   break;
    -
    -               /* By writing to user space, we might have slept.
    -                * Stop the loop, to avoid a race condition.
    -                */
    -               if (volatile_task)
    -                       break;
	    }

	    /* encode f_pos */
	    *ppos = (lineno << MAPS_LINE_SHIFT) + column;

     getlen_out:
    +       release_task(p, mm);
	    retval = destptr - buf;

     freepage_out:
    @@ -1199,15 +1272,12 @@
     #ifdef __SMP__
     static int get_pidcpu(int pid, char * buffer)
     {
    -       struct task_struct * tsk = current ;
    +       struct task_struct * tsk;
    +       struct mm_struct * mm;
	    int i, len;
    -
    -       read_lock(&tasklist_lock);
    -       if (pid != tsk->pid)
    -               tsk = find_task_by_pid(pid);
    -       read_unlock(&tasklist_lock);    /* FIXME!! This should be done after the last use */

    -       if (tsk == NULL)
    +       tsk = grab_task(pid, &mm);
    +       if (!tsk)
		    return 0;

	    len = sprintf(buffer,
    @@ -1221,6 +1291,7 @@
			    tsk->per_cpu_utime[cpu_logical_map(i)],
			    tsk->per_cpu_stime[cpu_logical_map(i)]);

    +       release_task(tsk, mm);
	    return len;
     }
     #endif