COMMAND
kernel
SYSTEMS AFFECTED
Linux 2.2.1
PROBLEM
Richard Kail seems to found anoter /proc bug in the 2.2.1 linux
kernel. An example:
modprobe coda.o # will load coda.o as dynamically module
# the kernel will create a /proc/fs/coda directory
# on the fly.
cd /proc/fs/coda # make shell chdir("/proc/fs/coda")
rmmod coda.o # works ! (shouldn't)
ls -l # generate a oops.
after this, the coda.o module seems not to work at all. Maybe
this is a systematic problem of more than one driver. A user can
trigger a "modprobe module" command using the kerneld (oh, its
called kmod now...) mechanism. A user can chdir /proc/fs/coda.
If the system has implemented the cron job as suggested in
linux/Documentation/kmod.txt the user has just to wait with
wd=/proc/fs/code to crash the system.
SOLUTION
Here the bugfix against 2.2.1:
Index: array.c
===================================================================
RCS file: /var/cvs/linux/fs/proc/array.c,v
retrieving revision 1.1.1.5
diff -u -r1.1.1.5 array.c
--- array.c 1999/01/29 14:50:53 1.1.1.5
+++ linux/fs/proc/array.c 1999/02/04 13:59:26
@@ -386,21 +386,57 @@
return sprintf(buffer, "%s\n", saved_command_line);
}
-static unsigned long get_phys_addr(struct task_struct * p, unsigned long ptr)
+/*
+ * Caller must release_mm the mm_struct later.
+ * You don't get any access to init_mm.
+ */
+static struct mm_struct * grab_mm(int pid)
+{
+ struct mm_struct * mm;
+ struct task_struct * tsk;
+
+ if (current->pid == pid)
+ return current->mm;
+
+ mm = NULL;
+ read_lock(&tasklist_lock);
+ tsk = find_task_by_pid(pid);
+ /*
+ * NOTE: this doesn't race because we are protected by the
+ * big kernel lock. -arca
+ */
+ if (tsk && tsk->mm && tsk->mm != &init_mm)
+ mmget(mm = tsk->mm);
+ read_unlock(&tasklist_lock);
+ if (mm)
+ down(&mm->mmap_sem);
+ return mm;
+}
+
+static void release_mm(struct mm_struct *mm)
{
+ if (current->mm != mm)
+ {
+ up(&mm->mmap_sem);
+ mmput(mm);
+ }
+}
+
+static unsigned long get_phys_addr(struct mm_struct *mm, unsigned long ptr)
+{
pgd_t *page_dir;
pmd_t *page_middle;
pte_t pte;
- if (!p || !p->mm || ptr >= TASK_SIZE)
+ if (ptr >= TASK_SIZE)
return 0;
/* Check for NULL pgd .. shouldn't happen! */
- if (!p->mm->pgd) {
- printk("get_phys_addr: pid %d has NULL pgd!\n", p->pid);
+ if (!mm->pgd) {
+ printk(KERN_DEBUG "missing pgd for mm %p\n", mm);
return 0;
}
- page_dir = pgd_offset(p->mm,ptr);
+ page_dir = pgd_offset(mm,ptr);
if (pgd_none(*page_dir))
return 0;
if (pgd_bad(*page_dir)) {
@@ -422,7 +458,7 @@
return pte_page(pte) + (ptr & ~PAGE_MASK);
}
-static int get_array(struct task_struct *p, unsigned long start, unsigned long end, char * buffer)
+static int get_array(struct mm_struct *mm, unsigned long start, unsigned long end, char * buffer)
{
unsigned long addr;
int size = 0, result = 0;
@@ -431,7 +467,7 @@
if (start >= end)
return result;
for (;;) {
- addr = get_phys_addr(p, start);
+ addr = get_phys_addr(mm, start);
if (!addr)
return result;
do {
@@ -453,27 +489,28 @@
static int get_env(int pid, char * buffer)
{
- struct task_struct *p;
-
- read_lock(&tasklist_lock);
- p = find_task_by_pid(pid);
- read_unlock(&tasklist_lock); /* FIXME!! This should be done after the last use */
+ struct mm_struct *mm;
+ int res = 0;
- if (!p || !p->mm)
- return 0;
- return get_array(p, p->mm->env_start, p->mm->env_end, buffer);
+ mm = grab_mm(pid);
+ if (mm) {
+ res = get_array(mm, mm->env_start, mm->env_end, buffer);
+ release_mm(mm);
+ }
+ return res;
}
static int get_arg(int pid, char * buffer)
{
- struct task_struct *p;
+ struct mm_struct *mm;
+ int res = 0;
- read_lock(&tasklist_lock);
- p = find_task_by_pid(pid);
- read_unlock(&tasklist_lock); /* FIXME!! This should be done after the last use */
- if (!p || !p->mm)
- return 0;
- return get_array(p, p->mm->arg_start, p->mm->arg_end, buffer);
+ mm = grab_mm(pid);
+ if (mm) {
+ res = get_array(mm, mm->arg_start, mm->arg_end, buffer);
+ release_mm(mm);
+ }
+ return res;
}
/*
@@ -722,12 +759,10 @@
return buffer;
}
-static inline char * task_mem(struct task_struct *p, char *buffer)
+static inline char * task_mem(struct mm_struct * mm, char *buffer)
{
- struct mm_struct * mm = p->mm;
-
- if (mm && mm != &init_mm) {
- struct vm_area_struct * vma = mm->mmap;
+ if (mm) {
+ struct vm_area_struct * vma;
unsigned long data = 0, stack = 0;
unsigned long exec = 0, lib = 0;
@@ -817,47 +852,96 @@
cap_t(p->cap_effective));
}
+static struct task_struct *grab_task(int pid, struct mm_struct ** mm)
+{
+ struct task_struct *tsk = current;
+
+ if (current->pid == pid)
+ {
+ *mm = current->mm;
+ return current;
+ }
+
+ *mm = NULL;
+ read_lock(&tasklist_lock);
+ tsk = find_task_by_pid(pid);
+ if (tsk)
+ {
+ struct mm_struct * __mm;
+ struct page * page = mem_map + MAP_NR(tsk);
+ atomic_inc(&page->count);
+ /*
+ * NOTE: this doesn't race because we are protected
+ * by the big kernel lock. -arca
+ */
+ __mm = tsk->mm;
+ if (__mm && __mm != &init_mm)
+ {
+ mmget(__mm);
+ *mm = __mm;
+ }
+ }
+ read_unlock(&tasklist_lock);
+ if (*mm)
+ down(&(*mm)->mmap_sem);
+
+ return tsk;
+}
+
+static void release_task(struct task_struct *tsk, struct mm_struct * mm)
+{
+ if (current != tsk)
+ {
+ if (mm)
+ {
+ up(&mm->mmap_sem);
+ mmput(mm);
+ }
+ free_pages((unsigned long) tsk, 1);
+ }
+}
static int get_status(int pid, char * buffer)
{
char * orig = buffer;
struct task_struct *tsk;
-
- read_lock(&tasklist_lock);
- tsk = find_task_by_pid(pid);
- read_unlock(&tasklist_lock); /* FIXME!! This should be done after the last use */
+ struct mm_struct * mm;
+
+ tsk = grab_task(pid, &mm);
if (!tsk)
return 0;
buffer = task_name(tsk, buffer);
buffer = task_state(tsk, buffer);
- buffer = task_mem(tsk, buffer);
+ buffer = task_mem(mm, buffer);
buffer = task_sig(tsk, buffer);
buffer = task_cap(tsk, buffer);
+ release_task(tsk, mm);
return buffer - orig;
}
static int get_stat(int pid, char * buffer)
{
struct task_struct *tsk;
+ struct mm_struct * mm;
unsigned long vsize, eip, esp, wchan;
long priority, nice;
int tty_pgrp;
sigset_t sigign, sigcatch;
char state;
+ int res;
- read_lock(&tasklist_lock);
- tsk = find_task_by_pid(pid);
- read_unlock(&tasklist_lock); /* FIXME!! This should be done after the last use */
+ tsk = grab_task(pid, &mm);
if (!tsk)
return 0;
state = *get_task_state(tsk);
vsize = eip = esp = 0;
- if (tsk->mm && tsk->mm != &init_mm) {
- struct vm_area_struct *vma = tsk->mm->mmap;
- while (vma) {
+ if (mm) {
+ struct vm_area_struct *vma;
+
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
vsize += vma->vm_end - vma->vm_start;
- vma = vma->vm_next;
}
+
eip = KSTK_EIP(tsk);
esp = KSTK_ESP(tsk);
}
@@ -878,7 +962,7 @@
nice = tsk->priority;
nice = 20 - (nice * 20 + DEF_PRIORITY / 2) / DEF_PRIORITY;
- return sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \
+ res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \
%lu %lu %lu %lu %lu %ld %ld %ld %ld %ld %ld %lu %lu %ld %lu %lu %lu %lu %lu \
%lu %lu %lu %lu %lu %lu %lu %lu %d\n",
pid,
@@ -904,11 +988,11 @@
tsk->it_real_value,
tsk->start_time,
vsize,
- tsk->mm ? tsk->mm->rss : 0, /* you might want to shift this left 3 */
+ mm ? mm->rss : 0, /* you might want to shift this left 3 */
tsk->rlim ? tsk->rlim[RLIMIT_RSS].rlim_cur : 0,
- tsk->mm ? tsk->mm->start_code : 0,
- tsk->mm ? tsk->mm->end_code : 0,
- tsk->mm ? tsk->mm->start_stack : 0,
+ mm ? mm->start_code : 0,
+ mm ? mm->end_code : 0,
+ mm ? mm->start_stack : 0,
esp,
eip,
/* The signal information here is obsolete.
@@ -923,6 +1007,9 @@
tsk->nswap,
tsk->cnswap,
tsk->exit_signal);
+
+ release_task(tsk, mm);
+ return res;
}
static inline void statm_pte_range(pmd_t * pmd, unsigned long address, unsigned long size,
@@ -1000,19 +1087,15 @@
static int get_statm(int pid, char * buffer)
{
- struct task_struct *tsk;
int size=0, resident=0, share=0, trs=0, lrs=0, drs=0, dt=0;
+ struct mm_struct *mm;
- read_lock(&tasklist_lock);
- tsk = find_task_by_pid(pid);
- read_unlock(&tasklist_lock); /* FIXME!! This should be done after the last use */
- if (!tsk)
- return 0;
- if (tsk->mm && tsk->mm != &init_mm) {
- struct vm_area_struct * vma = tsk->mm->mmap;
+ mm = grab_mm(pid);
+ if (mm) {
+ struct vm_area_struct * vma = mm->mmap;
while (vma) {
- pgd_t *pgd = pgd_offset(tsk->mm, vma->vm_start);
+ pgd_t *pgd = pgd_offset(mm, vma->vm_start);
int pages = 0, shared = 0, dirty = 0, total = 0;
statm_pgd_range(pgd, vma->vm_start, vma->vm_end, &pages, &shared, &dirty, &total);
@@ -1030,6 +1113,7 @@
drs += pages;
vma = vma->vm_next;
}
+ release_mm(mm);
}
return sprintf(buffer,"%d %d %d %d %d %d %d\n",
size, resident, share, trs, lrs, drs, dt);
@@ -1067,16 +1151,15 @@
#define MAPS_LINE_MAX MAPS_LINE_MAX8
-
static ssize_t read_maps (int pid, struct file * file, char * buf,
size_t count, loff_t *ppos)
{
- struct task_struct *p;
+ struct task_struct * p;
+ struct mm_struct * mm;
struct vm_area_struct * map, * next;
char * destptr = buf, * buffer;
loff_t lineno;
ssize_t column, i;
- int volatile_task;
long retval;
/*
@@ -1088,24 +1171,19 @@
goto out;
retval = -EINVAL;
- read_lock(&tasklist_lock);
- p = find_task_by_pid(pid);
- read_unlock(&tasklist_lock); /* FIXME!! This should be done after the last use */
+ p = grab_task(pid, &mm);
if (!p)
goto freepage_out;
- if (!p->mm || p->mm == &init_mm || count == 0)
+ if (!mm || count == 0)
goto getlen_out;
- /* Check whether the mmaps could change if we sleep */
- volatile_task = (p != current || atomic_read(&p->mm->count) > 1);
-
/* decode f_pos */
lineno = *ppos >> MAPS_LINE_SHIFT;
column = *ppos & (MAPS_LINE_LENGTH-1);
/* quickly go to line lineno */
- for (map = p->mm->mmap, i = 0; map && (i < lineno); map = map->vm_next, i++)
+ for (map = mm->mmap, i = 0; map && (i < lineno); map = map->vm_next, i++)
continue;
for ( ; map ; map = next ) {
@@ -1176,18 +1254,13 @@
/* done? */
if (count == 0)
break;
-
- /* By writing to user space, we might have slept.
- * Stop the loop, to avoid a race condition.
- */
- if (volatile_task)
- break;
}
/* encode f_pos */
*ppos = (lineno << MAPS_LINE_SHIFT) + column;
getlen_out:
+ release_task(p, mm);
retval = destptr - buf;
freepage_out:
@@ -1199,15 +1272,12 @@
#ifdef __SMP__
static int get_pidcpu(int pid, char * buffer)
{
- struct task_struct * tsk = current ;
+ struct task_struct * tsk;
+ struct mm_struct * mm;
int i, len;
-
- read_lock(&tasklist_lock);
- if (pid != tsk->pid)
- tsk = find_task_by_pid(pid);
- read_unlock(&tasklist_lock); /* FIXME!! This should be done after the last use */
- if (tsk == NULL)
+ tsk = grab_task(pid, &mm);
+ if (!tsk)
return 0;
len = sprintf(buffer,
@@ -1221,6 +1291,7 @@
tsk->per_cpu_utime[cpu_logical_map(i)],
tsk->per_cpu_stime[cpu_logical_map(i)]);
+ release_task(tsk, mm);
return len;
}
#endif