diff -Naur linux-2.6.8.1/Makefile linux-2.6.8.1-dd/Makefile --- linux-2.6.8.1/Makefile 2004-08-14 06:55:35.000000000 -0400 +++ linux-2.6.8.1-dd/Makefile 2005-01-19 14:13:10.000000000 -0500 @@ -453,8 +453,9 @@ CFLAGS += -fomit-frame-pointer endif +CFLAGS += -g -gstabs ifdef CONFIG_DEBUG_INFO -CFLAGS += -g +CFLAGS += -g -gstabs endif # warn about C99 declaration after statement diff -Naur linux-2.6.8.1/arch/i386/kernel/entry.S linux-2.6.8.1-dd/arch/i386/kernel/entry.S --- linux-2.6.8.1/arch/i386/kernel/entry.S 2004-08-14 06:55:09.000000000 -0400 +++ linux-2.6.8.1-dd/arch/i386/kernel/entry.S 2005-01-19 14:11:35.000000000 -0500 @@ -181,6 +181,17 @@ popl %eax jmp syscall_exit +# Entry point for a newly created speculative task: runs schedule_tail and flip_wait_condition, then leaves through syscall_exit. -- tongli +ENTRY(ret_from_spec_fork) + pushl %eax + call schedule_tail + call flip_wait_condition + # %eax now contains the syscall return value. NOTE(review): no instruction here stores %eax into the saved register frame before syscall_exit; confirm that flip_wait_condition takes care of it. -- tongli + GET_THREAD_INFO(%ebp) + # Pop the word pushed on entry to rebalance %esp. %ebx does not hold anything special. + popl %ebx + jmp syscall_exit + /* * Return to user mode is not as complex as all this looks, * but we want the default path for a system call return to @@ -886,5 +897,7 @@ .long sys_mq_notify .long sys_mq_getsetattr .long sys_ni_syscall /* reserved for kexec */ + .long sys_futex_dd /* new syscall 284 (__NR_futex_dd) -- tongli */ + .long sys_detect_deadlock /* new syscall 285 (__NR_detect_deadlock) -- tongli */ syscall_table_size=(.-sys_call_table) diff -Naur linux-2.6.8.1/arch/i386/kernel/process.c linux-2.6.8.1-dd/arch/i386/kernel/process.c --- linux-2.6.8.1/arch/i386/kernel/process.c 2004-08-14 06:54:46.000000000 -0400 +++ linux-2.6.8.1-dd/arch/i386/kernel/process.c 2005-01-19 14:11:34.000000000 -0500 @@ -54,6 +54,8 @@ #include asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); +/* Entry point for speculative tasks. 
-- tongli */ +asmlinkage void ret_from_spec_fork(void) __asm__("ret_from_spec_fork"); int hlt_counter; @@ -361,7 +363,12 @@ p->thread.esp = (unsigned long) childregs; p->thread.esp0 = (unsigned long) (childregs+1); - p->thread.eip = (unsigned long) ret_from_fork; + + /* A speculative task has a special entry point. -- tongli */ + if (p->dd_state == DD_SPEC) + p->thread.eip = (unsigned long) ret_from_spec_fork; + else + p->thread.eip = (unsigned long) ret_from_fork; savesegment(fs,p->thread.fs); savesegment(gs,p->thread.gs); @@ -573,6 +580,7 @@ */ tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET; } + return prev_p; } @@ -775,4 +783,3 @@ return -EFAULT; return 0; } - diff -Naur linux-2.6.8.1/fs/open.c linux-2.6.8.1-dd/fs/open.c --- linux-2.6.8.1/fs/open.c 2004-08-14 06:54:48.000000000 -0400 +++ linux-2.6.8.1-dd/fs/open.c 2005-01-19 14:12:48.000000000 -0500 @@ -1024,6 +1024,10 @@ struct file * filp; struct files_struct *files = current->files; + /* Speculative tasks return immediately. -- tongli */ + if (current->dd_state == DD_SPEC) + return 0; + spin_lock(&files->file_lock); if (fd >= files->max_fds) goto out_unlock; diff -Naur linux-2.6.8.1/fs/pipe.c linux-2.6.8.1-dd/fs/pipe.c --- linux-2.6.8.1/fs/pipe.c 2004-08-14 06:55:33.000000000 -0400 +++ linux-2.6.8.1-dd/fs/pipe.c 2005-01-19 14:12:51.000000000 -0500 @@ -259,6 +259,122 @@ return ret; } +/* New function for deadlock detection: a pipe write that, before a non-speculative writer sleeps in pipe_wait, records in tsk->dd_info what it is waiting for. Returns the number of bytes written or a negative errno. -- tongli */ +static ssize_t +pipe_writev_dd(struct file *filp, const struct iovec *_iov, + unsigned long nr_segs, loff_t *ppos) +{ + struct inode *inode = filp->f_dentry->d_inode; + ssize_t ret; + size_t min; + int do_wakeup; + struct iovec *iov = (struct iovec *)_iov; + size_t total_len; + /* Get a pointer to the current task struct. -- tongli */ + struct task_struct *tsk = current; + + total_len = iov_length(iov, nr_segs); + /* Null write succeeds. 
*/ + if (unlikely(total_len == 0)) + return 0; + + do_wakeup = 0; + ret = 0; + min = total_len; + if (min > PIPE_BUF) + min = 1; + down(PIPE_SEM(*inode)); + for (;;) { + int free; + if (!PIPE_READERS(*inode)) { + send_sig(SIGPIPE, current, 0); + if (!ret) ret = -EPIPE; + break; + } + free = PIPE_FREE(*inode); + if (free >= min) { + /* transfer data */ + ssize_t chars = PIPE_MAX_WCHUNK(*inode); + char *pipebuf = PIPE_BASE(*inode) + PIPE_END(*inode); + /* Always wakeup, even if the copy fails. Otherwise + * we lock up (O_NONBLOCK-)readers that sleep due to + * syscall merging. + */ + do_wakeup = 1; + if (chars > total_len) + chars = total_len; + if (chars > free) + chars = free; + + if (pipe_iov_copy_from_user(pipebuf, iov, chars)) { + if (!ret) ret = -EFAULT; + break; + } + ret += chars; + + PIPE_LEN(*inode) += chars; + total_len -= chars; + if (!total_len) + break; + } + if (PIPE_FREE(*inode) && ret) { + /* handle cyclic data buffers */ + min = 1; + continue; + } + if (filp->f_flags & O_NONBLOCK) { + if (!ret) ret = -EAGAIN; + break; + } + if (signal_pending(current)) { + if (!ret) ret = -ERESTARTSYS; + break; + } + if (do_wakeup) { + wake_up_interruptible_sync(PIPE_WAIT(*inode)); + kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); + do_wakeup = 0; + } + PIPE_WAITING_WRITERS(*inode)++; + /* New code for deadlock detection. -- tongli */ + /* We're going to sleep. Record what we're sleeping on. 
*/ + if (tsk->dd_state != DD_SPEC) { + struct dd_sleep_info *wait_info; + if (tsk->dd_info == NULL) { + tsk->dd_info = kmalloc(sizeof(struct dd_struct), + GFP_KERNEL); + if (tsk->dd_info != NULL) + tsk->dd_info->wait_info.next = NULL; + } + /* Out of memory: sleep without recording anything. */ + if (tsk->dd_info != NULL) { + wait_info = &tsk->dd_info->wait_info; + wait_info->addr = (unsigned long) inode; + wait_info->op = DD_AND; + wait_info->val = POLLOUT | POLLWRNORM; + strlcpy(wait_info->comments, filp->f_dentry->d_iname, + sizeof(wait_info->comments)); + tsk->dd_info->wake_info_last = 0; + tsk->dd_info->ret = total_len; + tsk->dd_state = DD_SLEEP; + } + } + /********************************************************/ + pipe_wait(inode); + PIPE_WAITING_WRITERS(*inode)--; + } + up(PIPE_SEM(*inode)); + if (do_wakeup) { + wake_up_interruptible(PIPE_WAIT(*inode)); + kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); + } + if (ret > 0) + inode_update_time(inode, 1); /* mtime and ctime */ + return ret; +} + +/* Replaced by the function below for deadlock detection. -- tongli */ +#if 0 static ssize_t pipe_write(struct file *filp, const char __user *buf, size_t count, loff_t *ppos) @@ -266,6 +377,16 @@ struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count }; return pipe_writev(filp, &iov, 1, ppos); } +#endif + +/* New function for deadlock detection. -- tongli */ +static ssize_t +pipe_write(struct file *filp, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count }; + return pipe_writev_dd(filp, &iov, 1, ppos); +} static ssize_t bad_pipe_r(struct file *filp, char __user *buf, size_t count, loff_t *ppos) diff -Naur linux-2.6.8.1/fs/read_write.c linux-2.6.8.1-dd/fs/read_write.c --- linux-2.6.8.1/fs/read_write.c 2004-08-14 06:55:35.000000000 -0400 +++ linux-2.6.8.1-dd/fs/read_write.c 2005-01-19 14:12:51.000000000 -0500 @@ -280,6 +280,8 @@ file->f_pos = pos; } +/* Replaced by the function below for deadlock detection. 
-- tongli */ +#if 0 asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count) { struct file *file; @@ -296,8 +298,88 @@ return ret; } +#endif + +/* New read syscall for deadlock detection. -- tongli */ +asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count) +{ + struct file *file; + ssize_t ret = -EBADF; + int fput_needed; + struct task_struct *tsk = current; + + if (tsk->dd_state == DD_SPEC) { + /* + * For speculative tasks, we only record the events created by + * the read without really performing the read. We assume the + * read will succeed and return 0 (indicating end-of-file). The + * real execution, however, may be different. For example, the + * read may return a positive count of bytes. We cannot return + * the exact count because we don't allow speculative tasks to + * change file positions. We could allow this if we copied the + * file structs in do_fork. But, this still doesn't allow us to + * correctly update the file positions because the only way to + * achieve that is to let the task really perform the read + * operation, which, however, violates safety. Note that we could + * prevent normal file reads from violating safety if we copied + * the file structs, but pipe reads definitely violate safety + * because they change the pipe length maintained in the inode + * struct. Therefore, to ensure safety, we should not perform + * the read. For the value to return, we have two choices, 0 or + * count. We choose 0 here because the caller may rely on + * checking if a read returns 0 to determine the end of file and + * possibly when to exit a loop. Returning count would in this + * case cause the caller to loop infinitely. Also, as we saw in + * the apache example, returning count causes segmentation fault + * and makes the caller (speculative) thread exit before it + * collects enough information for deadlock detection. 
No matter + * what value we return, the problem inherent in our + * implementation is that the events we record for the read, thus + * the dependence we construct based on these events, may not be + * always accurate. This can possibly lead to false alarms of + * deadlock. -- tongli + */ + struct files_struct *files = tsk->files; + + if (likely((atomic_read(&files->count) == 1))) { + file = fcheck_files(files, fd); + } else { + spin_lock(&files->file_lock); + file = fcheck_files(files, fd); + spin_unlock(&files->file_lock); + } + /* fcheck_files returns NULL when fd is not an open descriptor. */ + if (file == NULL) + return ret; + + /* Currently we only track pipe reads for speculative tasks. If + * non-pipe reads, return immediately. -- tongli */ + if (file->f_dentry->d_inode->i_pipe == NULL) + return 0; + + /* Record events produced by the speculative task. -- tongli */ + dd_record_wake_event(tsk->parent, + (unsigned long) file->f_dentry->d_inode, + DD_AND, POLLOUT | POLLWRNORM, + file->f_dentry->d_iname); + ret = 0; + } else { + file = fget_light(fd, &fput_needed); + if (file) { + loff_t pos = file_pos_read(file); + ret = vfs_read(file, buf, count, &pos); + file_pos_write(file, pos); + fput_light(file, fput_needed); + } + } + + return ret; +} + EXPORT_SYMBOL_GPL(sys_read); +/* Replaced by the function below for deadlock detection. -- tongli */ +#if 0 asmlinkage ssize_t sys_write(unsigned int fd, const char __user * buf, size_t count) { struct file *file; @@ -314,6 +393,72 @@ return ret; } +#endif + +/* New write syscall for deadlock detection. -- tongli */ +asmlinkage ssize_t sys_write(struct pt_regs regs) +{ + unsigned int fd = regs.ebx; + const char __user *buf = (char __user *) regs.ecx; + size_t count = regs.edx; + struct file *file; + ssize_t ret = -EBADF; + int fput_needed; + struct task_struct *tsk = current; + + if (tsk->dd_state == DD_SPEC) { + /* + * For speculative tasks, we only record the events created by + * the write without really performing the write. We assume the + * write will succeed with all count bytes being written without + * errors. 
The real execution, however, may not always succeed. + * Therefore, the events we record for this write, thus the + * dependence we construct based on these events, may not be + * always accurate. This can be a source of deadlock false alarms. + * -- tongli + */ + struct files_struct *files = tsk->files; + + if (count == 0) + return 0; + + if (likely((atomic_read(&files->count) == 1))) { + file = fcheck_files(files, fd); + } else { + spin_lock(&files->file_lock); + file = fcheck_files(files, fd); + spin_unlock(&files->file_lock); + } + /* fcheck_files returns NULL when fd is not an open descriptor. */ + if (file == NULL) + return ret; + + /* Currently we only track pipe writes for speculative tasks. If + * non-pipe writes, return immediately. -- tongli */ + if (file->f_dentry->d_inode->i_pipe == NULL) + return count; + + /* Record events produced by the speculative task. -- tongli */ + dd_record_wake_event(tsk->parent, + (unsigned long) file->f_dentry->d_inode, + DD_AND, POLLIN | POLLRDNORM, + file->f_dentry->d_iname); + ret = count; + } else { + /* Record regs so that in-kernel fork can set up registers + * correctly. -- tongli */ + tsk->dd_regs = &regs; + file = fget_light(fd, &fput_needed); + if (file) { + loff_t pos = file_pos_read(file); + ret = vfs_write(file, buf, count, &pos); + file_pos_write(file, pos); + fput_light(file, fput_needed); + } + } + + return ret; +} asmlinkage ssize_t sys_pread64(unsigned int fd, char __user *buf, size_t count, loff_t pos) @@ -543,6 +685,8 @@ return ret; } +/* Replaced by the function below for deadlock detection. -- tongli */ +#if 0 asmlinkage ssize_t sys_writev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen) { @@ -560,6 +704,88 @@ return ret; } +#endif + +/* New writev syscall for deadlock detection. 
-- tongli */ +asmlinkage ssize_t +sys_writev(struct pt_regs regs) +{ + unsigned long fd = regs.ebx; + const struct iovec __user *vec = (struct iovec __user *) regs.ecx; + unsigned long vlen = regs.edx; + struct file *file; + ssize_t ret = -EBADF; + int fput_needed; + struct task_struct *tsk = current; + + if (tsk->dd_state == DD_SPEC) { + /* + * For speculative tasks, we only record the events created by + * the write without really performing the write. We assume the + * write will succeed with all requested bytes being written + * without errors. The real execution, however, may not always + * succeed. Therefore, the events we record for this write, thus + * the dependence we construct based on these events, may not be + * always accurate. This can be a source of deadlock false + * alarms. -- tongli + */ + struct files_struct *files = tsk->files; + int i; + ssize_t tot_len = 0; + + for (i = 0; i < vlen; i++) { + ssize_t len; + + /* vec points into user space: never dereference it directly. */ + if (get_user(len, &vec[i].iov_len)) + return -EFAULT; + if (len < 0) /* size_t not fitting an ssize_t .. */ + return -EINVAL; + tot_len += len; + if ((ssize_t)tot_len < 0) /* maths overflow on the ssize_t */ + return -EINVAL; + } + if (tot_len == 0) + return 0; + + if (likely((atomic_read(&files->count) == 1))) { + file = fcheck_files(files, fd); + } else { + spin_lock(&files->file_lock); + file = fcheck_files(files, fd); + spin_unlock(&files->file_lock); + } + /* fcheck_files returns NULL when fd is not an open descriptor. */ + if (file == NULL) + return ret; + + /* Currently we only track pipe writes for speculative tasks. If + * non-pipe writes, return immediately. -- tongli */ + if (file->f_dentry->d_inode->i_pipe == NULL) + return tot_len; + + /* Record events produced by the speculative task. -- tongli */ + dd_record_wake_event(tsk->parent, + (unsigned long) file->f_dentry->d_inode, + DD_AND, POLLIN | POLLRDNORM, + file->f_dentry->d_iname); + ret = tot_len; + } else { + /* Record regs so that in-kernel fork can set up registers + * correctly. 
-- tongli */ + tsk->dd_regs = &regs; + + file = fget_light(fd, &fput_needed); + if (file) { + loff_t pos = file_pos_read(file); + ret = vfs_writev(file, vec, vlen, &pos); + file_pos_write(file, pos); + fput_light(file, fput_needed); + } + } + + return ret; +} static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count, loff_t max) @@ -653,6 +873,10 @@ off_t off; ssize_t ret; + /* Speculative tasks return immediately. -- tongli */ + if (current->dd_state == DD_SPEC) + return count; + if (offset) { if (unlikely(get_user(off, offset))) return -EFAULT; diff -Naur linux-2.6.8.1/fs/select.c linux-2.6.8.1-dd/fs/select.c --- linux-2.6.8.1/fs/select.c 2004-08-14 06:54:47.000000000 -0400 +++ linux-2.6.8.1-dd/fs/select.c 2005-01-19 14:12:48.000000000 -0500 @@ -427,6 +427,8 @@ } } +/* Replaced by the function below for deadlock detection. -- tongli */ +#if 0 static int do_poll(unsigned int nfds, struct poll_list *list, struct poll_wqueues *wait, long timeout) { @@ -455,7 +457,112 @@ __set_current_state(TASK_RUNNING); return count; } +#endif +/* Record in tsk->dd_info every file the caller is about to sleep on in + * poll and mark the task DD_SLEEP. If memory runs out, the task keeps its + * current dd_state, so a truncated wait list is never acted upon. + * NOTE(review): do_poll_dd calls this after set_current_state( + * TASK_INTERRUPTIBLE); kmalloc(GFP_KERNEL) and fput may sleep -- confirm + * that is acceptable here. */ +static void dd_record_poll_wait(struct task_struct *tsk, + struct poll_list *list, + struct pt_regs *regs) +{ + struct dd_sleep_info *wait_info; + struct dd_sleep_info *prev = NULL; + struct poll_list *walk; + int i; + + if (tsk->dd_info == NULL) { + tsk->dd_info = kmalloc(sizeof(struct dd_struct), + GFP_KERNEL); + if (tsk->dd_info == NULL) + return; + tsk->dd_info->wait_info.next = NULL; + } + wait_info = &tsk->dd_info->wait_info; + for (walk = list; walk != NULL; walk = walk->next) { + /* A for loop, so that "continue" still advances i. */ + for (i = 0; i < walk->len; i++) { + struct pollfd *fdp = walk->entries + i; + struct file *file; + + if (fdp->fd < 0) + continue; + file = fget(fdp->fd); + if (file == NULL) + continue; + if (wait_info == NULL) { + /* Grow the list and link the new node in. */ + wait_info = kmalloc(sizeof(struct dd_sleep_info), + GFP_KERNEL); + if (wait_info == NULL) { + fput(file); + return; + } + wait_info->next = NULL; + prev->next = wait_info; + } + wait_info->addr + = (unsigned long) file->f_dentry->d_inode; + wait_info->op = DD_AND; + wait_info->val = fdp->events; + strlcpy(wait_info->comments, file->f_dentry->d_iname, + sizeof(wait_info->comments)); + /* Drop the reference only after the last use of file. */ + fput(file); + prev = wait_info; + wait_info = wait_info->next; + } + } + tsk->dd_info->wake_info_last = 0; + /* Always let sys_poll return 1 when it speculatively + * returns. */ + tsk->dd_info->ret = 1; + tsk->dd_state = DD_SLEEP; + tsk->dd_regs = regs; +} + +/* New function for deadlock detection: do_poll that records the wait set of a non-speculative task (see dd_record_poll_wait) before it sleeps. -- tongli */ +static int do_poll_dd(unsigned int nfds, struct poll_list *list, + struct poll_wqueues *wait, long timeout, + struct pt_regs *regs) +{ + int count = 0; + poll_table* pt = &wait->pt; + struct task_struct *tsk = current; + + if (!timeout) + pt = NULL; + + for (;;) { + struct poll_list *walk; + set_current_state(TASK_INTERRUPTIBLE); + walk = list; + while(walk != NULL) { + do_pollfd( walk->len, walk->entries, &pt, &count ); + walk = walk->next; + } + pt = NULL; + if (count || !timeout || signal_pending(current)) + break; + count = wait->error; + if (count) + break; + + /* We're going to sleep. Record what we're sleeping on. 
*/ + if (tsk->dd_state != DD_SPEC) + dd_record_poll_wait(tsk, list, regs); + + timeout = schedule_timeout(timeout); + } + __set_current_state(TASK_RUNNING); + return count; +} + +/* Replaced by the function below for deadlock detection. -- tongli */ +#if 0 asmlinkage long sys_poll(struct pollfd __user * ufds, unsigned int nfds, long timeout) { struct poll_wqueues table; @@ -533,3 +619,87 @@ poll_freewait(&table); return err; } +#endif + +/* New system call for deadlock detection. -- tongli */ +asmlinkage long sys_poll(struct pt_regs regs) +{ + struct pollfd __user *ufds = (struct pollfd __user *) regs.ebx; + unsigned int nfds = regs.ecx; + long timeout = regs.edx; + + struct poll_wqueues table; + int fdcount, err; + unsigned int i; + struct poll_list *head; + struct poll_list *walk; + + /* Do a sanity check on nfds ... 
*/ + if (nfds > current->files->max_fdset && nfds > OPEN_MAX) + return -EINVAL; + + if (timeout) { + /* Careful about overflow in the intermediate values */ + if ((unsigned long) timeout < MAX_SCHEDULE_TIMEOUT / HZ) + timeout = (unsigned long)(timeout*HZ+999)/1000+1; + else /* Negative or overflow */ + timeout = MAX_SCHEDULE_TIMEOUT; + } + + poll_initwait(&table); + + head = NULL; + walk = NULL; + i = nfds; + err = -ENOMEM; + while(i!=0) { + struct poll_list *pp; + pp = kmalloc(sizeof(struct poll_list)+ + sizeof(struct pollfd)* + (i>POLLFD_PER_PAGE?POLLFD_PER_PAGE:i), + GFP_KERNEL); + if(pp==NULL) + goto out_fds; + pp->next=NULL; + pp->len = (i>POLLFD_PER_PAGE?POLLFD_PER_PAGE:i); + if (head == NULL) + head = pp; + else + walk->next = pp; + + walk = pp; + if (copy_from_user(pp->entries, ufds + nfds-i, + sizeof(struct pollfd)*pp->len)) { + err = -EFAULT; + goto out_fds; + } + i -= pp->len; + } + fdcount = do_poll_dd(nfds, head, &table, timeout, &regs); + + /* OK, now copy the revents fields back to user space. 
*/ + walk = head; + err = -EFAULT; + while(walk != NULL) { + struct pollfd *fds = walk->entries; + int j; + + for (j=0; j < walk->len; j++, ufds++) { + if(__put_user(fds[j].revents, &ufds->revents)) + goto out_fds; + } + walk = walk->next; + } + err = fdcount; + if (!fdcount && signal_pending(current)) + err = -EINTR; +out_fds: + walk = head; + while(walk!=NULL) { + struct poll_list *pp = walk->next; + kfree(walk); + walk = pp; + } + poll_freewait(&table); + return err; +} diff -Naur linux-2.6.8.1/include/asm-i386/unistd.h linux-2.6.8.1-dd/include/asm-i386/unistd.h --- linux-2.6.8.1/include/asm-i386/unistd.h 2004-08-14 06:55:35.000000000 -0400 +++ linux-2.6.8.1-dd/include/asm-i386/unistd.h 2005-01-19 14:11:25.000000000 -0500 @@ -289,8 +289,13 @@ #define __NR_mq_notify (__NR_mq_open+4) #define __NR_mq_getsetattr (__NR_mq_open+5) #define __NR_sys_kexec_load 283 - -#define NR_syscalls 284 +/* New syscalls for deadlock detection; the numbers must match the two entries appended to sys_call_table in entry.S. -- tongli */ +#define __NR_futex_dd 284 +#define __NR_detect_deadlock 285 + +/* #define NR_syscalls 284 */ +/* Increased by 2 -- tongli */ +#define NR_syscalls 286 /* user-visible error numbers are in the range -1 - -124: see <asm-i386/errno.h> */ diff -Naur linux-2.6.8.1/include/linux/init_task.h linux-2.6.8.1-dd/include/linux/init_task.h --- linux-2.6.8.1/include/linux/init_task.h 2004-08-14 06:54:49.000000000 -0400 +++ linux-2.6.8.1-dd/include/linux/init_task.h 2005-01-19 14:11:16.000000000 -0500 @@ -112,6 +112,13 @@ .proc_lock = SPIN_LOCK_UNLOCKED, \ .switch_lock = SPIN_LOCK_UNLOCKED, \ .journal_info = NULL, \ + /* Initialization for deadlock detection. 
-- tongli */ \ + .dd_state = DD_NORMAL, \ + .dd_sleep_last_interval = 0, \ + .dd_being_checked = 0, \ + .dd_regs = NULL, \ + .dd_info = NULL, \ + .dd_info_lock = SPIN_LOCK_UNLOCKED, \ } diff -Naur linux-2.6.8.1/include/linux/sched.h linux-2.6.8.1-dd/include/linux/sched.h --- linux-2.6.8.1/include/linux/sched.h 2004-08-14 06:54:49.000000000 -0400 +++ linux-2.6.8.1-dd/include/linux/sched.h 2005-01-19 14:11:16.000000000 -0500 @@ -387,6 +387,107 @@ struct audit_context; /* See audit.c */ struct mempolicy; +/************************************************************************* + * New structs for deadlock detection -- tongli * +**************************************************************************/ + +/* Deadlock detection state of a task. */ +#define DD_NORMAL 0 /* no deadlock detection for this task */ +#define DD_SLEEP 1 /* task is sleeping because of a syscall that we've + instrumented */ +#define DD_FORK 2 /* task is forking a speculative task */ +#define DD_SPEC 3 /* task is speculative */ + +/* Deadlock detection wait condition operators. */ +/* Compare operations between two integers */ +#define DD_LESS 0 +#define DD_LESS_EQUAL 1 +#define DD_EQUAL 2 +#define DD_GREATER 3 +#define DD_GREATER_EQUAL 4 +#define DD_NOT_EQUAL 5 +/* Bit operations between two integers */ +#define DD_AND 6 + +/* Maximum number of speculative wakeups a speculative task can perform. */ +#define DD_WAKE_NUM 10 + +/* Addresses and conditions that block or unblock a sleeping task. -- tongli */ +struct dd_sleep_info { + unsigned long addr; + char op; /* one of the DD_* condition operators above */ + int val; + struct dd_sleep_info *next; /* next condition in the list, or NULL */ + /* Free-form description of the waited object. It can be "futex" if + * the waited object is a futex, or a file name if the sleeping task + * is waiting for some condition about a file to change. */ + char comments[36]; +}; + +/* Deadlock detection information kept for each sleeping task. -- tongli */ +struct dd_struct { + /* Address and condition the thread is waiting on. 
*/ + struct dd_sleep_info wait_info; + /* Information on up to DD_WAKE_NUM tasks that the sleeping task can + * wake up if speculatively running ahead. Each entry keeps the + * address and condition produced by one speculative wakeup + * performed by the sleeping task. */ + struct dd_sleep_info wake_info[DD_WAKE_NUM]; + /* Index of the last empty entry in the wake_info array; reset to 0 each time a new wait is recorded. */ + int wake_info_last; + /* Each sleeping task must be put to sleep by a system call. After we + * speculatively wake up the task, we need to fake a return value as + * if the task were really woken up. This field keeps the fake + * return value. */ + int ret; +}; + +struct neighbor_node { + struct graph_node *gnode; + struct neighbor_node *next; +}; + +enum node_state {NOT_VISITED, BEING_VISITED, DONE_VISITED}; +enum node_type {TASK, RESOURCE}; + +struct graph_node { + /* If type is TASK, info_ptr points to a dd_task struct. If type is + * RESOURCE, it points to a wait_info struct. */ + unsigned long info_ptr; + struct neighbor_node *neighbor_list; + enum node_state visited; + enum node_type type; + int indegree; + int outdegree; + int id; +}; + +struct dd_task { + pid_t pid; + char comm[16]; + struct dd_sleep_info wait_info; + struct dd_sleep_info wake_info[DD_WAKE_NUM]; + int wake_info_last; + struct dd_task *next; + struct graph_node *gnode; +}; + +struct dd_spec_task { + struct task_struct *task; + struct dd_spec_task *next; +}; + + +struct dd_resource { + unsigned long addr; + struct dd_resource *next; + struct graph_node *gnode; +}; + +extern void dd_record_wake_event(struct task_struct *task, unsigned long addr, + char op, int val, char *comments); +/*************************************************************************/ + struct task_struct { volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ struct thread_info *thread_info; @@ -527,6 +628,18 @@ struct mempolicy *mempolicy; short il_next; /* could be shared with used_math */ #endif + 
/************************************************/ + /* New fields for deadlock detection. -- tongli */ + /************************************************/ + + char dd_state; /* one of DD_NORMAL, DD_SLEEP, DD_FORK, DD_SPEC */ + int dd_being_checked; /* task is being checked for deadlock */ + unsigned int dd_sleep_last_interval:1; /* unsigned: a signed 1-bit field can only hold 0 and -1 */ + /* Pointer to the registers saved when the task entered the kernel. */ + struct pt_regs *dd_regs; + struct dd_struct *dd_info; + /* Lock protecting dd_info. */ + spinlock_t dd_info_lock; }; static inline pid_t process_group(struct task_struct *tsk) @@ -887,6 +1000,9 @@ extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *); extern struct task_struct * copy_process(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *); +/* New function for deadlock detection. -- tongli */ +extern void dd_kill_task(struct task_struct *); + #ifdef CONFIG_SMP extern void wait_task_inactive(task_t * p); #else diff -Naur linux-2.6.8.1/include/linux/syscalls.h linux-2.6.8.1-dd/include/linux/syscalls.h --- linux-2.6.8.1/include/linux/syscalls.h 2004-08-14 06:56:24.000000000 -0400 +++ linux-2.6.8.1-dd/include/linux/syscalls.h 2005-01-19 14:11:17.000000000 -0500 @@ -50,6 +50,7 @@ struct tms; struct utimbuf; struct mq_attr; +struct pt_regs; #include #include @@ -485,4 +486,9 @@ asmlinkage long sys_uselib(const char __user *library); asmlinkage long sys_ni_syscall(void); +/* New system calls for deadlock detection. 
-- tongli */ +#include +asmlinkage long sys_futex_dd(struct pt_regs regs); +asmlinkage void sys_detect_deadlock(void); /* NOTE(review): the other syscall prototypes visible here return long -- confirm void is intended */ + #endif diff -Naur linux-2.6.8.1/kernel/Makefile linux-2.6.8.1-dd/kernel/Makefile --- linux-2.6.8.1/kernel/Makefile 2004-08-14 06:54:51.000000000 -0400 +++ linux-2.6.8.1-dd/kernel/Makefile 2005-01-19 14:13:07.000000000 -0500 @@ -7,7 +7,8 @@ sysctl.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o intermodule.o extable.o params.o posix-timers.o \ - kthread.o + kthread.o \ +# detect-deadlock.o # object for deadlock detection, currently commented out -- tongli obj-$(CONFIG_FUTEX) += futex.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o diff -Naur linux-2.6.8.1/kernel/exit.c linux-2.6.8.1-dd/kernel/exit.c --- linux-2.6.8.1/kernel/exit.c 2004-08-14 06:56:01.000000000 -0400 +++ linux-2.6.8.1-dd/kernel/exit.c 2005-01-19 14:13:07.000000000 -0500 @@ -797,6 +797,12 @@ { struct task_struct *tsk = current; + /* Change the parent of the speculative task to init. In this way, + * the speculative task is completely separate from the normal task + * (so they can exit on their own). -- tongli */ + if (tsk->dd_state == DD_SPEC) + tsk->real_parent = child_reaper; + if (unlikely(in_interrupt())) panic("Aiee, killing interrupt handler!"); if (unlikely(!tsk->pid)) @@ -1209,3 +1215,12 @@ } #endif + +void dd_kill_task(struct task_struct *task) +{ + /* Change the parent of the speculative task to init, then force SIGKILL on it. In this way, + * the speculative task is completely separate from the normal task + * (so they can exit on their own). NOTE(review): real_parent is written here without taking any lock in this function -- confirm callers serialize this. */ + task->real_parent = child_reaper; + force_sig(SIGKILL, task); +} diff -Naur linux-2.6.8.1/kernel/fork.c linux-2.6.8.1-dd/kernel/fork.c --- linux-2.6.8.1/kernel/fork.c 2004-08-14 06:54:49.000000000 -0400 +++ linux-2.6.8.1-dd/kernel/fork.c 2005-01-19 14:13:07.000000000 -0500 @@ -87,6 +87,21 @@ WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); + /* Free dd_info for deadlock detection. 
-- tongli */ + spin_lock(&tsk->dd_info_lock); + if (tsk->dd_info) { + struct dd_sleep_info *sleep_info = tsk->dd_info->wait_info.next; + struct dd_sleep_info *next_sleep_info; + while (sleep_info) { + next_sleep_info = sleep_info->next; + kfree(sleep_info); + sleep_info = next_sleep_info; + } + kfree(tsk->dd_info); + tsk->dd_info = NULL; + } + spin_unlock(&tsk->dd_info_lock); + if (unlikely(tsk->audit_context)) audit_free(tsk); security_task_free(tsk); @@ -260,6 +275,21 @@ tsk->thread_info = ti; ti->task = tsk; + /****************************************************/ + /* Initialization for deadlock detection: a child created while its parent is in the DD_FORK state becomes a speculative task (DD_SPEC); every other child starts as DD_NORMAL. -- tongli */ + if (orig->dd_state == DD_FORK) { + tsk->dd_state = DD_SPEC; + if (orig->dd_info) + orig->dd_info->wake_info_last = 0; + } else + tsk->dd_state = DD_NORMAL; + tsk->dd_sleep_last_interval = 0; + tsk->dd_being_checked = 0; + tsk->dd_regs = NULL; + tsk->dd_info = NULL; + tsk->dd_info_lock = SPIN_LOCK_UNLOCKED; + /****************************************************/ + /* One for us, one for whoever does the "release_task()" (usually parent) */ atomic_set(&tsk->usage,2); return tsk; @@ -729,9 +759,50 @@ for (i = open_files; i != 0; i--) { struct file *f = *old_fds++; - if (f) - get_file(f); - *new_fds++ = f; + /* The following (disabled) code copies each file struct for the + * speculative task. This is suggested by Fay Chang's + * USENIX '03 paper. However, I found this actually doesn't + * work, since sockfd_lookup() checks if a socket's file + * struct pointer equals the file struct pointer kept in the + * inode struct. If we allocate a new file struct to the + * speculative task without changing the inode struct, that + * check will fail. -- tongli */ +#if 0 + /* Allocate a new file struct for a speculative task, + * instead of sharing it with the parent. 
-- tongli */ + if (current->dd_state == DD_FORK) { + struct file *new_f; + if (f) { + /* get_empty_filp() might sleep so we need to + * unlock first -- tongli */ + spin_unlock(&oldf->file_lock); + error = -ENOMEM; + new_f = get_empty_filp(); + if (!new_f) + goto out; + /* relock */ + spin_lock(&oldf->file_lock); + /* Copy current task's file struct. -- tongli */ + *new_f = *f; + /* + * Initialize f_ep_links to empty. We cannot copy + * the parent's f_ep_links because + * eventpoll_release_file (called by __fput when + * a task exits) deletes entries on this list, + * thus changing state belonging to the parent. + */ + INIT_LIST_HEAD(&new_f->f_ep_links); + } else + new_f = NULL; + *new_fds++ = new_f; + } else { +#endif + if (f) + get_file(f); + *new_fds++ = f; +#if 0 + } +#endif } spin_unlock(&oldf->file_lock); @@ -1015,7 +1086,12 @@ p->parent_exec_id = p->self_exec_id; /* ok, now we should be set up.. */ - p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL); + /* Speculative tasks don't send a signal to their parents when + * exiting. -- tongli */ + if (p->dd_state == DD_SPEC) + p->exit_signal = -1; + else + p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL); p->pdeath_signal = 0; /* Perform scheduler related setup */ @@ -1078,9 +1154,16 @@ spin_unlock(&current->sighand->siglock); } - SET_LINKS(p); - if (p->ptrace & PT_PTRACED) - __ptrace_link(p, current->parent); + /* If forking a speculative task, we don't link it to the children + * list of the parent. 
Note that the child has a link (parent) back + * to the parent though.-- tongli */ + if (p->dd_state == DD_SPEC) + list_add_tail(&(p)->tasks,&init_task.tasks); + else { + SET_LINKS(p); + if (p->ptrace & PT_PTRACED) + __ptrace_link(p, current->parent); + } attach_pid(p, PIDTYPE_PID, p->pid); if (thread_group_leader(p)) { diff -Naur linux-2.6.8.1/kernel/futex.c linux-2.6.8.1-dd/kernel/futex.c --- linux-2.6.8.1/kernel/futex.c 2004-08-14 06:55:09.000000000 -0400 +++ linux-2.6.8.1-dd/kernel/futex.c 2005-01-19 14:13:07.000000000 -0500 @@ -710,6 +710,102 @@ (unsigned long)uaddr2, val2, val3); } +/* + * New system call for deadlock detection, adapted from sys_futex. + * The caller needs to pass the following arguments: + * Arg Reg Comments + * uaddr ebx Address waited on by the sleeping thread + * op ecx Futex operation (e.g., FUTEX_WAIT) + * val edx Current value at uaddr or number of threads to + * wake up. + * wait_op esi + * wait_val edi These two args together form the condition + * that can unblock the thread. For example, if + * wait_op = DD_GREATER and wait_val = 0, then the + * thread is waiting for the value at uaddr to + * become greater than 0. + * utime ebp Timeout + * + * Although this system call requires that the above arguments reside in + * their corresponding registers when it's called, this syscall's function + * only has one parameter: regs. The idea is that, when a system call is + * called, the kernel saves all the general-purpose registers on the + * caller's kernel mode stack (see SAVE_ALL in arch/i386/kernel/entry.S). + * The argument regs contains the values of all these saved registers, + * which include the ones passed by the caller as arguments to this system + * call. Thus we can extract these arguments inside the system call from + * regs. We need to pass the entire regs because they're needed later in + * do_fork when we fork speculative tasks.
+ * + * Returns the futex_wait()/futex_wake() result, 0 when a speculative task + * only records a wake event, -EFAULT if utime cannot be copied in, -ENOMEM + * if the wait record cannot be allocated, or -ENOSYS for any other op. + * + * -- tongli + */ +asmlinkage long sys_futex_dd(struct pt_regs regs) +{ + u32 __user *uaddr = (u32 __user *) regs.ebx; + int op = regs.ecx; + int val = regs.edx; + char wait_op = regs.esi; + int wait_val = regs.edi; + struct timespec __user *utime = (struct timespec __user *) regs.ebp; + struct timespec t; + unsigned long timeout = MAX_SCHEDULE_TIMEOUT; + int ret = 0; + struct task_struct *tsk = current; + + if (utime) { + if (copy_from_user(&t, utime, sizeof(t)) != 0) + return -EFAULT; + timeout = timespec_to_jiffies(&t) + 1; + } + + switch (op) { + case FUTEX_WAIT: + /* Speculative tasks don't enter the DD_SLEEP state. This + * prevents deadlock detection from being applied to these + * tasks, thus avoiding deadlock. */ + if (tsk->dd_state != DD_SPEC) { + /* Record in the task struct the wait conditions. */ + if (tsk->dd_info == NULL) { + tsk->dd_info = kmalloc(sizeof(struct dd_struct), + GFP_KERNEL); + /* Bail out before entering DD_SLEEP without a record. */ + if (tsk->dd_info == NULL) + return -ENOMEM; + tsk->dd_info->wait_info.next = NULL; + } + tsk->dd_info->wait_info.addr = (unsigned long) uaddr; + tsk->dd_info->wait_info.op = wait_op; + tsk->dd_info->wait_info.val = wait_val; + strcpy(tsk->dd_info->wait_info.comments, "futex"); + tsk->dd_info->wake_info_last = 0; + tsk->dd_regs = &regs; + /* Always let sys_futex return 0 when it speculatively + * returns. */ + tsk->dd_info->ret = 0; + tsk->dd_state = DD_SLEEP; + } + ret = futex_wait((unsigned long)uaddr, val, timeout); + break; + case FUTEX_WAKE: + if (tsk->dd_state == DD_SPEC) { + /* Record the wakeup information in the parent's task + * struct.
*/ + dd_record_wake_event(tsk->parent, + (unsigned long) uaddr, wait_op, + wait_val, "futex"); + } else + ret = futex_wake((unsigned long)uaddr, val); + break; + default: + ret = -ENOSYS; + } + return ret; +} + static struct super_block * futexfs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) diff -Naur linux-2.6.8.1/kernel/sched.c linux-2.6.8.1-dd/kernel/sched.c --- linux-2.6.8.1/kernel/sched.c 2004-08-14 06:55:59.000000000 -0400 +++ linux-2.6.8.1-dd/kernel/sched.c 2005-01-19 14:13:07.000000000 -0500 @@ -1068,6 +1068,28 @@ rq->prev_mm = oldmm; } + /* + * If next's dd_state is DD_SLEEP, it must be a task that had been + * put to sleep by a syscall that we instrumented. Now the task must + * have been woken up. Thus we reset its dd_state to DD_NORMAL. + * -- tongli + */ + if (next->dd_state == DD_SLEEP) { + struct dd_sleep_info *sleep_info = next->dd_info->wait_info.next; + struct dd_sleep_info *next_sleep_info; + next->dd_state = DD_NORMAL; + spin_lock(&next->dd_info_lock); + while (sleep_info) { + next_sleep_info = sleep_info->next; + kfree(sleep_info); + sleep_info = next_sleep_info; + } + kfree(next->dd_info); + next->dd_info = NULL; + next->dd_sleep_last_interval = 0; + spin_unlock(&next->dd_info_lock); + } + /* Here we just switch the register state and the stack. */ switch_to(prev, next, prev); @@ -4043,3 +4065,728 @@ EXPORT_SYMBOL(__preempt_write_lock); #endif /* defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) */ + +/***********************************************************************/ +/* New functions for deadlock detection. -- tongli */ +/***********************************************************************/ + +/* Switch tasks from prev back to next.
-- tongli */ +#define dd_switch_back(prev, next, last) do { \ + unsigned long esi,edi; \ + asm volatile("movl %3,%%esp\n\t" /* restore ESP */ \ + "pushl %4\n\t" /* restore EIP */ \ + "jmp __switch_to\n" \ + :"=a" (last),"=S" (esi),"=D" (edi) \ + :"m" (next->thread.esp),"m" (next->thread.eip), \ + "0" (prev), "d" (next)); \ +} while (0) + +/* When entering the function, the return address is on the top of the + * stack, followed by the first argument, the second argument, and so on. + * Since this function never returns, the return address is only a place + * holder. See dd_switch_to, in which we push a constant 0 as the return + * address. -- tongli + */ +static void in_kernel_fork(struct task_struct *next) +{ + /* We're now in the context of the sleeping task. */ + struct task_struct *prev = current; + struct mm_struct *mm = next->mm; + struct mm_struct *oldmm = prev->active_mm; + runqueue_t *rq; + + prev->state = TASK_RUNNING; + + /* Set current task's dd_state to DD_FORK temporarily. */ + prev->dd_state = DD_FORK; + + /* Temporarily enqueue this task to the runqueue. */ + rq = this_rq(); + rq->curr = prev; + enqueue_task(prev, rq->active); + + /* We've finished the context switch, so re-enable interrupts and + * release runqueue lock (this is necessary because do_fork may + * sleep and invoke the scheduler). */ + spin_unlock_irq(&rq->lock); + preempt_enable_no_resched(); + + do_fork(SIGCHLD, prev->dd_regs->esp, prev->dd_regs, 0, NULL, NULL); + + /* Now we need to context switch again, so disable interrupts and + * obtain runqueue lock. The current CPU may be different from the + * one before do_fork. */ + preempt_disable(); + rq = this_rq(); + spin_lock_irq(&rq->lock); + + /* If we are on a different CPU from the one before the previous + * context switch, we need to update the CPU field of the task + * running fork_process(). */ + next->thread_info->cpu = prev->thread_info->cpu; + + /* Dequeue this task from the runqueue. 
*/ + dequeue_task(prev, prev->array); + /* It's important to reset prev->array to NULL since it's originally + * NULL. If we don't reset it to NULL, things may go wrong with the + * sleeping task (e.g., if we do 'fg' and then Ctrl-C, we won't be + * able to kill the task correctly. */ + prev->array = NULL; + + if (unlikely(!mm)) { + next->active_mm = oldmm; + atomic_inc(&oldmm->mm_count); + enter_lazy_tlb(oldmm, next); + } else + switch_mm(oldmm, mm, next); + + if (unlikely(!prev->mm)) + prev->active_mm = NULL; + + dd_switch_back(prev, next, prev); +} + +/* This macro is adapted from the switch_to macro in include/asm-i386/system.h. + * It switches in the next task and calls in_kernel_fork in the next task's + * context. -- tongli */ +#define dd_switch_to(prev,next,last) do { \ + unsigned long esi,edi; \ + asm volatile("pushfl\n\t" \ + "pushl %%gs\n\t" \ + "pushl %%fs\n\t" \ + "pushl %%es\n\t" \ + "pushl %%ds\n\t" \ + "pushl %%eax\n\t" \ + "pushl %%ebp\n\t" \ + "pushl %%edi\n\t" \ + "pushl %%esi\n\t" \ + "pushl %%edx\n\t" \ + "pushl %%ecx\n\t" \ + "pushl %%ebx\n\t" \ + "movl %%esp,%0\n\t" /* save ESP */ \ + "movl %5,%%esp\n\t" /* restore ESP */ \ + "movl $1f,%1\n\t" /* save EIP */ \ + "pushl %7\n\t" /* pass prev */ \ + "pushl $0\n\t" /* return addr (placeholder) */\ + "pushl %6\n\t" /* set new EIP */ \ + "jmp __switch_to\n" \ + "1:\t" \ + "popl %%ebx\n\t" \ + "popl %%ecx\n\t" \ + "popl %%edx\n\t" \ + "popl %%esi\n\t" \ + "popl %%edi\n\t" \ + "popl %%ebp\n\t" \ + "popl %%eax\n\t" \ + "popl %%ds\n\t" \ + "popl %%es\n\t" \ + "popl %%fs\n\t" \ + "popl %%gs\n\t" \ + "popfl" \ + :"=m" (prev->thread.esp),"=m" (prev->thread.eip), \ + "=a" (last),"=S" (esi),"=D" (edi) \ + :"m" (next->thread.esp), "i" (in_kernel_fork), \ + "2" (prev), "d" (next)); \ +} while (0) + +/* Fork a given process -- tongli */ +asmlinkage void fork_process(struct task_struct *tsk) +{ + struct task_struct *prev = current; + struct mm_struct *mm = tsk->mm; + struct mm_struct *oldmm = prev->active_mm; + 
struct runqueue *rq; + /* + * Save the following fields of the sleeping task because they will + * be changed by what we do for speculative execution, and we need + * to restore them to their original values when we put this task + * back to sleep. + */ + int orig_cpu = tsk->thread_info->cpu; + long orig_state = tsk->state; + unsigned long orig_esp = tsk->thread.esp; + unsigned long orig_eip = tsk->thread.eip; + unsigned long orig_gs = tsk->thread.gs; + unsigned long orig_fs = tsk->thread.fs; + + /* We are about to context switch, so disable interrupts and obtain + * runqueue lock. The context switch code in this function is based + * on the code in schedule(). */ + preempt_disable(); + release_kernel_lock(prev); + rq = this_rq(); + spin_lock_irq(&rq->lock); + + /* Deactivate this task, because we don't want the scheduler to + * choose this task to run until we've done the fork. This is + * similar to how sys_sched_yield is implemented. */ + dequeue_task(prev, prev->array); + + /* We're going to speculatively switch in the sleeping task, which + * may have run on a different CPU than our current CPU when it was + * put to sleep. So we need to set the task's cpu field temporarily + * to the current cpu. + */ + tsk->thread_info->cpu = prev->thread_info->cpu; + + if (unlikely(!mm)) { + tsk->active_mm = oldmm; + atomic_inc(&oldmm->mm_count); + enter_lazy_tlb(oldmm, tsk); + } else + switch_mm(oldmm, mm, tsk); + + if (unlikely(!prev->mm)) + prev->active_mm = NULL; + + /* Switch to the sleeping task. */ + dd_switch_to(prev, tsk, prev); + barrier(); + + /* Restore fields of the sleeping task. */ + tsk->thread_info->cpu = orig_cpu; + tsk->state = orig_state; + tsk->thread.esp = orig_esp; + tsk->thread.eip = orig_eip; + tsk->thread.gs = orig_gs; + tsk->thread.fs = orig_fs; + tsk->dd_state = DD_SLEEP; + + /* We're switched back in. Re-activate this task. We may be on a + * different CPU than the one before the context switch. 
*/ + rq = this_rq(); + rq->curr = prev; + enqueue_task(prev, rq->active); + + /* Release runqueue lock and re-enable interrupts. */ + spin_unlock_irq(&rq->lock); + reacquire_kernel_lock(prev); + preempt_enable_no_resched(); +} + +/* Flip the wait condition of current task. -- tongli */ +asmlinkage int flip_wait_condition(void) +{ + struct task_struct *task = current; + struct dd_struct *dd_info; + char op; + int val, ret; + int *addr; + spin_lock(&task->parent->dd_info_lock); + dd_info = task->parent->dd_info; + if (dd_info == NULL) { + /* Parent was awakened before the speculative child starts to + * run. Kill the child. */ + spin_unlock(&task->parent->dd_info_lock); + do_exit(0); + } + op = dd_info->wait_info.op; + val = dd_info->wait_info.val; + addr = (int *) dd_info->wait_info.addr; + ret = dd_info->ret; + spin_unlock(&task->parent->dd_info_lock); + + switch (op) { + case DD_EQUAL: + put_user(val, addr); + break; + case DD_GREATER: + put_user(val + 1, addr); + break; + case DD_AND: + break; + default: + BUG(); + } + /* Return the system call return value in %eax. */ + return ret; +} + +/* Detect if there is a cycle starting and ending at a given node. -- tongli */ +int detect_cycle(struct graph_node *node, int start) +{ + struct neighbor_node *neighbor; + struct dd_task *task; + struct dd_sleep_info *sleep_info; + + if (node->visited == BEING_VISITED && node->id == start) { + /* Cycle detected. 
*/ + task = (struct dd_task *) node->info_ptr; + printk("Cycle: %d (%s)", task->pid, task->comm); + return 1; + } + + if (node->visited == DONE_VISITED) + return 0; + + if (node->id < start) + return 0; + + node->visited = BEING_VISITED; + for (neighbor = node->neighbor_list; + neighbor; + neighbor = neighbor->next) + if (detect_cycle(neighbor->gnode, start)) { + if (node->type == TASK) { + task = (struct dd_task *) node->info_ptr; + printk(" <-- %d (%s)", task->pid, task->comm); + } else { + sleep_info = (struct dd_sleep_info *) node->info_ptr; + printk(" <-- 0x%lx (%s)", + sleep_info->addr, sleep_info->comments); + } + node->visited = DONE_VISITED; + return 1; + } + + node->visited = DONE_VISITED; + return 0; +} + +/* New system call for deadlock detection. -- tongli */ +asmlinkage void sys_detect_deadlock(void) +{ + struct task_struct *g, *p; + int i, DD_TIMEOUT = 1; + + struct dd_task *dd_task_list = NULL, *node, *next_node, *sleep_node, + *wake_node; + struct dd_spec_task *spec_task_list = NULL, *spec_task, *next_spec_task; + struct dd_sleep_info *sleep_info, *wake_info; + struct graph_node *gnode; + struct neighbor_node *neighbor = NULL; + struct dd_resource *dd_resource_list = NULL; + struct dd_resource *resource, *next_resource; + int invoke_detection = 0, total_gnodes = 0, node_id_min, node_id_max; + int cycles; + + read_lock(&tasklist_lock); + do_each_thread(g, p) { + if ((p->state == TASK_UNINTERRUPTIBLE + || p->state == TASK_INTERRUPTIBLE) && p->dd_state == DD_SLEEP) { + /* Only invoke deadlock detection if the thread has been + * sleeping since at least the last detection interval. */ + if (p->dd_sleep_last_interval) { + /* Between the read_unlock below and fork_process finishing + * successefully, the task struct pointed to by p could be + * released (e.g., somebody kills p). Futhermore, we want p + * to be valid until deadlock detection is done. Thus, we + * increment p's usage count here, and decrement it when p + * is not used any more. 
*/ + get_task_struct(p); + read_unlock(&tasklist_lock); + invoke_detection = 1; + p->dd_being_checked = 1; + printk("Speculatively fork process %d (%s):\n", p->pid, + p->comm); + fork_process(p); + read_lock(&tasklist_lock); + } else { + p->dd_sleep_last_interval = 1; + } + } + } while_each_thread(g, p); + read_unlock(&tasklist_lock); + + if (!invoke_detection) { + /* Fast path. No need to invoke deadlock detection. */ + return; + } + + /* Sleep for DD_TIMEOUT seconds. */ + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(DD_TIMEOUT * HZ); + + /* + * Speculative execution stops if either of the following is true: + * 1. The speculative task exits normally. + * 2. The speculative task has done DD_WAKE_NUM wakeups. + * 3. DD_TIMEOUT seconds have elapsed. + * + * Thus, if we get here and a speculative task is still running, we + * should kill it. + */ + read_lock(&tasklist_lock); + do_each_thread(g, p) { + if ((p->state == TASK_UNINTERRUPTIBLE + || p->state == TASK_INTERRUPTIBLE) + && p->dd_being_checked && p->dd_sleep_last_interval) { + /* Sleeping task */ + read_unlock(&tasklist_lock); +#if 0 + printk("Thread %d (%s):\n", p->pid, p->comm); + sleep_info = &p->dd_info->wait_info; + while (sleep_info) { + printk("waits on 0x%lx (%s) op %d val %d (0x%x)\n", + sleep_info->addr, sleep_info->comments, + sleep_info->op, sleep_info->val, sleep_info->val); + sleep_info = sleep_info->next; + } + for (i = 0; i < p->dd_info->wake_info_last; i++) { + printk("wakes up 0x%lx (%s) op %d val %d (0x%x)\n", + p->dd_info->wake_info[i].addr, + p->dd_info->wake_info[i].comments, + p->dd_info->wake_info[i].op, + p->dd_info->wake_info[i].val, + p->dd_info->wake_info[i].val); + } +#endif + /* Record this task in dd_task_list */ + node = kmalloc(sizeof(struct dd_task), GFP_KERNEL); + node->pid = p->pid; + strcpy(node->comm, p->comm); + node->gnode = NULL; + node->next = dd_task_list; + + spin_lock(&p->dd_info_lock); + /* The task may have exited or woken up and thus freed + * 
dd_info. Check it here. */ + if (p->dd_info) { + struct dd_sleep_info *dest; + node->wait_info = p->dd_info->wait_info; + sleep_info = p->dd_info->wait_info.next; + dest = &node->wait_info; + /* dd_info_lock is held, so allocate atomically; if that + * fails the copied list is simply cut short here. */ + while (sleep_info && (dest->next = kmalloc(sizeof(*dest), GFP_ATOMIC))) { + dest = dest->next; + *dest = *sleep_info; + sleep_info = sleep_info->next; + } + for (i = 0; i < DD_WAKE_NUM; i++) + node->wake_info[i] = p->dd_info->wake_info[i]; + node->wake_info_last = p->dd_info->wake_info_last; + /* Reset task's wake_info_last. */ + p->dd_info->wake_info_last = 0; + dd_task_list = node; + } else + kfree(node); + spin_unlock(&p->dd_info_lock); + + read_lock(&tasklist_lock); + } else if (p->dd_state == DD_SPEC) { + /* Speculative task */ + read_unlock(&tasklist_lock); + /* Record this task in spec_task_list */ + spec_task = kmalloc(sizeof(struct dd_spec_task), GFP_KERNEL); + spec_task->task = p; + spec_task->next = spec_task_list; + spec_task_list = spec_task; + read_lock(&tasklist_lock); + } + if (p->dd_being_checked) { + /* We're done using p's task struct. */ + p->dd_being_checked = 0; + put_task_struct(p); + } + } while_each_thread(g, p); + read_unlock(&tasklist_lock); + + /* Kill all the speculative tasks. */ + spec_task = spec_task_list; + while (spec_task) { + force_sig(SIGKILL, spec_task->task); + + next_spec_task = spec_task->next; + kfree(spec_task); + spec_task = next_spec_task; + } + + printk("Deadlock detection results:\n"); + printk("----------------------------------------\n"); + + /* Scan dd_task_list to establish task dependencies.
*/ + for (sleep_node = dd_task_list; + sleep_node; + sleep_node = sleep_node->next) { + for (sleep_info = &sleep_node->wait_info; + sleep_info; + sleep_info = sleep_info->next) { + /* sleep_info represents what the task is waiting for, now we + * check if it matches any wakeup event produced by another + * task */ + for (wake_node = dd_task_list; + wake_node; + wake_node = wake_node->next) { + if (wake_node == sleep_node) + continue; + for (i = 0; i < wake_node->wake_info_last; i++) + { + int match = 0; + wake_info = &wake_node->wake_info[i]; + if (sleep_info->op == wake_info->op && + sleep_info->addr == wake_info->addr) { + switch (sleep_info->op) { + case DD_EQUAL: + if (wake_info->val == sleep_info->val) + match = 1; + break; + case DD_GREATER: + if (wake_info->val >= sleep_info->val) + match = 1; + break; + case DD_AND: + if (wake_info->val & sleep_info->val) + match = 1; + break; + default: + BUG(); + } + } + if (match) { + int new_requestor = 0, new_producer = 0, + new_resource = 0; + /* We've identified an edge (dependence). Construct a + * graph node for the requestor if we haven't done + * so. */ + if (sleep_node->gnode == NULL) { + gnode = kmalloc(sizeof(struct graph_node),GFP_KERNEL); + gnode->info_ptr = (unsigned long) sleep_node; + gnode->neighbor_list = NULL; + gnode->visited = NOT_VISITED; + gnode->type = TASK; + gnode->indegree = 0; + gnode->outdegree = 0; + sleep_node->gnode = gnode; + total_gnodes++; + new_requestor = 1; + printk("Task: %d (%s)\n", + sleep_node->pid, sleep_node->comm); + } + + /* Construct a graph node for the producer task if we + * haven't done so. 
*/ + if (wake_node->gnode == NULL) { + gnode = kmalloc(sizeof(struct graph_node),GFP_KERNEL); + gnode->info_ptr = (unsigned long) wake_node; + gnode->neighbor_list = NULL; + gnode->visited = NOT_VISITED; + gnode->type = TASK; + gnode->indegree = 0; + gnode->outdegree = 0; + wake_node->gnode = gnode; + total_gnodes++; + new_producer = 1; + printk("Task: %d (%s)\n", wake_node->pid, + wake_node->comm); + } + /* Construct a graph node for the resource if we + * haven't done so. */ + for (resource = dd_resource_list; + resource; + resource = resource->next) { + if (resource->addr == sleep_info->addr) + break; + } + if (!resource) { + /* We haven't constructed this resource node. */ + gnode = kmalloc(sizeof(struct graph_node),GFP_KERNEL); + gnode->info_ptr = (unsigned long) sleep_info; + gnode->neighbor_list = NULL; + gnode->visited = NOT_VISITED; + gnode->type = RESOURCE; + gnode->indegree = 0; + gnode->outdegree = 0; + /* Record in dd_resource_list */ + resource = + kmalloc(sizeof(struct dd_resource), GFP_KERNEL); + resource->addr = sleep_info->addr; + resource->next = dd_resource_list; + resource->gnode = gnode; + dd_resource_list = resource; + total_gnodes++; + new_resource = 1; + printk("Resource: 0x%lx (%s)\n", resource->addr, + sleep_info->comments); + } + + /* Construct an edge between the requestor and the + * resource if we haven't done so. */ + if (new_requestor || new_resource) { + neighbor = + kmalloc(sizeof(struct neighbor_node), GFP_KERNEL); + neighbor->gnode = resource->gnode; + neighbor->next = sleep_node->gnode->neighbor_list; + sleep_node->gnode->neighbor_list = neighbor; + resource->gnode->indegree++; + sleep_node->gnode->outdegree++; + printk("Edge: %d -> 0x%lx\n", sleep_node->pid, + resource->addr); + } + + if (new_producer || new_resource) { + /* Construct an edge between the resource and the + * producer. 
*/ + neighbor = + kmalloc(sizeof(struct neighbor_node), GFP_KERNEL); + neighbor->gnode = wake_node->gnode; + neighbor->next = resource->gnode->neighbor_list; + resource->gnode->neighbor_list = neighbor; + wake_node->gnode->indegree++; + resource->gnode->outdegree++; + printk("Edge: 0x%lx -> %d\n", resource->addr, + wake_node->pid); + } + } + } + } + } + } + + node_id_min = 0; + node_id_max = total_gnodes - 1; + /* Assign a unique number to each graph node. */ + for (node = dd_task_list; node; node = node->next) { + if (node->gnode == NULL) + /* There's no graph node for this task. It can be also viewed + * as that this task has a graph node but the node doesn't + * connect to any other node. */ + continue; + if (node->gnode->indegree > 1 || node->gnode->outdegree > 1) + node->gnode->id = node_id_max--; + else + node->gnode->id = node_id_min++; + } + for (resource = dd_resource_list; resource; resource = resource->next) + resource->gnode->id = node_id_min++; + + cycles = 0; + /* + * Detect all cycles. We assign a unique number to each graph node. + * For each node, we use DFS to search for a cycle that starts and + * ends at the node. To avoid printing out duplicate cycles, we + * abort the DFS whenever it visits a node with a smaller number + * than the number of the start node. Thus we output a cycle only if + * the DFS starts at the smallest numbered node in the cycle. If a + * node has more than one incoming or outgoing edge, and this node + * is the smallest numbered node in two different cycles, our + * algorithm would output only one of the two cycles and miss + * the other one. To reduce the chance of this problem occurring, we + * assign relatively big numbers to nodes with in and out degrees + * greater than one (see code above). This ensures that if a cycle + * has at least one node with in and out degrees of one (thus its + * assigned number is relatively small), then our algorithm will + * detect this cycle starting from this node. 
However, if a cycle + * has all its nodes with in and out degrees greater than one, we + * still may not be able to detect this cycle. In practice, this + * can be rare. Thus our algorithm is 100% accurate for detecting + * the existence of cycles and accurate with high probability for + * detecting ALL the cycles. + */ + for (node = dd_task_list; node; node = node->next) { + struct dd_task *tmp; + if (node->gnode == NULL) + continue; + if (detect_cycle(node->gnode, node->gnode->id)) { + cycles++; + printk("\n"); + } + if (node->next) { + /* Reset node states to NOT_VISITED. */ + for (tmp = dd_task_list; tmp; tmp = tmp->next) { + if (tmp->gnode == NULL) + continue; + tmp->gnode->visited = NOT_VISITED; + } + for (resource = dd_resource_list; + resource; resource = resource->next) { + if (resource->gnode == NULL) + continue; + resource->gnode->visited = NOT_VISITED; + } + } + } + + if (cycles == 1) + printk("1 cycle found!\n"); + else + printk("%d cycles found!\n", cycles); + printk("----------------------------------------\n"); + + /* Cleanup */ + node = dd_task_list; + while (node) { + struct dd_sleep_info *next_sleep_info; + next_node = node->next; + sleep_info = node->wait_info.next; + while (sleep_info) { + next_sleep_info = sleep_info->next; + kfree(sleep_info); + sleep_info = next_sleep_info; + } + if (node->gnode) { + struct neighbor_node *next_neighbor; + neighbor = node->gnode->neighbor_list; + while (neighbor) { + next_neighbor = neighbor->next; + kfree(neighbor); + neighbor = next_neighbor; + } + kfree(node->gnode); + } + kfree(node); + node = next_node; + } + resource = dd_resource_list; + while (resource) { + next_resource = resource->next; + if (resource->gnode) { + struct neighbor_node *next_neighbor; + neighbor = resource->gnode->neighbor_list; + while (neighbor) { + next_neighbor = neighbor->next; + kfree(neighbor); + neighbor = next_neighbor; + } + kfree(resource->gnode); + } + kfree(resource); + resource = next_resource; + } +} + +/* Record a
wakeup event. -- tongli */ +void dd_record_wake_event(struct task_struct *task, unsigned long addr, + char op, int val, char *comments) +{ + int idx, i; + struct dd_struct *dd_info; + + spin_lock(&task->dd_info_lock); + dd_info = task->dd_info; + if (dd_info == NULL) { + /* This must be the case that the parent has woken up and started + * running again (see code in context_switch()). Thus the + * speculative child should exit now. */ + spin_unlock(&task->dd_info_lock); + do_exit(0); + } + + idx = dd_info->wake_info_last; + /* Check if we've saved this event before. If so, don't save it + * again. */ + for (i = 0; i < idx; i++) { + if (dd_info->wake_info[i].addr == addr && + dd_info->wake_info[i].op == op && + dd_info->wake_info[i].val == val) + break; + } + if (i < idx) { + /* Found the event. */ + spin_unlock(&task->dd_info_lock); + return; + } + + dd_info->wake_info[idx].addr = addr; + dd_info->wake_info[idx].op = op; + dd_info->wake_info[idx].val = val; + strcpy(dd_info->wake_info[idx].comments, comments); + idx++; + dd_info->wake_info_last = idx; + if (idx == DD_WAKE_NUM) { + /* Maximum wakeups reached. */ + spin_unlock(&task->dd_info_lock); + do_exit(0); + } + spin_unlock(&task->dd_info_lock); +} diff -Naur linux-2.6.8.1/kernel/signal.c linux-2.6.8.1-dd/kernel/signal.c --- linux-2.6.8.1/kernel/signal.c 2004-08-14 06:55:19.000000000 -0400 +++ linux-2.6.8.1-dd/kernel/signal.c 2005-01-19 14:13:07.000000000 -0500 @@ -2171,6 +2171,10 @@ { struct siginfo info; + /* Speculative tasks return immediately. -- tongli */ + if (current->dd_state == DD_SPEC) + return 0; + info.si_signo = sig; info.si_errno = 0; info.si_code = SI_USER; diff -Naur linux-2.6.8.1/kernel/sys.c linux-2.6.8.1-dd/kernel/sys.c --- linux-2.6.8.1/kernel/sys.c 2004-08-14 06:54:49.000000000 -0400 +++ linux-2.6.8.1-dd/kernel/sys.c 2005-01-19 14:13:07.000000000 -0500 @@ -281,6 +281,10 @@ cond_syscall(sys_pciconfig_write) cond_syscall(sys_pciconfig_iobase) +/* New syscalls for deadlock detection. 
-- tongli */ +cond_syscall(sys_futex_dd) +cond_syscall(sys_detect_deadlock) + static int set_one_prio(struct task_struct *p, int niceval, int error) { int no_nice; diff -Naur linux-2.6.8.1/net/socket.c linux-2.6.8.1-dd/net/socket.c --- linux-2.6.8.1/net/socket.c 2004-08-14 06:55:10.000000000 -0400 +++ linux-2.6.8.1-dd/net/socket.c 2005-01-19 14:12:42.000000000 -0500 @@ -1669,6 +1669,10 @@ int err; struct socket *sock; + /* Speculative tasks return immediately. -- tongli */ + if (current->dd_state == DD_SPEC) + return 0; + if ((sock = sockfd_lookup(fd, &err))!=NULL) { err = security_socket_shutdown(sock, how);