***** SCHEDULING!!!! ***** - This time: What processes are runnable? - Next time: How to choose between runnable processes? * What is a process? - = user environment - Memory context - Resource context (file descriptors, signals, etc. in Unix) - Processor context (registers) - Process relationships (parents) - Unix: Identities (user ID, group ID, effective user ID, ...; or capabilities) - JOS: struct Env { struct Trapframe env_tf; // Saved registers LIST_ENTRY(Env) env_link; // Free list link pointers envid_t env_id; // Unique environment identifier envid_t env_parent_id; // env_id of this env's parent unsigned env_status; // Status of the environment // Address space pde_t* env_pgdir; // Kernel virtual address of page dir physaddr_t env_cr3; // Physical address of page dir // ... more later ... }; -- Do we need both env_pgdir and env_cr3? (No) - Linux: 1440 bytes!!!!!!!!!!! struct task_struct { /* * offsets of these are hardcoded elsewhere - touch with care */ >>>>> volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ unsigned long flags; /* per process flags, defined below */ int sigpending; >>>>> mm_segment_t addr_limit; /* thread address space: 0-0xBFFFFFFF for user-thead 0-0xFFFFFFFF for kernel-thread */ struct exec_domain *exec_domain; volatile long need_resched; unsigned long ptrace; int lock_depth; /* Lock depth */ /* * offset 32 begins here on 32-bit platforms. We keep * all fields in a single cacheline that are needed for * the goodness() loop in schedule(). */ long counter; long nice; unsigned long policy; struct mm_struct *mm; int processor; /* * cpus_runnable is ~0 if the process is not running on any * CPU. It's (1 << cpu) if it's running on a CPU. This mask * is updated under the runqueue lock. * * To determine whether a process might run on a CPU, this * mask is AND-ed with cpus_allowed. */ unsigned long cpus_runnable, cpus_allowed; /* * (only the 'next' pointer fits into the cacheline, but * that's just fine.) 
*/ struct list_head run_list; unsigned long sleep_time; >>>>> struct task_struct *next_task, *prev_task; struct mm_struct *active_mm; >>>>> struct list_head local_pages; unsigned int allocation_order, nr_local_pages; /* task state */ struct linux_binfmt *binfmt; int exit_code, exit_signal; int pdeath_signal; /* The signal sent when the parent dies */ /* ??? */ unsigned long personality; int did_exec:1; unsigned task_dumpable:1; >>>>> pid_t pid; pid_t pgrp; pid_t tty_old_pgrp; pid_t session; pid_t tgid; /* boolean value for session group leader */ int leader; /* * pointers to (original) parent process, youngest child, younger sibling, * older sibling, respectively. (p->father can be replaced with * p->p_pptr->pid) */ >>>>> struct task_struct *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr; struct list_head thread_group; /* PID hash table linkage. */ struct task_struct *pidhash_next; struct task_struct **pidhash_pprev; wait_queue_head_t wait_chldexit; /* for wait4() */ struct completion *vfork_done; /* for vfork() */ unsigned long rt_priority; unsigned long it_real_value, it_prof_value, it_virt_value; unsigned long it_real_incr, it_prof_incr, it_virt_incr; struct timer_list real_timer; struct tms times; unsigned long start_time; long per_cpu_utime[NR_CPUS], per_cpu_stime[NR_CPUS]; /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap; int swappable:1; /* process credentials */ uid_t uid,euid,suid,fsuid; gid_t gid,egid,sgid,fsgid; int ngroups; gid_t groups[NGROUPS]; kernel_cap_t cap_effective, cap_inheritable, cap_permitted; int keep_capabilities:1; struct user_struct *user; /* limits */ struct rlimit rlim[RLIM_NLIMITS]; unsigned short used_math; char comm[16]; /* file system info */ int link_count, total_link_count; struct tty_struct *tty; /* NULL if no tty */ unsigned int locks; /* How many file locks are being held */ /* ipc stuff */ struct sem_undo *semundo; 
struct sem_queue *semsleeping; /* CPU-specific state of this task */ >>>>> struct thread_struct thread; /* filesystem information */ struct fs_struct *fs; /* open file information */ struct files_struct *files; /* namespace */ struct namespace *namespace; /* signal handlers */ spinlock_t sigmask_lock; /* Protects signal and blocked */ struct signal_struct *sig; sigset_t blocked; struct sigpending pending; unsigned long sas_ss_sp; size_t sas_ss_size; int (*notifier)(void *priv); void *notifier_data; sigset_t *notifier_mask; /* Thread group tracking */ u32 parent_exec_id; u32 self_exec_id; /* Protection of (de-)allocation: mm, files, fs, tty */ spinlock_t alloc_lock; /* journalling filesystem info */ void *journal_info; }; struct thread_struct { /* Equivalent to JOS Trapframe */ unsigned long esp0; unsigned long eip; unsigned long esp; unsigned long fs; unsigned long gs; /* Hardware debugging registers */ unsigned long debugreg[8]; /* %%db0-7 debug registers */ /* fault info */ unsigned long cr2, trap_no, error_code; /* floating point info */ union i387_union i387; /* virtual 86 mode info */ struct vm86_struct * vm86_info; unsigned long screen_bitmap; unsigned long v86flags, v86mask, saved_esp0; /* IO permissions */ int ioperm; unsigned long io_bitmap[IO_BITMAP_SIZE+1]; }; * The schedulable list - A list of processes that are runnable - The kernel's central scheduling loop, often called schedule() or similar, chooses a process and runs it - The kernel has a chance to change its mind every time it gets called -- System calls -- Timer interrupts, in case a process goes nuts -- In many OSes, kernel will *not* change its mind on *every* system call; but if it wanted to, it could! - How to decide between runnable processes is a big deal -- Don't want to switch unnecessarily, particularly so on the x86; why? ... Must clear TLB on each context switch! ... 
Hardware limitation; could have a processor that remembers "memory context IDs" in its TLB, such as the MIPS with its ASIDs -- System responsiveness -- Fairness -- Real-time guarantees? -- Next lecture * What processes are runnable? - Many reasons why a process might not be able to do its work -- Reading from a process that's generating output slowly -- Waiting for user input -- Reading from a slow network connection - How is this handled? -- Blocking -- Polling - Advantages/disadvantages? -- Switching between processes is not free! -- Polling's expensive therefore -- So it doesn't make sense to wake up a process that has nothing to do * Blocking implementation in conventional Unix - Every process has a *kernel stack*, for execution in the kernel -- Runs kernel code in response to application requests -- "system time" reported by time() means time spent on that kernel stack (I think) -- Contrast JOS: single kernel stack - How does blocking happen? -- The application reaches a point where it cannot continue any more -- So it sets up a *callback* attached to some event ... On Linux, a waitqueue ... Event == waitqueue ... Waitqueue entry == wake me up if something happens init_waitqueue_entry(wait_queue_t*, task*) ... Attach entry to queue: queue_head represents the event add_wait_queue(wait_queue_head_t*, wait_queue_t*) ... Wake up everything listening on a queue wake_up(wait_queue_head_t*, unsigned mode, int nr /* for exclusive wakeups: wake up one process listening */) -- After setting up callback, call the scheduler ... On Linux, schedule() is the scheduler; schedule() runs on current process's kernel stack, then calls switch_to() to switch stacks ... When we switch back to this process, schedule() will return! ... On every kernel stack, the topmost stack frame belongs to schedule()/ switch_to() .... (Probably) -- BSD implementation of select: everything that supports select() [files, ttys] has an object of type 'struct selinfo' ... 
struct selinfo.si_thread: struct thread* -- the thread that's waiting on this object - The problem with blocking -- Can only block on one thing at a time! -- What if a process could make progress on multiple "requests"? - Solution: select() -- int select(int nfds, fd_set* readfds, fd_set* writefds, fd_set* exceptfds, struct timeval* timeout) - What does this do? -- Wait until timeout expires (how is this implemented?) or any of the events fire (how is this implemented?) -- Adds a *bunch* of waitqueue entries -- Remove them all when done - The problem with select() -- Expensive! -- Expensive to move the bitmaps across into the kernel -- Expensive to traverse the bitmaps -- Wasteful to add and delete so many waitqueue entries -- Example: Banga et al., scalable event delivery ... Select was a horrible problem ... Speed up some operations using bit twiddling ... Speed up other operations using complex manipulations Remember previously interesting descriptors (old select()) Scan those descriptors on new select() only if they've changed ... System 2x more efficient (1/2 CPU time/request) ... Still select() is 25% of the overhead!! - Even worse: select collision on BSD -- More than one process is select()ing the same file object -- How can this happen? ... File descriptor inheritance ... Concrete example: name resolution - How do we solve this? - Essentially: Leave the waitqueue entries around! - Design goals for kqueue() -- Eliminate per-fd*syscall overhead from select, poll -- Reliable: never lose an event ... Counterexample: signals ... Changes what events mean: "something is available", not an endless list ... Bits, not linked lists of data ... Allocate all necessary memory *before the event arrives* -- Level-triggered, not edge-triggered ... API reports the *existence of a condition*, not the one-time occurrence of an event ... Events are not reported unless they change the condition -- Libraryable ... 
Counterexample: signals -- Correct: only report an event if it's applicable ... Calling this "correctness" is debatable - kqueue commands -- int kqueue(void) -- int kevent(int kq, const struct kevent* changelist, int nchanges, struct kevent* eventlist, int nevents, const struct timespec* to) struct kevent { uintptr_t ident; // identifier for event // e.g. file descriptor, signal number int16_t filter; // filter for event // EVFILT_READ, EVFILT_WRITE, EVFILT_AIO, // EVFILT_VNODE, EVFILT_PROC, EVFILT_SIGNAL uint16_t flags; // action flags for kq // EV_ADD, EV_ENABLE, EV_DISABLE, EV_DELETE, EV_CLEAR, // EV_ONESHOT uint32_t fflags; // filter flag value // filter-specific // NOTE_DELETE, NOTE_WRITE, NOTE_EXTEND, etc. for VNODE // NOTE_EXIT, NOTE_FORK, NOTE_EXEC, etc. for PROC // also output intptr_t data; // filter data value void* udata; // opaque to kernel } - Implementation -- struct knote { SLIST_ENTRY(knote) kn_link; /* for kq */ SLIST_ENTRY(knote) kn_selnext; /* for struct selinfo */ struct knlist * kn_knlist; /* f_attach populated */ TAILQ_ENTRY(knote) kn_tqe; struct kqueue * kn_kq; /* which queue we are on */ struct kevent kn_kevent; int kn_status; /* protected by kq lock */ int kn_sfflags; /* saved filter flags */ intptr_t kn_sdata; /* saved data field */ union { struct file * p_fp; /* file data pointer */ struct proc * p_proc; /* proc pointer */ } kn_ptr; struct filterops *kn_fop; void *kn_hook; }; struct filterops { int f_isfd; /* true if ident == filedescriptor */ int (*f_attach)(struct knote *kn); void (*f_detach)(struct knote *kn); int (*f_event)(struct knote *kn, long hint); }; -- struct kqueue ... A list of active knotes ... An array of knote lists, indexed by file descriptor ... A hash table for knotes not corresponding to files ... DRAW PICTURE: Figure 9 -- Filter function ... ... attach: take a knote => attach it appropriately ... detach: reverse ... filter: bool (*filter)(knote*, long hint) ... That's it! 
A very clean interface -- Hooking it up to other kernel structures ... Anything that might need to attach to kqueue() has a 'struct knlist' ... When something happens, call every filter function attached to that knlist! ... void knote(struct knlist*, long hint, int islocked) -- What does a filter do when an event has triggered? ... Just returns true ... knote() call enqueues knote on kqueue's active list ... If active list was empty, wake up any waiting processes! ... Wake up the corresponding kqueue! -- Performance results -- Figure 12, Figure 14, Figure 15, Figure 16 - This API is more exokernelish! -- Can imagine there being only one place where the process blocks -- Everything else implemented in a library ... How?