lint
proc
et user
/dev/mem, /dev/kmem
représentent la mémoire physique et
la mémoire avec l'adressage du noyau. Exemple (dans ps.c
):
kmem = open("/dev/kmem"; 0); if (kmem < 0) { perror(kmemf); exit(1); } ... kget(dest, loc, size) char *dest; register unsigned loc; register unsigned size; { lseek(kmem, (off_t)loc, 0); if (read(kmem, dest, size) != size) printf("error reading kmem at %x\n", loc); }
/dev/null
$==$ la poubelle
% mf "\mode:=$MODE; mag:=$MAG; scrollmode; input $NAMË </dev/null
load
et store
pc
program
counter)
TIT .mips, RISC Architecture SLA .Gerry Kane - MIPS Computer Systems, Inc. .Prentice Hall, 1987. -
load
et store
load
retardés
rd = rs op rt rd = rs op CONSTANTE(16 bits)où
rs, rt, rd
sont 32 registres. Toute instruction tient sur
32 bits; 6 bits sont réservés pour désigner l'opération (code-op).
load
et store
font
communiquer la mémoire et les registres. (On a toujours
r0
= 0).
beq rs, rt, OFFSET(16 bits) jump CONSTANTE(26 bits)où pour les branchements, l'adresse est obtenue par
pc
+
OFFSET * 4, et pour les sauts par pc
& 0xf0000000) + (CONSTANTE} * 4).
cc -S ps.c
):
# 238 argc--, argv++; lw $15, 72($sp) addu $24, $15, -1 sw $24, 72($sp) lw $25, 76($sp) addu $8, $25, 4 sw $8, 76($sp) # 275 if ((tptr = index(mytty,'y')) != 0) la $4, mytty li $5, 121 jal index sw $2, tptr lw $10, tptr beq $10, 0, $44
load
et branchements
ou sauts ...
# 238 argc--, argv++; 0x40020cqu'on obtient sous: lw $t7,72($sp) 0x400210 : lw $t9,76($sp) 0x400214 : addiu $t8,$t7,-1 0x400218 : addiu $t0,$t9,4 0x40021c : sw $t0,76($sp) 0x400220 : sw $t8,72($sp) 0x400224 : lw $t1,72($sp) # 275 if ((tptr = index(mytty,'y')) != 0) 0x40037c : lui $a0,4096 0x400380 : addiu $a0,$a0,32272 0x400384 : jal 0x405bf0 0x400388 : li $a1,121 0x40038c : sw $v0,-30568($gp) 0x400390 : lw $t4,-30568($gp) 0x400394 : nop 0x400398 : beq $t4,$zero,0x4003b0
gdb
en tapant
(gdb) info line 238 (gdb) x/i
sys/mips/mips/vmparam.h
décrit les paramètres de la mémoire
virtuelle dépendant de la ds5000
kpr% nm -gn a.out 00400170 T __start 004001b4 T moncontrol ... 100006c0 D nl 10000b04 D _iob 10000b60 D _ctype__ 100010a4 D sys_errlist ... 10010768 B rpc_createerr 10010780 B end
kpr% nm -gn /vmunix 8000007c A kstackflag 80030000 T eprol 80030000 T start ... 80150f00 B end c0000000 A Sysbase c2a00000 A usrpt c363c000 A cabase c3640000 A kmembase c3640000 A calimit c4640000 A kmemlimit c4640000 A VA_forkutl ... ffffc000 A u
usrpt
(User Page table) dans l'espace du système. Sur Vax,
comme les pages sont petites (512 octets), la table des pages
est à 2 niveaux. Il y a aussi des TLBs pour accélerer la
traduction.
kseg0
).
proc
et u
dans la zone kseg2
).
text
en
0x400000
, sa zone donnée en 10000000
, sa
pile décroissant à partir de 0x7fffffff
. Au bout de la
pile, il y a une page de zone rouge pour tester les débordements.
text
sont protégées en écriture.
dirty
pour
cette page, sur l'exception du TLB.
text
qui peut être partagé (table des pages incluse).
struct pte { #ifdef MIPSEB unsigned int pg_pfnum:20, /* HW: core page frame number or 0 */ pg_n:1, /* HW: non-cacheable bit */ pg_m:1, /* HW: modified (dirty) bit */ pg_v:1, /* HW: valid bit */ pg_g:1, /* HW: ignore pid bit */ :4, pg_swapm:1, /* SW: page must be forced to swap */ pg_fod:1, /* SW: is fill on demand (=0) */ pg_prot:2; /* SW: access control */ #endif MIPSEB #ifdef MIPSEL unsigned int pg_prot:2, /* SW: access control */ pg_fod:1, /* SW: is fill on demand (=0) */ pg_swapm:1, /* SW: page must be forced to swap */ :4, pg_g:1, /* HW: ignore pid bit */ pg_v:1, /* HW: valid bit */ pg_m:1, /* HW: modified (dirty) bit */ pg_n:1, /* HW: non-cacheable bit */ pg_pfnum:20; /* HW: core page frame number or 0 */ #endif MIPSEL };
raw
). Les zones de swap sont alloués par la structure
dmap
, où l'adresse donne la taille.
exec
,
le chargement peut se faire à la demande (page par page). Le principe
de la pagination est que les processus ont une certaine localité
dans les références mémoire, et ne se servent pas de toute leur
mémoire de manière aléatoire. Plus, l'espace de travail (
working set) est petit, meilleure sera le comportement paginé.
cmap[]
(core map) décrit l'état
des pages physiques.
pageout()
l'algorithme CLOCK (Corbato
) qui consiste à
faire le tour de la mémoire de tous les processus et repérer les
pages LRU. On fait le tour de la mémoire en invalidant les pages,
puis on regarde au 2ème tour si les pages sont toujours invalides.
Si oui, la page est marquée comme candidate à la libération.
Le surcoût du à l'invalidation des pages est faible. D'autres
algorithmes à plusieurs mains sont plus efficaces.
TIT .OPERATING SYSTEMS THEORY AUT .Coffman, Edward G. / Denning, Peter J. COL .Prentice Hall series in automatic computation DAT .1973
1 1 1 2 3 4 1 2 5 2 2 3 4 1 2 5 3 3 4 1 2 5 3 4 1 1 1 1 2 3 4 5 1 2 2 2 2 3 4 5 1 2 3 3 3 4 5 1 2 3 4 4 5 1 2 3 4 5
Belady
(largest forwardly used)
proc
de tcsh
p_link = 0x801853e0, /* linked list of running processes */ p_rlink = 0x0, p_nxt = 0x801862a0, /* linked list of allocated proc slots */ p_prev = 0x801856ac, /* also zombies, and free proc's */ p_addr = 0x801933d8, /* u-area kernel map address */ p_tlbpid = -1, /* tlb context of this proc */ p_tlbinfo = 0x801940d8, /* per process tlb mappings */ p_tlbindx = 3, /* index of next wired tlb entry to use */ p_usrpri = 57, /* user-priority based on p_cpu and p_nice */ p_pri = 40, /* priority, negative is high */ p_cpu = 31, /* cpu usage for scheduling */ p_stat = 1, p_time = 127, /* resident time for scheduling */ p_nice = 20, /* nice for cpu usage */ p_slptime = 127, /* time since last block */ p_cursig = 0, p_sig = 0, /* signals pending to this process */ p_sigmask = 2, /* current signal mask */ p_sigignore = 0x324004, /* signals being ignored */ p_sigcatch = 0x9882003, /* signals being caught by user */ p_flag = 0x13008201, p_uid = 20059, /* signals being caught by user */ p_suid = 20059, /* saved set uid, used to direct tty signals */ p_sgid = 0, /* saved set group id, used by setgid */ p_pgrp = 3855, /* name of process group leader */ p_pid = 3855, /* unique process id */ p_ppid = 3850, /* process id of parent */ p_xstat = 0, /* Exit status for wait */ p_progenv = 0, /* process compatibility mode */ p_sid = 0, /* session id (for POSIX job control) */ p_ru = 0x0, /* mbuf holding exit information */ p_poip = 0, /* page outs in progress */ p_tsize = 51, /* size of text (clicks) */ p_dsize = 21, /* size of data space (clicks) */ p_textstart = 0x400000, /* starting address of user text */ p_datastart = 0x10000000, /* starting address of user data */ p_textpt = 1, /* copy of text page table size */ p_datapt = 1, /* copy of data page table size */ p_stakpt = 1, /* copy of stack page table size */ p_ssize = 9, /* copy of stack size (clicks) */ p_rssize = 9, /* current resident set size in clicks */ p_maxrss = 3187, /* copy of u.u_limit[MAXRSS] */ p_swrss = 0, /* resident set size before last swap */ p_swaddr = 5856, /* disk address of u area when swapped */ /* event process is awaiting */ p_wchan = 0xffffc000<Address 0xffffc000 out of bounds>, p_textp = 0x801976e8, /* pointer to text structure */ p_textbr = 0xc2a0e000,/* text page table base */ p_databr = 0xc2a51000,/* data page table base */ p_stakbr = 0xc2a52000,/* stack page table base */ p_xlink = 0x801853e0, /* linked list of procs sharing same text */ p_cpticks = 0, /* ticks of cpu time */ p_pctcpu = 0, /* %cpu for this process during p_time */ p_fp = 0, /* generate SIGFPE on all fp interrupts */ p_puac = 1, /* print/don't print unaligned access msgs */ p_ndx = 28, /* proc index for memall (because of vfork) */ p_idhash = 0, /* hashed based on p_pid for kill+exit+... */ p_pptr = 0x8018460c, /* pointer to process structure of parent */ p_cptr = 0x80184aa8, /* pointer to youngest living child */ p_osptr = 0x0, /* pointer to older sibling processes */ p_ysptr = 0x0, /* pointer to younger siblings */ p_realtimer = { it_interval = {tv_sec = 0, tv_usec = 0}, it_value = {tv_sec = 0, tv_usec = 0}}, p_quota = 0x801bd0c0, /* quotas for this process */ p_ttyp = 0x8014cab0, /* controlling tty pointer */ p_smsize = 0, /* size of SM space (clicks) */ p_smbeg = 0, /* page table offset of first SMS */ p_smend = 0, /* page table offset of end of */ /* last attached SMS. */ p_smcount = 0, /* count of SMS attached to proc */ p_sm = 0x0} /* shared memory related elements */
proc
de tcsh
(gdb) p jjl_mproc_p_tlbinfo $4 = { { lo = { tl_struct = { tls_g = 0, /* match any pid */ tls_v = 1, /* valid */ tls_d = 1, /* dirty (actually writeable) */ tls_n = 0, /* non-cacheable */ tls_pfn = 3424}},/* physical page frame number */ hi = { th_struct = { ths_pid = 0, ths_vpn = 0xffc01}}}, /* virtual page number */ { lo = { tl_struct = {tls_g = 0, tls_v = 1, tls_d = 1, tls_n = 0, tls_pfn = 2759}}, hi = { th_struct = {ths_pid = 0, ths_vpn = 0xffdff}}}, { lo = { tl_struct = {tls_g = 0, tls_v = 1, tls_d = 1, tls_n = 0, tls_pfn = 2756}}, hi = { th_struct = {ths_pid = 0, ths_vpn = 0xffc40}}}, 0 ptes <repeats 5 times>}
proc
de tcsh
(gdb) p jjl_mproc_p_textp $5 = { x_free = { /* linked list of free text structures */ xun_freef = 0x80196f58, xun_freeb = 0x80197a2c}, x_daddr = 0xc37c9c00, /* disk addresses of DMTEXT-page segments */ x_ptdaddr = 1208, /* disk address of page table */ x_size = 51, /* size (clicks) */ x_caddr = 0x80185a54, /* ptr to linked proc, if loaded */ x_gptr = 0x80170210, /* gnode of prototype */ x_rssize = 39, /* size in clicks */ x_swrss = 0, /* swapped rss */ x_count = 7, /* reference count */ x_ccount = 5, /* number of loaded references */ x_lcount = 0, /* number of processes locking segment */ x_poip = 0, /* page out in progress count */ x_flag = 34} /* traced, written flags */
proc
de tcsh
(gdb) p jjl_mproc_p_textbr $6 = { { pg_prot = 2, pg_fod = 0, pg_swapm = 1, pg_g = 0, pg_v = 1, pg_m = 1, pg_n = 0, pg_pfnum = 1980}, { pg_prot = 2, pg_fod = 0, pg_swapm = 1, pg_g = 0, pg_v = 1, pg_m = 1, pg_n = 0, pg_pfnum = 1277}, { pg_prot = 2, pg_fod = 0, pg_swapm = 1, pg_g = 0, pg_v = 1, pg_m = 1, pg_n = 0, pg_pfnum = 2270}, { pg_prot = 2, pg_fod = 0, pg_swapm = 1, pg_g = 0, pg_v = 1, pg_m = 1, pg_n = 0, pg_pfnum = 3800}, { pg_prot = 2, pg_fod = 0, pg_swapm = 1, pg_g = 0, pg_v = 1, pg_m = 1, pg_n = 0, pg_pfnum = 3168}, { pg_prot = 2, pg_fod = 0, pg_swapm = 1, pg_g = 0, pg_v = 1, pg_m = 1, pg_n = 0, pg_pfnum = 1153}, { pg_prot = 2, pg_fod = 0, pg_swapm = 1, pg_g = 0, pg_v = 1, pg_m = 1, pg_n = 0, pg_pfnum = 978}, { pg_prot = 2, pg_fod = 0, pg_swapm = 1, pg_g = 0, pg_v = 1, pg_m = 1, pg_n = 0, pg_pfnum = 3939}, ... { pg_prot = 0, pg_fod = 0, pg_swapm = 0, pg_g = 0, pg_v = 0, pg_m = 0, pg_n = 0, pg_pfnum = 0} <repeats 49 times>}
proc
de tcsh
(gdb) p jjl_mproc_p_databr $7 = { { pg_prot = 3, pg_fod = 0, pg_swapm = 0, pg_g = 0, pg_v = 0, pg_m = 0, pg_n = 0, pg_pfnum = 2036}, { pg_prot = 3, pg_fod = 0, pg_swapm = 0, pg_g = 0, pg_v = 0, pg_m = 0, pg_n = 0, pg_pfnum = 0}, { pg_prot = 3, pg_fod = 0, pg_swapm = 0, pg_g = 0, pg_v = 0, pg_m = 0, pg_n = 0, pg_pfnum = 3201}, { pg_prot = 3, pg_fod = 0, pg_swapm = 0, pg_g = 0, pg_v = 0, pg_m = 0, pg_n = 0, pg_pfnum = 3474}, ... { pg_prot = 3, pg_fod = 0, pg_swapm = 0, pg_g = 0, pg_v = 0, pg_m = 0, pg_n = 0, pg_pfnum = 0} <repeats 32 times>, { pg_prot = 0, pg_fod = 0, pg_swapm = 0, pg_g = 0, pg_v = 0, pg_m = 0, pg_n = 0, pg_pfnum = 0} <repeats 49 times>} (gdb) p jjl_mproc_p_stakbr $8 = { { pg_prot = 0, pg_fod = 0, pg_swapm = 0, pg_g = 0, pg_v = 0, pg_m = 0, pg_n = 0, pg_pfnum = 0} <repeats 100 times>}
user
de tcsh
$1 = { u_pcb = { /* General purpose registers saved at context switch time. */ pcb_regs = { -2145890728, 257, 268516740, 2, 13, 26, 2147473756, 0, -8532, 0, -2146586220, 65281, 0 <repeats 20 times>}, pcb_pc = 0, pcb_sr = 0, /* Misc. */ pcb_sswap = 0x0, /* ptr for non-local resume */ pcb_resched = 1, /* non-zero if time to resched */ pcb_sstep = 0, /* non-zero if single stepping */ /* single step state info */ pcb_ssi = { ssi_cnt = 0, ssi_bp = { {bp_addr = 0x0, bp_inst = 0}, {bp_addr = 0x0, bp_inst = 0}}}, /* These are use in branch delay instruction emulation * pcb_bd_epc = 0, /* epc register */ pcb_bd_cause = 0, /* cause register */ pcb_bd_ra = 0, /* address to return to if doing bd emulation */ pcb_bd_instr = 0, /* the branch instr for the bd emulation */ /* This is use in fp instruction emulation */ pcb_softfp_pc = 0,/* resulting pc after fp emulation */ /* Space for the state of all the potential coprocessors. WASTEFUL!*/ pcb_fpregs = { 0 <repeats 32 times>}, /* floating point */ pcb_fpc_csr = 0, /* floating point control and status reg */ pcb_fpc_eir = 789, /* floating point exception instruction reg */ pcb_ownedfp = -2146258800, /* has owned fp at one time */ pcb_c2regs = {0 <repeats 32 times>}, /* TBD */ pcb_c3regs = {0 <repeats 32 times>}}, /* TBD */}
user
de tcsh
u_procp = 0x80184e58, /* pointer to proc structure */ u_ar0 = 0xffffdf5c, /* pointer to proc structure */ u_comm = {"tcsh", '\000' <repeats 252 times>}, /* syscall parameters, results and catches */ u_arg = { /* arguments to current system call */ 2, 0, 2147477928, 4353624, 0, 0, 0, 0}, u_ap = 0xffffc350, u_qsave = { /* for non-local gotos on interrupts */ val = { 536936252, 257, 268516740, 2, 13, 26, 2147473756, 0, -8428, 0, -2146578536, 536936253}}, u_r = { /* syscall return values */ u_rv = { R_val1 = 0, R_val2 = 4}, r_off = 0, r_time = 0}, u_error = 0 '\000', /* return error code */ u_eosys = 3 '\003', /* special action on end of syscall */ u_narg = 0 '\000', /* number of arguments in u_arg above */ /* 1.1 - processes and protection */ u_cred = 0xc3790f00, /* user credentials (uid, gid, etc) */ /* 1.2 memory management */ u_tsize = 51, /* text size (clicks) */ u_dsize = 21, /* data size (clicks) */ u_ssize = 9, /* stack size (clicks) */
user
de tcsh
/* disk map for data segment */ u_dmap = { dm_size = 168, /* current size used by process */ dm_alloc = 224,/* amount of physical swap space allocated */ dm_map = { /* first disk block number in each chunk */ 6752, 25968, 44016, 0 <repeats 45 times>}}, u_smap = { /* disk map for stack segment */ dm_size = 72, /* current size used by process */ dm_alloc = 96, /* amount of physical swap space allocated */ dm_map = { /* first disk block number in each chunk */ 21800, 45136, 0 <repeats 46 times>}}, u_cdmap = { /* shadows of u_dmap, u_smap, for use of parent during fork */ dm_size = 168, dm_alloc = 224, dm_map = {44240, 44272, 44336, 0 <repeats 45 times>}}, u_csmap = { dm_size = 72, dm_alloc = 96, dm_map = {45040, 45072, 0 <repeats 46 times>}}, /* label variable for swapping */ u_ssave = { /* label variable for swapping */ val = {-2146809824, -2145890728, 5, 268490372, 0, 26, 2147473492, 0, -8684, 0, -2146779520, 65341}}, u_odsize = 63, u_ossize = 2, /* for (clumsy) expansion swaps */ u_outime = 0, /* user time at last sample */
user
de tcsh
/* 1.3 - signal management */ u_signal = {}, /* disposition of signals */ u_sigmask = { /* signals to be blocked */ 0, 16384, 0 <repeats 13 times>, 1, 0 <repeats 16 times>}, u_sigonstack = 0, /* signals to take on sigstack */ u_sigintr = 0, /* signals that interrupt syscalls */ u_oldsig = 0, /* POSIX "old stylë signals */ u_oldmask = 524290, /* saved mask from before sigpause */ u_code = 0, /* ``code'' to trap */ u_sigstack = { /* sp & on stack state variable */ ss_sp = 0x0, ss_onstack = 0}, u_sigtramp = 0x426e58 <_atod+51800>, /* signal trampoline code */ u_trapcause = 0, /* cause for SIGTRAP */ u_trapinfo = 0, /* extra info concerning SIGTRAP */ /* 1.4 - descriptor management */ u_ofile = {}, /* file structures for open files */ u_pofile = { /* per-process flags of open files */ '\000' <repeats 15 times>, "\001\001\001\001\001", '\000' <repeats 44 times>}, u_lastfile = 19, /* highest numbered open file */ u_cdir = 0x8017d314, /* current directory */ u_rdir = 0x0, /* root directory of current process */ u_ttyd = 1543, /* controlling tty dev */ u_cmask = 18, /* mask for file creation */
/* 1.5 - timing and statistics */ u_ru = { /* stats for this proc */ ru_utime = { /* user time used */ tv_sec = 0, tv_usec = 187488}, ru_stime = { /* system time used */ tv_sec = 0, tv_usec = 507780}, ru_maxrss = 148, ru_ixrss = 8462, /* integral shared text size */ ru_ismrss = 0, /* integral shared memory size*/ ru_idrss = 4825, /* integral unshared data " */ ru_isrss = 0, /* integral unshared stack " */ ru_minflt = 15, /* page reclaims */ ru_majflt = 3, /* page faults */ ru_nswap = 0, /* swaps */ ru_inblock = 2, /* block input operations */ ru_oublock = 1, /* block output operations */ ru_msgsnd = 0, /* messages sent */ ru_msgrcv = 0, /* messages received */ ru_nsignals = 7, /* signals received */ ru_nvcsw = 120, /* voluntary context switches */ ru_nivcsw = 50}, /* involuntary " */ u_cru = { /* sum of stats for reaped children */ ru_utime = { tv_sec = 0, tv_usec = 31248}, ru_stime = { tv_sec = 0, tv_usec = 320292}, ru_maxrss = 64, ru_ixrss = 1505, ru_ismrss = 0, ru_idrss = 930, ru_isrss = 0, ru_minflt = 20, ru_majflt = 13, ru_nswap = 0, ru_inblock = 0, ru_oublock = 0, ru_msgsnd = 0, ru_msgrcv = 0, ru_nsignals = 2, ru_nvcsw = 27, ru_nivcsw = 22},
u_timer = { {it_interval = {tv_sec = 0, tv_usec = 0}, it_value = {tv_sec = 0, tv_usec = 0}}, {it_interval = {tv_sec = 0, tv_usec = 0}, it_value = {tv_sec = 0, tv_usec = 0}}, {it_interval = {tv_sec = 0, tv_usec = 0}, it_value = {tv_sec = 0, tv_usec = 0}}}, u_tracedev = 0, /* for syscall tracer */ u_XXX = {0, 0, 0}, u_start = { /* whole timeval instead of secs */ tv_sec = 700477846, tv_usec = 72145}, u_acflag = 2, /* 1.6 - resource controls */ u_rlimit = { {rlim_cur = 2147483647, rlim_max = 2147483647}, {rlim_cur = 2147483647, rlim_max = 2147483647}, {rlim_cur = 88051712, rlim_max = 88051712}, {rlim_cur = 524288, rlim_max = 88051712}, {rlim_cur = 2147483647, rlim_max = 2147483647}, {rlim_cur = 13053952, rlim_max = 13053952}}, u_quota = 0x801bd0c0, u_qflags = 0, u_prof = { pr_base = 0x0, pr_size = 0, pr_off = 0, pr_scale = 0}, /* 1.7 - System V related elements */ u_smsize = 0, /* size of SM space (clicks) */ u_osmsize = 0, /* old SM size for expansion swaps */ u_lock = 0, /* memory locking flags (see ../h/lock.h) */
user
de tcsh
/* 1.9 - Namei caching */ u_ncache = { /* last successful directory search */ nc_prevoffset = 4564, /* offset at which last entry found */ nc_inumber = 16384, /* inum of cached directory */ nc_dev = 2062, /* dev of cached directory */ nc_time = 700477853}, /* not used */ u_nd = { ni_dirp = 0xc373ac00<Address 0xc373ac00 out of bounds>, ni_nameiop = 64, ni_error = 0, ni_endoff = 0, ni_slcnt = 1, ni_pdir = 0x8016eeec, ni_iovec = { iov_base = 0x0, iov_len = 0}, ni_uio = { uio_iov = 0x0, uio_iovcnt = 0, uio_offset = 24, uio_segflg = 0, uio_resid = 0, uio_flag = 0}, ni_dent = { gd_ino = 2019, gd_reclen = 16, gd_namelen = 3, gd_name = { "a7\000a\000lly\000s\000kpriss", '\000' <repeats 239 times>}}, ni_bufp = 0x0, ni_cp = 0xc373ac1a}, u_stack = { 0}}
libc{/
mips/sys/*.s}
TLB_MISS
ou interruption horloge toutes les 10ms.
sys/mips/asm.h
et dans libc{/
gen/getpid.s}
/* * SYSCALL -- standard system call sequence * The kernel expects arguments to be passed with the normal C calling * sequence. v0 should contain the system call number. On return from * the kernel mode, a3 will be 0 to indicate no error and non-zero to * indicate an error; if an error occurred v0 will contain an errno. */ #define SYSCALL(x) \ LEAF(x); \ li v0,SYS_/**/x; \ syscall; \ beq a3,zero,9f; \ j _cerror; \ 9: ... SYSCALL(getpid) RET .end getpid2- on reçoit avec des tables toutes prêtes des numéros d'appels-système et l'entrée 0x800000xx qui contient le début du programme
exception
(cf sys/h/systm.h
, sys/init_sysent.c
, sys/mips/locore.s
.
extern struct sysent { int sy_narg; /* total number of arguments */ int (*sy_call)(); /* handler */ int sy_mpsafe; /* safe for multiprocessing use */ } sysent[]; ... /* * System call switch table. */ struct sysent sysent[] = { 0, nosys, 0, /* 0 = indir */ 1, rexit, 0, /* 1 = exit */ ... 0, getpid, 1, /* 20 = getpid */ }; ... VECTOR(exception, M_EXCEPT) # Copied down to 0x80000080 .set noreorder .set noat /* * WARNING!!!! * Address calculation here assumes that 16 bit offset on lw * do es NOT have sign bit set! Do not use PUTMSG or any other * message routines from here (can destroy stack). */ lui k0,kstackflag>>16 lw k0,+kstackflag&0xffff(k0) nop beq k0,zero,1f move k0,AT # BDSLOT: save at .set at .set reorder sw sp,EF_SP*4-EF_SIZE(sp) subu sp,EF_SIZE b 2f /* * Came from user mode or utlbmiss, initialize kernel stack */ 1: sw sp,KERNELSTACK-EF_SIZE+EF_SP*4 la sp,KERNELSTACK-EF_SIZE sw gp,EF_GP*4(sp) la gp,_gp sw gp,kstackflag # now on kernel stack (gp always != 0) /* * This instruction stream can be cleaned up somewhat for write stalls, * but for now, left as is so its readable when debugging */ 2: sw k0,EF_AT*4(sp) sw k1,EF_K1*4(sp) # in case we came from utlbmiss sw a0,EF_A0*4(sp) mfc0 a0,C0_EPC sw a1,EF_A1*4(sp) sw a2,EF_A2*4(sp) sw a3,EF_A3*4(sp) mfc0 a3,C0_CAUSE sw a3,EF_CAUSE*4(sp) sw s0,EF_S0*4(sp) sw ra,EF_RA*4(sp) mfc0 s0,C0_SR sw a0,EF_EPC*4(sp) /* * Dispatch to appropriate exception handler * Register setup: * s0 -- SR register at time of exception * a0 -- exception frame pointer * a1 -- cause code * a3 -- cause register */ and a1,a3,CAUSE_EXCMASK lw a2,causevec(a1) move a0,sp .set noreorder j a2 sw s0,EF_SR*4(sp) # should clear PE in s0 after here .set reorder EXPORT(eexception) END(exception)3- on a trouvé qu'il s'agissait d'un appel-système grâce à cette table de
sys/mips/trap.c
.
int (*causevec[16])() = { /* 0: EXC_INT */ VEC_int, /* 1: EXC_MOD */ VEC_tlbmod, /* 2: EXC_RMISS */ VEC_tlbmiss, /* 3: EXC_WMISS */ VEC_tlbmiss, /* 4: EXC_RADE */ VEC_addrerr, /* 5: EXC_WADE */ VEC_addrerr, /* 6: EXC_IBE */ VEC_ibe, /* 7: EXC_DBE */ VEC_dbe, /* 8: EXC_SYSCALL */ VEC_syscall, /* 9: EXC_BREAK */ VEC_breakpoint, /* 10: EXC_II */ VEC_trap, /* 11: EXC_CPU */ VEC_cpfault, /* 12: EXC_OV */ VEC_trap, /* 13: undefined */ VEC_unexp, /* 14: undefined */ VEC_unexp, /* 15: undefined */ VEC_unexp };4- on part exécuter le code des appels-système dans
sys/mips/locore.s
et dans {sys/mips/trap.c}.
VECTOR(VEC_syscall, M_SYSCALLSAVE) #endif or a1,s0,SR_IEC # enable interrupts mtc0 a1,C0_SR sw v0,EF_V0*4(sp) # u_rval1 sw v1,EF_V1*4(sp) # u_rval2 move a1,v0 # arg2 -- syscall number move a2,s0 # arg3 -- sr jal syscall # syscall(ef_ptr, sysnum, sr, cause) bne v0,zero,full_restore # doing a sigreturn lw v0,EF_V0*4(sp) # u_rval1 lw v1,EF_V1*4(sp) # u_rval2 mtc0 s0,C0_SR # disable interrupts b exception_exit END(VEC_syscall) ... syscall(ep, code, sr, cause) register u_int *ep; u_int code; { register regparams, nargs; register struct sysent *callp; register struct proc *p; register int *params; int opc; struct timeval syst; extern char *syscallnames[]; cnt.v_syscall++; syst = u.u_ru.ru_stime; u.u_error = 0; if (!USERMODE(sr)) panic("syscall"); opc = ep[EF_EPC]; ep[EF_EPC] += 4; /* normal return executes next inst */ XPRINTF(XPR_SYSCALL, "syscall %n upc 0x%x\n", code, syscall_values, opc, 0); regparams = EF_A0; if (code >= nsysent) { callp = &sysent[0]; /* indir (illegal) */ ep[EF_EPC] = opc; /* just leave pc at the syscall inst */ } else { callp = &sysent[code]; if (callp == sysent) { /* * indirect system call (syscall), first param is * sys call number */ code = ep[EF_A0]; regparams++; if (code >= nsysent) callp = &sysent[0]; else callp = &sysent[code]; } } u.u_eosys = NORMALRETURN; params = (int *)u.u_arg; for (nargs = callp->sy_narg; nargs && regparams <= EF_A3; nargs--) *params++ = ep[regparams++]; if (nargs) { u.u_error = copyin(ep[EF_SP]+4*sizeof(int), params, nargs*sizeof(int)); if (u.u_error) goto done; } u.u_r.r_val1 = 0; u.u_r.r_val2 = ep[EF_V1]; if (setjmp(&u.u_qsave)) { if (u.u_error == 0 && u.u_eosys != RESTARTSYS) u.u_error = EINTR; } else { (*(callp->sy_call))(u.u_arg); } done: /* * a3 is returned to user 0 if indicate no errors on syscall, * non-zero otherwise */ if (u.u_eosys == NORMALRETURN) { if (u.u_error) { ep[EF_V0] = u.u_error; ep[EF_A3] = 1; } else { ep[EF_V0] = u.u_r.r_val1; ep[EF_V1] = u.u_r.r_val2; ep[EF_A3] = 0; } } else if (u.u_eosys == RESTARTSYS) ep[EF_EPC] = opc; /* else if (u.u_eosys == FULLRESTORE) */ /* returning from sigreturn, force full state restore */ p = u.u_procp; if (p->p_cursig || ISSIG(p)) psig(); p->p_pri = p->p_usrpri; if (runrun) { /* * Since we are u.u_procp, clock will normally just change * our priority without moving us from one queue to another * (since the running process is not on a queue.) * If that happened after we setrq ourselves but before we * swtch()'ed, we might not be on the queue indicated by * our priority. */ (void) splclock(); setrq(p); u.u_ru.ru_nivcsw++; swtch(); } /* * if single stepping this process, install breakpoints before * returning to user mode. Do this here rather than in procxmt * so single stepping will work when signals are delivered. */ if (u.u_pcb.pcb_sstep) install_bp(); if (u.u_prof.pr_scale) { int ticks = times_to_ticks(&u.u_ru.ru_stime,&syst); if (ticks) addupc(ep[EF_EPC], &u.u_prof, ticks); } cpudata[0].c_curpri = p->p_pri; /* * if u_eosys == FULLRESTORE, then force full state restore */ return (u.u_eosys == FULLRESTORE); }5- on fait enfin la chose demandée dans l'appel-système! cf
sys/sys/kern_prot.c
getpid() { u.u_r.r_val1 = u.u_procp->p_pid; u.u_r.r_val2 = u.u_procp->p_ppid; }6- on repart dans
sys/mips/locore.s
/* * End of exception processing. Interrupts should be disabled. */ VECTOR(exception_exit, M_EXCEPT) /* * ENTRY CONDITIONS: * Interrupts Disabled * s0 contains sr at time of exception * * If we are returning to user mode, check to see if a resched is * desired. If so, fake a RESCHED cause bit and let trap save/restore * our state for us. */ and k0,s0,SR_KUP beq k0,zero,2f # returning to kernel mode lw k0,u+PCB_RESCHED beq k0,zero,1f # no resched requested move a0,sp li a1,SEXC_RESCHED # software exception lw a3,EF_CAUSE*4(sp) b VEC_trap 1: sw zero,kstackflag # switching to user stack lw gp,EF_GP*4(sp) 2: lw a0,EF_A0*4(sp) lw a1,EF_A1*4(sp) lw a2,EF_A2*4(sp) lw a3,EF_A3*4(sp) lw s0,EF_S0*4(sp) lw ra,EF_RA*4(sp) lw k0,EF_EPC*4(sp) lw k1,EF_SR*4(sp) mtc0 k1,C0_SR # PE BIT .set noreorder .set noat lw AT,EF_AT*4(sp) lw sp,EF_SP*4(sp) j k0 c0 C0_RFE .set at .set reorder END(exception_exit)7- Si l'appel système a plus de 4 arguments, les arguments supplémentaires sont comme en C passés dans la pile (utilisateur). Or on est maintenant en mode maître du processeur avec une autre pile. Il faut donc recopier des choses de l'espace utilisateur vers l'espace noyau. C'est fait par
copyin()
de
sys/mips/usercopy.s
. Cette procédure est incroyablement
compliquée, car elle peut avoir à copier de gros objets et est
super optimisée.
/* * copyin(user_src, kernel_dst, bcount) */ COPYIOFRM= (4*4)+4 # 4 arg saves plus ra NESTED(copyin, COPYIOFRM, zero) subu sp,COPYIOFRM sw ra,COPYIOFRM-4(sp) bltz a0,cerror .set noreorder li v0,NF_COPYIO sw v0,nofault # branch delay slot jal bcopy nop sw zero,nofault .set reorder move v0,zero lw ra,COPYIOFRM-4(sp) addu sp,COPYIOFRM j ra END(copyin) /* * copyout(kernel_src, user_dst, bcount) */ NESTED(copyout, COPYIOFRM, zero) subu sp,COPYIOFRM sw ra,COPYIOFRM-4(sp) bltz a1,cerror .set noreorder li v0,NF_COPYIO sw v0,nofault # branch delay slot jal bcopy nop sw zero,nofault .set reorder move v0,zero lw ra,COPYIOFRM-4(sp) addu sp,COPYIOFRM j ra END(copyout) NESTED(cerror, COPYIOFRM, zero) li v0,EFAULT lw ra,COPYIOFRM-4(sp) addu sp,COPYIOFRM j ra END(cerror) /* * bcopy(src, dst, bcount) * * NOTE: the optimal copy here is somewhat different than for the user-level * equivalents (bcopy in 4.2, memcpy in V), because: * 1) it frequently acts on uncached data, especially since copying from * (uncached) disk buffers into user pgms is high runner. * This means one must be careful with lwl/lwr/lb - don't expect cache help. * 2) the distribution of usage is very different: there are a large number * of bcopies for small, aligned structures (like for ioctl, for example), * a reasonable number of randomly-sized copies for user I/O, and many * bcopies of large (page-size) blocks for stdio; the latter must be * well-tuned, hence the use of 32-byte loops. * 3) this is much more frequently-used code inside the kernel than outside * * Overall copy-loop speeds, by amount of loop-unrolling: assumptions: * a) low icache miss rate (this code gets used a bunch) * b) large transfers, especially, will be word-alignable. * c) Copying speeds (steady state, 0% I-cache-miss, 100% D-cache Miss): * d) 100% D-Cache Miss (but cacheable, so that lwl/lwr/lb work well) * Config Bytes/ Cycles/ Speed (VAX/780 = 1) * Loop Word * 08V111 35 0.71X (8MHz, BUS, 1-Deep WB, 1-way ILV) * 4 15 1.67X * 8/16 13.5 1.85X * 32/up 13.25 1.89X * 08MM44 1 26 0.96X (8MHz, MEM, 4-Deep WB, 4-way ILV) * 4 9 2.78X * 8 7.5 3.33X * 16 6.75 3.70X * 32 6.375 3.92X (diminishing returns thereafter) * * MINCOPY is minimum number of byte that its worthwhile to try and * align copy into word transactions. Calculations below are for 8 bytes: * Estimating MINCOPY (C = Cacheable, NC = Noncacheable): * Assumes 100% D-cache miss on first reference, then 0% (100%) for C (NC): * (Warning: these are gross numbers, and the code has changed slightly): * Case 08V11 08M44 * MINCOPY C NC C NC * 9 (1 byte loop) 75 133 57 93 * 8 (complex logic) * Aligned 51 51 40 40 * Alignable, * worst (1+4+3) 69 96 53 80 * Unalignable 66 93 60 72 * MINCOPY should be lower for lower cache miss rates, lower cache miss * penalties, better alignment properties, or if src and dst alias in * cache. For this particular case, it seems very important to minimize the * number of lb/sb pairs: a) frequent non-cacheable references are used, * b) when i-cache miss rate approaches zero, even the 4-deep WB can't * put successive sb's together in any useful way, so few references are saved. * To summarize, even as low as 8 bytes, avoiding the single-byte loop seems * worthwhile; some assumptions are probably optimistic, so there is not quite * as much disadvantage. However, the optimal number is almost certainly in * the range 7-12. * * a0 src addr * a1 dst addr * a2 length remaining */ #define MINCOPY 8 LEAF(bcopy) xor v0,a0,a1 # bash src & dst for align chk; BDSLOT blt a2,MINCOPY,bytecopy # too short, just byte copy and v0,NBPW-1 # low-order bits for align chk subu v1,zero,a0 # -src; BDSLOT bne v0,zero,unaligncopy # src and dst not alignable /* * src and dst can be simultaneously word aligned */ and v1,NBPW-1 # number of bytes til aligned subu a2,v1 # bcount -= alignment beq v1,zero,blkcopy # already aligned lwr v0,0(a0) swr v0,0(a1) addu a0,v1 # src += alignment addu a1,v1 # dst += alignment /* * 32 byte block, aligned copy loop (for big reads/writes) */ blkcopy: and a3,a2,~31 # total space in 32 byte chunks subu a2,a3 # count after by-32 byte loop done beq a3,zero,wordcopy # less than 32 bytes to copy addu a3,a0 # source endpoint 1: lw v0,0(a0) lw v1,4(a0) lw t0,8(a0) lw t1,12(a0) sw v0,0(a1) sw v1,4(a1) sw t0,8(a1) sw t1,12(a1) addu a0,32 # src+= 32; here to ease loop end lw v0,-16(a0) lw v1,-12(a0) lw t0,-8(a0) lw t1,-4(a0) sw v0,16(a1) sw v1,20(a1) sw t0,24(a1) sw t1,28(a1) addu a1,32 # dst+= 32; fills BD slot bne a0,a3,1b /* * word copy loop */ wordcopy: and a3,a2,~(NBPW-1) # word chunks subu a2,a3 # count after by word loop beq a3,zero,bytecopy # less than a word to copy addu a3,a0 # source endpoint 1: lw v0,0(a0) addu a0,NBPW sw v0,0(a1) addu a1,NBPW # dst += 4; BD slot bne a0,a3,1b b bytecopy /* * deal with simultaneously unalignable copy by aligning dst */ unaligncopy: subu a3,zero,a1 # calc byte cnt to get dst aligned and a3,NBPW-1 # alignment = 0..3 subu a2,a3 # bcount -= alignment beq a3,zero,partaligncopy # already aligned lwr v0,0(a0) # get whole word lwl v0,3(a0) # for sure swr v0,0(a1) # store right piece (1-3 bytes) addu a0,a3 # src += alignment (will fill LD slot) addu a1,a3 # dst += alignment /* * src unaligned, dst aligned loop * NOTE: if MINCOPY >= 7, will always do 1 loop iteration or more * if we get here at all */ partaligncopy: and a3,a2,~(NBPW-1) # space in word chunks subu a2,a3 # count after by word loop addu a3,a0 # source endpoint 1: lwr v0,0(a0) lwl v0,3(a0) addu a0,NBPW sw v0,0(a1) addu a1,NBPW bne a0,a3,1b /* * brute force byte copy loop, for bcount < MINCOPY + tail of unaligned dst * note that lwl, lwr, swr CANNOT be used for tail, since the lwr might * cross page boundary and give spurious address exception */ bytecopy: addu a3,a2,a0 # source endpoint; BDSLOT ble a2,zero,copydone # nothing left to copy, or bad length 1: lb v0,0(a0) addu a0,1 sb v0,0(a1) addu a1,1 # BDSLOT: incr dst address bne a0,a3,1b copydone: j ra END(bcopy)
getpid()
est la caricature de l'appel système. Pour
2 instructions, on a un overhead incroyable. Il faut toutefois
retenir que ca coûte de faire des tours dans le noyau.
if (issig()) psig();
runrun
est positionné. Ceci signifie que le scheduler
réclame qu'il y ait une commutation de processus.
sleep()
car une ressource dont il a besoin est
absente. Il peut abandonner le processeur, par swtch()
, car il
voit le drapeau runrun
positionné en fin d'appel-système.
hardclock()
de sys/sys/kern_clock
, le drapeau runrun
est
mis en appelant setpri()
de sys/sys/kern_synch
.
Et la procédure trap()
de sys/mips/trap.c
--- qui
correspond à syscall()
--- fait la commutation si le
processus interrompu est en mode utilisateur et si
runrun
est positionné, en fin de traitement de l'exception.
splclock()
, spltty()
, ... définies dans
sys/mips/locore.s
.
issig()
.
sleep()
en fonction de la
priorité d'endormissement (2ème argument de sleep()
).
sendsig()
de
sys/mips/mips/machdep.c
sigreturn()
.
kill()
et
psignal()
. Remarquons que kill()
parcourt
la table des proc
et positionne la présence du signal.
La partie ``signal'' d'un processus doit se trouver dans la structure
proc
et non dans la structure u
.
callout
sont mises en attente des interruptions
horloge.
callout
sont mis en ordre croissant le délai de réveil
et chacun stocke le temps incrémental. A chaque tick d'horloge,
on diminue le temps du premier, et on teste s'il est négatif ou nul.
/* * The callout structure is for * a routine arranging * to be called by the clock interrupt * (clock.c) with a specified argument, * in a specified amount of time. * Used, for example, to time tab * delays on typewriters. */ struct callout { int c_time; /* incremental time */ caddr_t c_arg; /* argument to routine */ int (*c_func)(); /* routine */ struct callout *c_next; };
p_usrpri
d'un processus descend de 1
tous les 4 ticks (4 * 1/60 s). La priorité ne
peut être plus grande que 127.
p_usrpri
= PUSER + Partie_Entière{p_cpu
/ 4} + 2 * p_nice
p_cpu
= {2 load} / {2 load + 1}
p_cpu
+ p_nice
schedcpu()
de
sys/sys/kern_synch.c
recalcule la priorité toutes les secondes, grâce
à un réveil
timeout(schedcpu, (caddr_t)0, hz);
omega(S) = max {fi(S) | 1 <= i <= n}
ou
omega'(S) = 1 / n Somme(wi * fi(S)) de 1 à N
TIT .COMPUTER & JOB/SHOP SCHEDULING THEORY AUT .Coffman, Edward G. COL .John Wiley & Sons, 1976
fork()
dans sys/sys/kern_fork.c
, qui
appelle newproc()
. Tout est copié (y compris la
structure u
). Quelques ajustement sont faits dans la
structure proc
.
exit()
dans sys/sys/kern_exit.c
relâce les ressources, sauve la structure u
pour que le
parent puisse la consulter, entre dans l'état ZOMBIE, réveille le
parent et réarrange les processus fils.
proc.h
/* stat codes */ #define SSLEEP 1 /* awaiting an event */ #define SWAIT 2 /* (abandoned state) */ #define SRUN 3 /* running */ #define SIDL 4 /* intermediate state in process creation */ #define SZOMB 5 /* intermediate state in process termination */ #define SSTOP 6 /* process being traced */ /* flag codes */ #define SLOAD 0x0000001 /* in core */ #define SSYS 0x0000002 /* swapper or pager process */ #define SLOCK 0x0000004 /* process being swapped out */ #define SSWAP 0x0000008 /* save area flag */ #define STRC 0x0000010 /* process is being traced */ #define SWTED 0x0000020 /* stpd proc not been waited for by parent */ #define SULOCK 0x0000040 /* user settable lock in core */ #define SPAGE 0x0000080 /* process in page wait state */ #define SKEEP 0x0000100 /* another flag to prevent swap out */ #define SOMASK 0x0000200 /* restore old mask after taking signal */ #define SWEXIT 0x0000400 /* working on exiting */ #define SPHYSIO 0x0000800 /* doing physical i/o (bio.c) */ #define SVFORK 0x0001000 /* process resulted from vfork() */ #define SVFDONE 0x0002000 /* another vfork flag */ #define SNOVM 0x0004000 /* no vm, parent in a vfork() */ #define SPAGI 0x0008000 /* init data space on demand, from inode */ #define SSEQL 0x0010000 /* user warned of sequential vm behavior */ #define SUANOM 0x0020000 /* user warned of random vm behavior */ #define STIMO 0x0040000 /* timing out during sleep */ /* was SDETACH */ #define SNOCLDSTP 0x0100000 /* POSIX child stop flag */ /* was SOUSIG */ #define SOWEUPC 0x0200000 /* owe process an addupc() call at next ast */ #define SSEL 0x0400000 /* selecting; wakeup/waiting danger */ #define SLOGIN 0x0800000 /* a login process (legit child of init) */ #define SPTECHG 0x1000000 /* ptés for process have changed */
sleep(chan, pri)
a 2 arguments. Le 1er argument est
une adresse mémoire quelconque, par exemple
sleep((caddr_t)&runin, PSWP);pour attendre avec une forte priorité qu'un swap in soit possible. Si la priorité est faible, on teste les signaux avant de s'endormir.
sleep()
faible, on
retourne grâce à un goto non local longjmp(&u.u_qsave)
vers la fin d'une entrée/sortie, ou vers la fin d'un wait()
.
swtch()
.
wakeup(chan)
réveille les processus endormis sur ce
canal. Le réveil est d'autant plus fort que la priorité est forte
(i.e. plus proche de 0). La priorité est fixée par le 2ème argument
de sleep()
. Si la priorité de réveil est faible, on teste les
signaux.
swtch()
se fait dans sys/mips/mips/swtch.c
qui
appelle 2 procédures save()
et resume()
de locore.s
qui
agissent comme des goto non locaux.