by Gu Wei
2021年10月
Architecture Lab 一共有三个部分。第一个部分让你手写Y86-64汇编代码,并利用提供的汇编器yas将.ys汇编成目标文件.yo,再用模拟器yis运行。第二部分让你给顺序执行的处理器增加iaddq指令。第三部分是建立在前两者之上,我们既要手写Y86-64汇编来从算法上提高程序性能,还要修改处理器来使它执行这个程序更快。
在开始之前,建议直接把Makefile中的GUIMODE、TKLIBS、TKINC三个全部注释掉,不使用GUI模式只用TTY模式。本人一开始也想要用GUI,于是下载了tcl、tk等工具,但是在编译的时候,发现tcl8.6会报错:error: ‘Tcl_Interp’ has no member named ‘result’。查了一下是tcl8.6相比tcl8.5把result给去掉了,使得编译不成功。准备下载tcl8.5,可是ubuntu20.04的包管理器似乎已经不提供tcl8.5了,只能去网上下载tcl8.5的deb文件再手动安装。但是嫌麻烦就不做了。
在seq/misc文件夹下,将三个C程序翻译成Y86-64汇编程序(*.ys)。利用提供的yas程序将.ys变成.yo目标文件(.yo这里是ASCII格式的,虽然严格意义上应该是二进制文件,但是这样才有可读性),再通过yis程序来模拟机器代码的运行,会输出内存和寄存器的变化。三个C程序分别是链表求和、递归链表求和以及拷贝内存块,其中handout要求的sample list以及copy block分别为:
x1# Sample linked list2.align 83ele1:4 .quad 0x00a5 .quad ele26ele2:7 .quad 0x0b08 .quad ele39ele3:10 .quad 0xc0011 .quad 012
13.align 814# Source block15src:16 .quad 0x00a17 .quad 0x0b018 .quad 0xc0019# Destination block20dest:21 .quad 0x11122 .quad 0x22223 .quad 0x333链表求和。C程序如下:
xxxxxxxxxx161/* linked list element */2typedef struct ELE {3 long val;4 struct ELE *next;5} *list_ptr;6
7/* sum_list - Sum the elements of a linked list */8long sum_list(list_ptr ls)9{10 long val = 0;11 while (ls) {12 val += ls->val;13 ls = ls->next;14 }15 return val;16}本人手写的汇编如下:
xxxxxxxxxx421# sum.ys2# Y86-64 for sum_list 3 .pos 04 irmovq stack, %rsp5 call main6 halt7
8
9 .align 810 ele1:11 .quad 0x00a12 .quad ele213 ele2:14 .quad 0x0b015 .quad ele316 ele3:17 .quad 0xc0018 .quad 019
20
21main:22 irmovq ele1,%rdi23 call sum_list24 ret25
26
27sum_list:28 irmovq $0x0,%rax29 jmp test30loop:31 mrmovq 0x0(%rdi),%r832 addq %r8,%rax33 mrmovq 0x8(%rdi),%rdi34test:35 andq %rdi,%rdi36 jne loop37 ret38
39
40 .pos 0x20041stack:42
利用yas生成.yo程序:
xxxxxxxxxx11❯ ./yas sum.yssum.yo如下:
xxxxxxxxxx411 | # Y86-64 for sum_list 20x000: | .pos 030x000: 30f40002000000000000 | irmovq stack, %rsp40x00a: 804800000000000000 | call main50x013: 00 | halt6 | 7 | 80x018: | .align 890x018: | ele1:100x018: 0a00000000000000 | .quad 0x00a110x020: 2800000000000000 | .quad ele2120x028: | ele2:130x028: b000000000000000 | .quad 0x0b0140x030: 3800000000000000 | .quad ele3150x038: | ele3:160x038: 000c000000000000 | .quad 0xc00170x040: 0000000000000000 | .quad 018 | 19 | 200x048: | main:210x048: 30f71800000000000000 | irmovq ele1,%rdi220x052: 805c00000000000000 | call sum_list230x05b: 90 | ret24 | 25 | 260x05c: | sum_list:270x05c: 30f00000000000000000 | irmovq $0x0,%rax280x066: 708500000000000000 | jmp test290x06f: | loop:300x06f: 50870000000000000000 | mrmovq 0x0(%rdi),%r8310x079: 6080 | addq %r8,%rax320x07b: 50770800000000000000 | mrmovq 0x8(%rdi),%rdi330x085: | test:340x085: 6277 | andq %rdi,%rdi350x087: 746f00000000000000 | jne loop360x090: 90 | ret37 | 38 | 390x200: | .pos 0x200400x200: | stack:41
利用yis查看结果:
xxxxxxxxxx101❯ ./yis sum.yo2Stopped in 26 steps at PC = 0x13. Status 'HLT', CC Z=1 S=0 O=03Changes to registers:4%rax: 0x0000000000000000 0x0000000000000cba5%rsp: 0x0000000000000000 0x00000000000002006%r8: 0x0000000000000000 0x0000000000000c007
8Changes to memory:90x01f0: 0x0000000000000000 0x000000000000005b100x01f8: 0x0000000000000000 0x0000000000000013可以看出返回值%rax为0xcba,没毛病!
递归版链表求和。c程序如下:
xxxxxxxxxx171/* linked list element */2typedef struct ELE {3 long val;4 struct ELE *next;5} *list_ptr;6
7/* rsum_list - Recursive version of sum_list */8long rsum_list(list_ptr ls)9{10 if (!ls)11 return 0;12 else {13 long val = ls->val;14 long rest = rsum_list(ls->next);15 return val + rest;16 }17}本人手写的汇编如下:
xxxxxxxxxx431# rsum.ys2# Y86-64 for rsum_list3 .pos 04 irmovq stack, %rsp5 call main6 halt7
8
9 .align 810 ele1:11 .quad 0x00a12 .quad ele213 ele2:14 .quad 0x0b015 .quad ele316 ele3:17 .quad 0xc0018 .quad 019
20
21main:22 irmovq ele1, %rdi23 call rsum_list24 ret25
26
27rsum_list:28 pushq %r1229 irmovq $0x0, %rax30 andq %rdi, %rdi31 je end32 mrmovq 0x0(%rdi), %r1233 mrmovq 0x8(%rdi), %rdi34 call rsum_list35 addq %r12, %rax36end:37 popq %r1238 ret39
40
41 .pos 0x20042stack:43
测试有:
xxxxxxxxxx161❯ ./yas rsum.ys2❯ ./yis rsum.yo3Stopped in 42 steps at PC = 0x13. Status 'HLT', CC Z=0 S=0 O=04Changes to registers:5%rax: 0x0000000000000000 0x0000000000000cba6%rsp: 0x0000000000000000 0x00000000000002007
8Changes to memory:90x01b8: 0x0000000000000000 0x0000000000000c00100x01c0: 0x0000000000000000 0x0000000000000090110x01c8: 0x0000000000000000 0x00000000000000b0120x01d0: 0x0000000000000000 0x0000000000000090130x01d8: 0x0000000000000000 0x000000000000000a140x01e0: 0x0000000000000000 0x0000000000000090150x01f0: 0x0000000000000000 0x000000000000005b160x01f8: 0x0000000000000000 0x0000000000000013可以看出返回值%rax为0xcba,没毛病!
把src地址上的内存拷贝到dest地址上,返回拷贝值的异或。c代码如下:
xxxxxxxxxx121/* copy_block - Copy src to dest and return xor checksum of src */2long copy_block(long *src, long *dest, long len)3{4 long result = 0;5 while (len > 0) {6 long val = *src++;7 *dest++ = val;8 result ^= val;9 len--;10 }11 return result;12}本人写的汇编如下:
xxxxxxxxxx561# copy.ys2# Y86-64 for copy_block3 .pos 04 irmovq stack, %rsp5 call main6 halt7
8
9 .align 810# Source block11src:12 .quad 0x00a13 .quad 0x0b014 .quad 0xc0015# Destination block16dest:17 .quad 0x11118 .quad 0x22219 .quad 0x33320
21
22main:23 irmovq src, %rdi24 irmovq dest, %rsi25 irmovq $0x3, %rdx26 call copy_block27 ret28
29
30copy_block:31 pushq %r1232 pushq %r1333 pushq %r1434 irmovq $0x1, %r1235 irmovq $0x8, %r1336 irmovq $0x0, %rax37 jmp test38loop:39 mrmovq 0x0(%rdi), %r1440 addq %r13, %rdi41 rmmovq %r14, 0x0(%rsi)42 addq %r13, %rsi43 xorq %r14, %rax44 subq %r12, %rdx45test:46 andq %rdx, %rdx47 jg loop48 popq %r1449 popq %r1350 popq %r1251 ret52 53
54 .pos 0x10055stack:56
测试有:
xxxxxxxxxx151❯ ./yas copy.ys2❯ ./yis copy.yo3Stopped in 45 steps at PC = 0x13. Status 'HLT', CC Z=1 S=0 O=04Changes to registers:5%rax: 0x0000000000000000 0x0000000000000cba6%rsp: 0x0000000000000000 0x00000000000001007%rsi: 0x0000000000000000 0x00000000000000488%rdi: 0x0000000000000000 0x00000000000000309
10Changes to memory:110x0030: 0x0000000000000111 0x000000000000000a120x0038: 0x0000000000000222 0x00000000000000b0130x0040: 0x0000000000000333 0x0000000000000c00140x00f0: 0x0000000000000000 0x000000000000006f150x00f8: 0x0000000000000000 0x0000000000000013可见返回值%rax没有问题,dest处的0x111、0x222、0x333也改成了0x00a、0x0b0、0xc00。没毛病!
在sim/seq文件夹中,修改seq-full.hcl文件,实现iaddq指令
在书中Practice Problem 4.3找到iaddq指令,可见它是把一个八字节的常数V加到rB之中。

参考Figure 4.18的OPq和irmovq可以依葫芦画瓢得到iaddq在六个阶段的操作:
| Stage | iaddq V, rB |
|---|---|
| Fetch | icode:ifun <--- M1[PC] |
| | rA:rB <--- M1[PC+1] |
| | valC <--- M8[PC+2] |
| | valP <--- PC+10 |
| Decode | valB <--- R[rB] |
| Execute | valE <--- valB + valC;Set CC |
| Memory | |
| Write back | R[rB] <--- valE |
| PC update | PC <--- valP |
参考4.3.4节修改seq.hcl,增加iaddq有(只列出修改部分):
xxxxxxxxxx541################ Fetch Stage ###################################23bool instr_valid = icode in4{ INOP, IHALT, IRRMOVQ, IIRMOVQ, IRMMOVQ, IMRMOVQ,5IOPQ, IJXX, ICALL, IRET, IPUSHQ, IPOPQ, IIADDQ }; # 加入IIADDQ67# Does fetched instruction require a regid byte?8bool need_regids =9icode in { IRRMOVQ, IOPQ, IPUSHQ, IPOPQ,10IIRMOVQ, IRMMOVQ, IMRMOVQ, IIADDQ }; # 加入IIADDQ1112# Does fetched instruction require a constant word?13bool need_valC =14icode in { IIRMOVQ, IRMMOVQ, IMRMOVQ, IJXX, ICALL, IIADDQ }; # 加入IIADDQ1516################ Decode Stage ###################################1718## What register should be used as the B source?19word srcB = [20icode in { IOPQ, IRMMOVQ, IMRMOVQ, IIADDQ } : rB; # 加入IIADDQ21icode in { IPUSHQ, IPOPQ, ICALL, IRET } : RRSP;221 : RNONE; # Don't need register23];2425## What register should be used as the E destination?26word dstE = [27icode in { IRRMOVQ } && Cnd : rB;28icode in { IIRMOVQ, IOPQ, IIADDQ} : rB; # 加入IIADDQ29icode in { IPUSHQ, IPOPQ, ICALL, IRET } : RRSP;301 : RNONE; # Don't write any register31];323334################ Execute Stage ###################################3536## Select input A to ALU37word aluA = [38icode in { IRRMOVQ, IOPQ } : valA;39icode in { IIRMOVQ, IRMMOVQ, IMRMOVQ, IIADDQ } : valC; # 加入IIADDQ40icode in { ICALL, IPUSHQ } : -8;41icode in { IRET, IPOPQ } : 8;42# Other instructions don't need ALU43];4445## Select input B to ALU46word aluB = [47icode in { IRMMOVQ, IMRMOVQ, IOPQ, ICALL,48IPUSHQ, IRET, IPOPQ, IIADDQ } : valB; # 加入IIADDQ49icode in { IRRMOVQ, IIRMOVQ } : 0;50# Other instructions don't need ALU51];5253## Should the condition codes be updated?54bool set_cc = icode in { IOPQ, IIADDQ }; # 加入IIADDQ
make一下
xxxxxxxxxx51❯ make VERSION=full # 懒的话直接修改makefile把version从std改成full就好了2# Building the seq-full.hcl version of SEQ3../misc/hcl2c -n seq-full.hcl <seq-full.hcl >seq-full.c4gcc -Wall -O2 -I../misc -o ssim \5 seq-full.c ssim.c ../misc/isa.c -lm测试一个简单的Y86-64程序
xxxxxxxxxx491❯ ./ssim -t ../y86-code/asumi.yo2Y86-64 Processor: seq-full.hcl3137 bytes of code read4IF: Fetched irmovq at 0x0. ra=----, rb=%rsp, valC = 0x1005IF: Fetched call at 0xa. ra=----, rb=----, valC = 0x386Wrote 0x13 to address 0xf87IF: Fetched irmovq at 0x38. ra=----, rb=%rdi, valC = 0x188IF: Fetched irmovq at 0x42. ra=----, rb=%rsi, valC = 0x49IF: Fetched call at 0x4c. ra=----, rb=----, valC = 0x5610Wrote 0x55 to address 0xf011IF: Fetched xorq at 0x56. ra=%rax, rb=%rax, valC = 0x012IF: Fetched andq at 0x58. ra=%rsi, rb=%rsi, valC = 0x013IF: Fetched jmp at 0x5a. ra=----, rb=----, valC = 0x8314IF: Fetched jne at 0x83. ra=----, rb=----, valC = 0x6315IF: Fetched mrmovq at 0x63. ra=%r10, rb=%rdi, valC = 0x016IF: Fetched addq at 0x6d. ra=%r10, rb=%rax, valC = 0x017IF: Fetched iaddq at 0x6f. ra=----, rb=%rdi, valC = 0x818IF: Fetched iaddq at 0x79. ra=----, rb=%rsi, valC = 0xffffffffffffffff19IF: Fetched jne at 0x83. ra=----, rb=----, valC = 0x6320IF: Fetched mrmovq at 0x63. ra=%r10, rb=%rdi, valC = 0x021IF: Fetched addq at 0x6d. ra=%r10, rb=%rax, valC = 0x022IF: Fetched iaddq at 0x6f. ra=----, rb=%rdi, valC = 0x823IF: Fetched iaddq at 0x79. ra=----, rb=%rsi, valC = 0xffffffffffffffff24IF: Fetched jne at 0x83. ra=----, rb=----, valC = 0x6325IF: Fetched mrmovq at 0x63. ra=%r10, rb=%rdi, valC = 0x026IF: Fetched addq at 0x6d. ra=%r10, rb=%rax, valC = 0x027IF: Fetched iaddq at 0x6f. ra=----, rb=%rdi, valC = 0x828IF: Fetched iaddq at 0x79. ra=----, rb=%rsi, valC = 0xffffffffffffffff29IF: Fetched jne at 0x83. ra=----, rb=----, valC = 0x6330IF: Fetched mrmovq at 0x63. ra=%r10, rb=%rdi, valC = 0x031IF: Fetched addq at 0x6d. ra=%r10, rb=%rax, valC = 0x032IF: Fetched iaddq at 0x6f. ra=----, rb=%rdi, valC = 0x833IF: Fetched iaddq at 0x79. ra=----, rb=%rsi, valC = 0xffffffffffffffff34IF: Fetched jne at 0x83. ra=----, rb=----, valC = 0x6335IF: Fetched ret at 0x8c. ra=----, rb=----, valC = 0x036IF: Fetched ret at 0x55. ra=----, rb=----, valC = 0x037IF: Fetched halt at 0x13. 
ra=----, rb=----, valC = 0x03832 instructions executed39Status = HLT40Condition Codes: Z=1 S=0 O=041Changed Register State:42%rax: 0x0000000000000000 0x0000abcdabcdabcd43%rsp: 0x0000000000000000 0x000000000000010044%rdi: 0x0000000000000000 0x000000000000003845%r10: 0x0000000000000000 0x0000a000a000a00046Changed Memory State:470x00f0: 0x0000000000000000 0x0000000000000055480x00f8: 0x0000000000000000 0x000000000000001349ISA Check Succeeds测试除了加法运算的操作
xxxxxxxxxx351❯ (cd ../y86-code; make testssim)2../seq/ssim -t asum.yo > asum.seq3../seq/ssim -t asumr.yo > asumr.seq4../seq/ssim -t cjr.yo > cjr.seq5../seq/ssim -t j-cc.yo > j-cc.seq6../seq/ssim -t poptest.yo > poptest.seq7../seq/ssim -t pushquestion.yo > pushquestion.seq8../seq/ssim -t pushtest.yo > pushtest.seq9../seq/ssim -t prog1.yo > prog1.seq10../seq/ssim -t prog2.yo > prog2.seq11../seq/ssim -t prog3.yo > prog3.seq12../seq/ssim -t prog4.yo > prog4.seq13../seq/ssim -t prog5.yo > prog5.seq14../seq/ssim -t prog6.yo > prog6.seq15../seq/ssim -t prog7.yo > prog7.seq16../seq/ssim -t prog8.yo > prog8.seq17../seq/ssim -t ret-hazard.yo > ret-hazard.seq18grep "ISA Check" *.seq19asum.seq:ISA Check Succeeds20asumr.seq:ISA Check Succeeds21cjr.seq:ISA Check Succeeds22j-cc.seq:ISA Check Succeeds23poptest.seq:ISA Check Succeeds24prog1.seq:ISA Check Succeeds25prog2.seq:ISA Check Succeeds26prog3.seq:ISA Check Succeeds27prog4.seq:ISA Check Succeeds28prog5.seq:ISA Check Succeeds29prog6.seq:ISA Check Succeeds30prog7.seq:ISA Check Succeeds31prog8.seq:ISA Check Succeeds32pushquestion.seq:ISA Check Succeeds33pushtest.seq:ISA Check Succeeds34ret-hazard.seq:ISA Check Succeeds35rm asum.seq asumr.seq cjr.seq j-cc.seq poptest.seq pushquestion.seq pushtest.seq prog1.seq prog2.seq prog3.seq prog4.seq prog5.seq prog6.seq prog7.seq prog8.seq ret-hazard.seq测试除了iaddq以外的所有指令
xxxxxxxxxx131❯ (cd ../ptest; make SIM=../seq/ssim)2./optest.pl -s ../seq/ssim 3Simulating with ../seq/ssim4 All 49 ISA Checks Succeed5./jtest.pl -s ../seq/ssim 6Simulating with ../seq/ssim7 All 64 ISA Checks Succeed8./ctest.pl -s ../seq/ssim 9Simulating with ../seq/ssim10 All 22 ISA Checks Succeed11./htest.pl -s ../seq/ssim 12Simulating with ../seq/ssim13 All 600 ISA Checks Succeed测试所有指令
xxxxxxxxxx131❯ (cd ../ptest; make SIM=../seq/ssim TFLAGS=-i)2./optest.pl -s ../seq/ssim -i3Simulating with ../seq/ssim4 All 58 ISA Checks Succeed5./jtest.pl -s ../seq/ssim -i6Simulating with ../seq/ssim7 All 96 ISA Checks Succeed8./ctest.pl -s ../seq/ssim -i9Simulating with ../seq/ssim10 All 22 ISA Checks Succeed11./htest.pl -s ../seq/ssim -i12Simulating with ../seq/ssim13 All 756 ISA Checks Succeed全部succeed就好了!
在sim/pipe文件夹下,修改ncopy.ys和pipe-full.hcl两个文件,使得ncopy.ys跑得越快越好。handout里面测试部分写得复杂极了,有点事无巨细的感觉。我就只用到以下几个:
- IOPQ;`(cd ../ptest; make SIM=../pipe/psim TFLAGS=-i)`,去掉TFLAGS就不测试iaddq了
- `./correctness.pl`(用YIS测试);`./correctness.pl -p`(用simulator测试)
- `./benchmark.pl`就好了
- `./check-len.pl < ncopy.yo`

ncopy的c源码如下:
xxxxxxxxxx181/*2 * ncopy - copy src to dst, returning number of positive ints3 * contained in src array.4 */5word_t ncopy(word_t *src, word_t *dst, word_t len)6{7 word_t count = 0;8 word_t val;9
10 while (len > 0) {11 val = *src++;12 *dst++ = val;13 if (val > 0)14 count++;15 len--;16 }17 return count;18}这里从略,和之前两个Part极其类似,CPE从15.18变为12.70
观察ncopy.c可以看出程序具有数据相关。根据第五章的知识,我们可以采用循环展开突破延迟界限达到吞吐量界限——一种完全流水线下且利用所有功能单元的终极性能。抄了网上的代码如下:
xxxxxxxxxx761# You can modify this portion2 # Loop header3 xorq %rax,%rax # count = 0;4 5 iaddq $-5, %rdx6 jg Loop6x67 iaddq $5, %rdx8 jg Loop19 ret10
11Loop1:12 mrmovq (%rdi), %r813 rrmovq %rax, %r1414 iaddq $1, %r1415 andq %r8, %r816 cmovg %r14, %rax17 rmmovq %r8, (%rsi)18 19 iaddq $8, %rdi # src++20 iaddq $8, %rsi # dst++21 iaddq $-1, %rdx # len--22
23 jg Loop124 ret25
26Loop6x6:27 mrmovq (%rdi), %r828 rrmovq %rax, %r1429 iaddq $1, %r1430 andq %r8, %r831 cmovg %r14, %rax32 rmmovq %r8, (%rsi)33 34 mrmovq 8(%rdi), %r835 rrmovq %rax, %r1436 iaddq $1, %r1437 andq %r8, %r838 cmovg %r14, %rax39 rmmovq %r8, 8(%rsi)40 41 mrmovq 16(%rdi), %r842 rrmovq %rax, %r1443 iaddq $1, %r1444 andq %r8, %r845 cmovg %r14, %rax46 rmmovq %r8, 16(%rsi)47 48 mrmovq 24(%rdi), %r849 rrmovq %rax, %r1450 iaddq $1, %r1451 andq %r8, %r852 cmovg %r14, %rax53 rmmovq %r8, 24(%rsi)54 55 mrmovq 32(%rdi), %r856 rrmovq %rax, %r1457 iaddq $1, %r1458 andq %r8, %r859 cmovg %r14, %rax60 rmmovq %r8, 32(%rsi)61 62 mrmovq 40(%rdi), %r863 rrmovq %rax, %r1464 iaddq $1, %r1465 andq %r8, %r866 cmovg %r14, %rax67 rmmovq %r8, 40(%rsi)68 69 70 iaddq $48, %rdi # src++71 iaddq $48, %rsi # dst++72 iaddq $-6, %rdx # len--73 74 jg Loop6x675 iaddq $5, %rdx76 jg Loop1 跑个分有CPE为8.63,得分37.4/60.0
从某种角度上来说,这个lab就这样差不多结束了。
本人于2021/10/24完成了Architecture Lab,耗时三天。难以评价这个lab,个人觉得自己水平还没有到去真正吸收这个lab的地步吧。回头想想当初读第四章的时候觉得,这章许多内容有点冗杂,同一个知识点讲述不集中。现在觉得作者还是有自己的初衷的,作者设计了Y86-64汇编语言和它的汇编工具,汇编工具又是由作者设计用来描述硬件的HCL语言所生成。所以第四章需要花费笔墨和读者讲述自己的Y86-64以及HCL,而对处理器的讨论却给冲淡了。有一种作者很用心,但是读者却难以收获什么的感觉。。。
现在主要还有三个lab没有做——cache lab、shell lab、malloc lab。performance lab被cmu替换成cache lab了,那我也不做;proxy lab用到的是第三部分的知识,主要就是套接字编程,当初也没看得很懂第三部分,网上虽然说这个lab体量很小,但是限于个人时间精力还是不做为妙。我想接下来要做这些: