Cache Lab记录

by Gu Wei
2021年11月

实验分为两部分，Part A要求实现一个cache的模拟器，Part B要求实现一个为特定cache优化过的转置矩阵函数。

1. Part A

输入的trace文件都是由valgrind生成，如：linux> valgrind --log-fd=1 --tool=lackey -v --trace-mem=yes ls -l可以依次输出内存的访问。trace的格式为：


1
I 0400d7d4,8
2
 M 0421c7f0,4
3
 L 04f6b868,8
4
 S 7ff0005c8,8

“I” denotes an instruction load, “L” a data load, “S” a data store, and “M” a data modify (i.e., a data load followed by a data store). There is never a space before each “I”. There is always a space before each “M”, “L”, and “S”. The address field specifies a 64-bit hexadecimal memory address. The size field specifies the number of bytes accessed by the operation.

现在给出s、E、b这些参数以及trace文件，我们需要输出hit、miss以及eviction的次数。替换策略采用的是LRU（least-recently used）。用法具体如下：


xxxxxxxxxx
7
1
Usage: ./csim-ref [-hv] -s <s> -E <E> -b <b> -t <tracefile>
2
 -h: Optional help flag that prints usage info
3
 -v: Optional verbose flag that displays trace info
4
 -s <s>: Number of set index bits (S = 2^s is the number of sets)
5
 -E <E>: Associativity (number of lines per set)
6
 -b <b>: Number of block bits (B = 2^b is the block size)
7
 -t <tracefile>: Name of the valgrind trace to replay

输出的例子如下：


x
1
linux> ./csim-ref -s 4 -E 1 -b 4 -t traces/yi.trace
2
hits:4 misses:5 evictions:3
3

4
linux> ./csim-ref -v -s 4 -E 1 -b 4 -t traces/yi.trace
5
L 10,1 miss
6
M 20,1 miss hit
7
L 22,1 hit
8
S 18,1 hit
9
L 110,1 miss eviction
10
L 210,1 miss eviction
11
M 12,1 miss eviction hit
12
hits:4 misses:5 evictions:3

注意到加了v参数就会多输出一些信息。

要模拟cache，其实参数b以及trace文件的I都是没有用的，而trace文件的M、L、S其实也是同质化的。M就是查两次cache有没有，L和S则是一次。我并没有采取队列来实现LRU，而是用了时间戳，感觉队列会比较麻烦。

话不多说，直接上代码：


xxxxxxxxxx
170
1
/**
2
 * by Gu Wei 2021/10/31
3
 */ 
4
#include "cachelab.h" //for printSummary
5
#include <stdio.h> //for printf fscanf
6
#include <unistd.h> // for getopt
7
#include <stdlib.h> // for atoi
8
#include <string.h> // for strcpy memset
9

10
int hit_count, miss_count, eviction_count;
11
void printUsage()
12
{
13
    printf("Usage: ./csim-ref [-hv] -s <num> -E <num> -b <num> -t <file>\n"
14
            "Options:\n"
15
            "  -h         Print this help message.\n"
16
            "  -v         Optional verbose flag.\n"
17
            "  -s <num>   Number of set index bits.\n"
18
            "  -E <num>   Number of lines per set.\n"
19
            "  -b <num>   Number of block offset bits.\n"
20
            "  -t <file>  Trace file.\n\n"
21
            "Examples:\n"
22
            "  linux>  ./csim-ref -s 4 -E 1 -b 4 -t traces/yi.trace\n"
23
            "  linux>  ./csim-ref -v -s 8 -E 2 -b 4 -t traces/yi.trace\n");
24
}
25
typedef struct{
26
    int valid;
27
    int tag;
28
    int stamp;
29
}cache_line;
30

31
int main(int argc, char *argv[])
32
{
33
    /**
34
     * 利用getopt函数获取参数
35
     */ 
36
    int opt, h, v, s, E, b, invalidOption;
37
    opt = h = v = s = E = b = invalidOption = 0;
38
    char t[1000];
39
    while (-1 != (opt = getopt(argc, argv, "hvs:E:b:t:")))
40
    {
41
        switch (opt)
42
        {
43
        case 'h':
44
            h = 1;
45
            break;
46
        case 'v':
47
            v = 1;
48
            break;
49
        case 's':
50
            s = atoi(optarg);
51
            break;
52
        case 'E':
53
            E = atoi(optarg);
54
            break;
55
        case 'b':
56
            b = atoi(optarg);
57
            break;
58
        case 't':
59
            strcpy(t, optarg);
60
            break;
61
        case '?':
62
            invalidOption = 1;
63
            break;
64
        }
65
    }
66
    if (h)
67
    {
68
        printUsage();
69
        return 0;
70
    }
71
    if (invalidOption || s <= 0 || E <= 0 || b <= 0 || t == NULL)
72
    {
73
        printf("OOPS! Something is wrong! Check your input.\n");
74
        printUsage();
75
        return -1;
76
    }
77
    FILE* pFile = fopen(t, "r");
78
    if(pFile == NULL)
79
    {
80
        printf("%s: No such file or directory\n", t);
81
        return -1;
82
    }
83

84

85
    int S = 1 << s;
86
    // init my cache & queue, C99 supports VLA, no need for malloc & free
87
    cache_line cache[S][E];
88
    memset(cache, 0, sizeof(cache));
89

90
    /**
91
     * 核心代码
92
     */ 
93
    char identifier;
94
    unsigned long long address;
95
    int size;
96
    while (fscanf(pFile, " %c %llx,%d", &identifier, &address, &size) != -1)
97
    {
98
        if (identifier == 'I') continue; // no operation for instruction load
99
        unsigned int tag = address >> (s+b);
100
        unsigned int set_index = address << (64-s-b) >> (64-s);
101
        if (v) printf("%c %llx,%d", identifier, address, size);
102
        int cnt;
103
        if (identifier == 'M')  cnt = 2;
104
        else cnt = 1;
105
        for (int j = 0; j < cnt; j++)
106
        {
107
            //hit!
108
            int isHit = 0;
109
            int hasEmpty = 0;
110
            for (int i = 0; i < E; i++)
111
            {
112
                if (cache[set_index][i].valid == 1 && cache[set_index][i].tag == tag)
113
                {
114
                    for (int k = 0; k < E; k++)
115
                        if (cache[set_index][k].valid)
116
                            cache[set_index][k].stamp++;
117
                    cache[set_index][i].stamp = 0;
118
                    hit_count++;
119
                    isHit = 1;
120
                    if (v) printf(" hit");
121
                    break;
122
                }
123
            }
124
            if (isHit) continue;
125

126
            //miss!
127
            miss_count++;
128
            if (v) printf(" miss");
129
            for (int i = 0; i < E; i++)
130
            {
131
                if (cache[set_index][i].valid == 0)
132
                {
133
                    cache[set_index][i].stamp = 0;
134
                    cache[set_index][i].tag = tag;
135
                    for (int k = 0; k < E; k++)
136
                        if (cache[set_index][k].valid)
137
                            cache[set_index][k].stamp++;
138
                    cache[set_index][i].valid = 1;
139
                    hasEmpty = 1;
140
                    break;
141
                }
142
            }
143
            if (hasEmpty) continue;
144

145
            //eviction!
146
            eviction_count++;
147
            int maxStamp = -1;
148
            int maxStampIndex = 0;
149
            for (int i = 0; i < E; i++)
150
            {
151
                if (maxStamp < cache[set_index][i].stamp)
152
                {
153
                    maxStamp = cache[set_index][i].stamp;
154
                    maxStampIndex = i;
155
                }
156
                cache[set_index][i].stamp++;
157
            }
158
            cache[set_index][maxStampIndex].tag = tag;
159
            cache[set_index][maxStampIndex].stamp = 0;
160
            if (v) printf(" eviction");
161
        }
162
        if (v) printf("\n");
163
    }
164
    
165

166
    fclose(pFile);
167
    
168
    printSummary(hit_count, miss_count, eviction_count);
169
    return 0;
170
}

关键就是getopt和fscanf两个函数需要学习一下。~~sdyt，这是本人第一次正式写C程序~~

测试如下：


xxxxxxxxxx
14
1
linux> ./test-csim
2
                        Your simulator     Reference simulator
3
Points (s,E,b)    Hits  Misses  Evicts    Hits  Misses  Evicts
4
     3 (1,1,1)       9       8       6       9       8       6  traces/yi2.trace
5
     3 (4,2,4)       4       5       2       4       5       2  traces/yi.trace
6
     3 (2,1,4)       2       3       1       2       3       1  traces/dave.trace
7
     3 (2,1,3)     167      71      67     167      71      67  traces/trans.trace
8
     3 (2,2,3)     201      37      29     201      37      29  traces/trans.trace
9
     3 (2,4,3)     212      26      10     212      26      10  traces/trans.trace
10
     3 (5,1,5)     231       7       0     231       7       0  traces/trans.trace
11
     6 (5,1,5)  265189   21775   21743  265189   21775   21743  traces/long.trace
12
    27
13

14
TEST_CSIM_RESULTS=27

得分27就对了。

2. Part B

Part B要求我们实现trans.c文件中的transpose_submit函数，对32*32，64*64，61*67三个矩阵进行特殊的转置优化。cache大小为s = 5， E = 1, b = 5。即，直接映射Cache，有32组，每组一行，每行能放32个字节(8个int)。

评分要求为：

32 * 32，miss<300得8分，miss>600不得分
64 * 64，miss<1300得8分，miss>2000不得分
61 * 67，miss<2000得10分，miss>3000不得分

2.1 32*32

1）naive-plus solution

最直接的理解为：


xxxxxxxxxx
10
1
int i, j, tmp;
2

3
for (i = 0; i < N; i++) 
4
{
5
     for (j = 0; j < M; j++) 
6
     {
7
            tmp = A[i][j];
8
            B[j][i] = tmp;
9
      }
10
}

测试出miss数为1183，远远大于要求的300~~（为什么要运行这么久才能测出来）~~。

解释：

以i=0，j=0~31的例子说明。cache每个block放8个int，A矩阵按行读入，第一行产生32/8=4次miss。B矩阵按列写入，每次写都会产生miss，共有32次。在i=1的时候，写入B矩阵的第1列，仍会产生32次miss，这是由cache只有32行所导致的。这样算出来有36*32=1152次miss，测试却是1183。这是产生了冲突不命中（抖动）的原因。对于AB矩阵主对角线上的元素会映射到同一cache块上面，这是由AB的地址所导致的（这一点不查别人的资料很难想到）。也就是说读A[0][0]的时候产生一次miss，然后把A[0][0]到A[0][7]的元素存到了cache的中；接下来把A[0][0]写入B[0][0]中，但是在写入的时候会把B[0][0]到B[0][7]的元素存到cache之前存了A[0][0]到A[0][7]的那一行中，接下来读取A[0][1]的时候，又要把A[0][0]到A[0][7]存回该block中。这便产生了冲突不命中。

具体查看生成的trace.f0文件有：


xxxxxxxxxx
23
1
S 18c08c,1 miss 
2
L 18c0a0,8 miss 
3
L 18c084,4 hit 
4
L 18c080,4 hit //不知道前四行在干啥，可能是调用函数之类的
5
L 10c080,4 miss eviction //load A[0][0] in set4
6
S 14c080,4 miss eviction //store B[0][0] in set 4, evict A[0][0]~A[0][7] from set4
7
L 10c084,4 miss eviction //load A[0][1] int set4 thrash!!!
8
S 14c100,4 miss //store B[1][0] in set8
9
L 10c088,4 hit //load A[0][2] in set4
10
S 14c180,4 miss //store B[2][0] in set12
11
L 10c08c,4 hit //load A[0][3] in set4
12
S 14c200,4 miss //store B[3][0] in set16
13
L 10c090,4 hit //load A[0][4] in set4
14
S 14c280,4 miss //store B[4][0] in set20
15
L 10c094,4 hit //load A[0][5] in set4
16
S 14c300,4 miss //store B[5][0] in set24
17
L 10c098,4 hit //load A[0][6] in set4
18
S 14c380,4 miss //store B[6][0] in set28
19
L 10c09c,4 hit //load A[0][7] in set4
20
S 14c400,4 miss //load B[7][0] in set0
21
L 10c0a0,4 miss eviction //load A[0][8] in set5
22
S 14c480,4 miss eviction //store B[8][0] in set4, evict A[0][0]~A[0][7] from set4
23
...

从第五行开始看，A首地址在0x10c080、B首地址在0x14c080。这样对角线的元素会产生抖动，增加一次miss，对角线共32个元素，那么1152+32=1184，和实际结果的1183十分接近。算是一个不错的估算了，可能中间还有些细节没有考虑到。

2) naive solution

方法1每次写入B矩阵都产生了miss，而方法2利用分块技术来避免写入B矩阵时产生的大量miss。采取的分块是8*8的，一方面，每个cache块存8个int；另一方面两个8*8的块用16个cache块，不会产生容量不命中。而且8又是32的约数，对编码来说比较简单。于是有：


xxxxxxxxxx
8
1
int i, j, m, n;
2
for (i = 0; i < N; i += 8)
3
    for (j = 0; j < M; j += 8)
4
        for (m = i; m < i + 8; ++m)
5
            for (n = j; n < j + 8; ++n)
6
            {
7
                B[n][m] = A[m][n];
8
            }

测试出miss数为343。还是不能达到满分。

解释：

对于主对角线上的8*8分块，仍然会产生冲突不命中。对于不在主对角线上的12个块共有16*12次miss。对于在主对角线上的四个块，第0行10次miss、第1~6行4次，第7行3次，这样共有37*4次miss。算出来是16*12+4*37=340。与343很接近。~~不排除算错~~

3) acceptable solution

我们知道在主对角线上的块会产生冲突也就是：当把A[0][0]写入B[0][0]时，会把B[0][0]到B[0][7]的元素存到cache之前存了A[0][0]到A[0][7]的那一行中，接下来读取A[0][1]的时候，又要把A[0][0]到A[0][7]存回该block中。那么直接在读A[0][0]的时候顺便把A[0][1]到A[0][7]都读了不久可以解决问题了吗。于是有：


xxxxxxxxxx
26
1
for (int i = 0; i < N; i += 8)
2
{
3
    for (int j = 0; j < M; j += 8)
4
    {
5
        for (int k = i; k < i + 8; ++k)
6
        {
7
            int v0 = A[k][j];
8
            int v1 = A[k][j + 1];
9
            int v2 = A[k][j + 2];
10
            int v3 = A[k][j + 3];
11
            int v4 = A[k][j + 4];
12
            int v5 = A[k][j + 5];
13
            int v6 = A[k][j + 6];
14
            int v7 = A[k][j + 7];
15

16
            B[j][k] = v0;
17
            B[j + 1][k] = v1;
18
            B[j + 2][k] = v2;
19
            B[j + 3][k] = v3;
20
            B[j + 4][k] = v4;
21
            B[j + 5][k] = v5;
22
            B[j + 6][k] = v6;
23
            B[j + 7][k] = v7;
24
        }
25
    }
26
}

测试结果有miss数为287。

$12*16+4*24=288$ 。优化掉了读取A时候的冲突不命中，但是在写入B的时候还是有8*4次额外的冲突不命中。但是已经满分了。

理论上最小的miss数为16*16=256次，但是没有必要再去琢磨了。

2.2 64*64

上来就高兴地用刚刚的8*8分块一跑，miss数高达4611次。朴素的做法也只有4723次miss。。。问题出在写入B时产生了冲突。比如说写入B[0][0]~B[0][7]的时候依次写入了set0、set8、set16、set24、set0、set8、set16、set24这样便产生了冲突。

那用4*4分块跑，如下：


xxxxxxxxxx
17
1
for (i = 0; i < N; i+=4)
2
{
3
    for (j = 0; j < M; j+=4)
4
    {
5
        for (k = i; k < i + 4; k++)
6
        {
7
            v0 = A[k][j];
8
            v1 = A[k][j+1];
9
            v2 = A[k][j+2];
10
            v3 = A[k][j+3];
11
            B[j][k] = v0;
12
            B[j+1][k] = v1;
13
            B[j+2][k] = v2;
14
            B[j+3][k] = v3;
15
        }
16
    }
17
}

miss数还是达1699。这主要由于对块的利用不高导致，一个cache块可以存8个int，但是每次只用4个。

于是就gg了。百度启动！


xxxxxxxxxx
55
1
for (int i = 0; i < N; i += 8)
2
{
3
    for (int j = 0; j < M; j += 8)
4
    {
5
        for (int k = i; k < i + 4; ++k)
6
        {
7
        /* 读取1 2，暂时放在左下角1 2 */
8
            int temp_value0 = A[k][j];
9
            int temp_value1 = A[k][j+1];
10
            int temp_value2 = A[k][j+2];
11
            int temp_value3 = A[k][j+3];
12
            int temp_value4 = A[k][j+4];
13
            int temp_value5 = A[k][j+5];
14
            int temp_value6 = A[k][j+6];
15
            int temp_value7 = A[k][j+7];
16
          
17
            B[j][k] = temp_value0;
18
            B[j+1][k] = temp_value1;
19
            B[j+2][k] = temp_value2;
20
            B[j+3][k] = temp_value3;
21
          /* 逆序放置 */
22
            B[j][k+4] = temp_value7;
23
            B[j+1][k+4] = temp_value6;
24
            B[j+2][k+4] = temp_value5;
25
            B[j+3][k+4] = temp_value4;
26
        }
27
         for (int l = 0; l < 4; ++l)
28
        {
29
           /* 按列读取 */
30
            int temp_value0 = A[i+4][j+3-l];
31
            int temp_value1 = A[i+5][j+3-l];
32
            int temp_value2 = A[i+6][j+3-l];
33
            int temp_value3 = A[i+7][j+3-l];
34
            int temp_value4 = A[i+4][j+4+l];
35
            int temp_value5 = A[i+5][j+4+l];
36
            int temp_value6 = A[i+6][j+4+l];
37
            int temp_value7 = A[i+7][j+4+l];
38
           
39
           /* 从下向上按行转换2到3 */
40
            B[j+4+l][i] = B[j+3-l][i+4];
41
            B[j+4+l][i+1] = B[j+3-l][i+5];
42
            B[j+4+l][i+2] = B[j+3-l][i+6];
43
            B[j+4+l][i+3] = B[j+3-l][i+7];
44
           /* 将3 4放到正确的位置 */
45
            B[j+3-l][i+4] = temp_value0;
46
            B[j+3-l][i+5] = temp_value1;
47
            B[j+3-l][i+6] = temp_value2;
48
            B[j+3-l][i+7] = temp_value3;
49
            B[j+4+l][i+4] = temp_value4;
50
            B[j+4+l][i+5] = temp_value5;
51
            B[j+4+l][i+6] = temp_value6;
52
            B[j+4+l][i+7] = temp_value7;
53
        } 
54
    }
55
}

这个miss数是1243，已经满分了。相当tricky的解决办法。大体思路：通过把8*8的块分成四个4*4的块进行处理，即充分利用了cache块可以存八个int的能力，又避免了写入B产生的冲突。本代码来自CS:APP3e 深入理解计算机系统_3e CacheLab实验 - 李秋豪 - 博客园 (cnblogs.com)，博客里面有详细的图片解释，可以学习一下。~~本人只是意会了一下~~

2.3 61*67

这里要求放的很宽，只要小于2000就行，下面这样的16*16分块就行了。


xxxxxxxxxx
13
1
for (i = 0; i < N; i+=16)
2
{
3
    for (j = 0; j < M; j+=16)
4
    {
5
        for (k = i; k < i + 16 && k < N; k++)
6
        {
7
            for (l = j; l < j + 16 && l < M; l++)
8
            {
9
                B[l][k] = A[k][l];
10
            }
11
        }
12
    }
13
}

miss数1992，刚刚够！

3. 写在后面

本人于2021/11/2完成了Cache Lab，差不多用了四天时间吧。Cache Lab算是目前最有代码量的一个lab了。Part A要求实现cache simulator，完全是自己写的代码（不像arch lab那样），整个完成的思路还是比较清晰的。Part B很多还是要参考网上的写法，尤其是64*64的分块还是可以继续深入下去玩味的。总的来说，Cache Lab也挺不错的！

个人主要强化了一下知识：

main memory中的数据如何缓存到cache之中
（什么t、s、b、E呀乱七八糟的东西应该真的记住了）
main函数如何优雅地处理argv的参数
c programming! （学校里都是c++
分块技术降低miss数

有意思的是：

wsl2测试part B的速度十分慢，会产生超时的情况。
Windows下mingw缺少<sys/wait.h>库文件，却可以编译test-trans.c文件。
妙啊，原来编辑器是在windows环境下所以找不到这个库文件，但是终端是在linux环境下，所以可以成功编译。

预计接下来：

shell lab启动？
吃晚饭！