CSAPP--Cache Lab实验记录

2021-12-10 CSAPP csapp-lab 0 评论字数统计: 2239字阅读时长: 5分

CSAPP–Cache Lab实验记录

前言

This lab will help you understand the impact that cache memories can have on the performance of your C programs. The lab consists of two parts.

In the first part you will write a small C program (about 200-300 lines) that simulates the behavior of a cache memory. In the second part, you will optimize a small matrix transpose function, with the goal of minimizing the number of cache misses.

这个实验包括两个部分，Part A 是编写一个c语言cache模拟器。Part B是一个矩阵转置，给出了三种规模，你的任务就是尽可能的提高高速缓存的命中率，它会根据你的miss，hits，eviction这三个值的大小进行打分，评分的范围已给出。

实验准备

下载实验资料–官网
深入理解计算机系统第六章
实验指导ppt

实验内容

Part A

（1）内容分析

编写一个cache模拟器，该模拟器可以模拟在一系列的数据访问中cache的命中、不命中与牺牲行的情况，其中，需要牺牲行时，用LRU替换策略进行替换。

cache模拟器需要处理下列的命令

Usage: ./csim-ref [-hv] -s <s> -E <E> -b <b> -t <tracefile>

其中各参数意义如下：

①-h：输出帮助信息的选项；

②-v：输出详细运行过程信息的选项；

③-s：组索引的位数(意味着组数S=2^s)；

④-E：每一组包含的行数；

⑤-b：偏移位的宽度(意味着块的大小为B=2^b);

⑥-t：输入数据文件的路径(测试数据从该文件里面读取)。

traces 文件夹下面的五个文件是测试数据。以·yi.trace为例

$ cat yi.trace
 L 10,1
 M 20,1
 L 22,1
 S 18,1
 L 110,1
 L 210,1
 M 12,1

格式为：

(空格) 操作类型地址大小

其中，

“I”表示指令加载，

“L”表示数据加载，

“S”表示数据存储，

“M”表示数据修改（即数据存储之后的数据加载）。

每个“I”前面都没有空格。每个“M”，“L”和“S”之前总是有空格。

地址字段指定一个32位的十六进制存储器地址。

大小字段指定操作访问的字节数；

每个数据加载 (L) 或存储 (S) 操作最多可导致一次缓存未命中。数据修改操作 (M) 被视为加载，然后是存储到同一地址。因此，M 操作可能导致两次缓存命中，或者一次未命中和一次命中加上可能的逐出。

（2）cache

首先是cache的构成:

cache有2^s组，每组有E行（E分为三种情况：E=1，称为直接映射高速缓存；1<E<C/B:组相联高速缓存；E=C/B，全相连高速缓存，也就是一组包含了所有的行），然后是每行的构成：（1）有效位vild(取0和1，1表示存储了有效信息，0表示没有，判断命中的时候需要用到）（2）标记位t=m-(s+b),当cpu要读取某个地址的内容时，就会将某个地址的第一个部分：标记位与它进行对比，匹配相等则命中，也就是锁定在某一行（3）数据块B，B负责存储这个地址的内容，把B想象成字节数组就好，共有B-1个小块（别和B这个数据块搞混咯,下标从0开始，因此是B-1）。在这里简单提下b的作用吧，虽然实验没用到。b理解为数据块B这个数组的下标就好，也就是你要从B[b]这个位置开始读取字。注意这里的数据都是2进制的。

根据上面的分析可以定义cache结构体：

// 定义行属性
typedef struct{
    int valid_bit;
    int tag;
    int lru_count;
}Cache_line,**Cache;

（3）几个函数

**getopt()：**解析命令行
**fscanf：**读文件
**malloc/free：**分配释放内存

（4）编写程序

#include<stdlib.h>
#include<stdio.h>
#include<unistd.h>
#include<malloc.h>
#include<string.h>
#include <limits.h>
#include<getopt.h>
#include"cachelab.h"
int h,v,s,E,b,S; //模拟参数

char trace[1000]; //接收trace文件路径

int hit_count,miss_count,eviction_count;
// 定义行属性
typedef struct{
    int valid_bit;
    int tag;
    int lru_stamp;//时间戳
}Cache_line,*Cache_asso,**Sim_cache;

Sim_cache Cache = NULL;

int timenow=0; // 记录当前时间

// 初始化cache
void init_cache(){
    Cache = (Sim_cache)malloc(sizeof(Cache_asso)*S);
    for(int i=0;i<S;++i){
        Cache[i] = (Cache_asso)malloc(sizeof(Cache_line)*E);
        for(int j=0;j<E;++j){
            Cache[i][j].valid_bit=0;
            Cache[i][j].tag=-1;
            Cache[i][j].lru_stamp=0;
        }
    }
}

/* 打印 helper 内容的函数，-h 命令使用，内容可自定义*/
void printUsage()
{
    printf("Usage: ./csim-ref [-hv] -s <num> -E <num> -b <num> -t <file>\n"
            "Options:\n"
            "  -h         Print this help message.\n"
            "  -v         Optional verbose flag.\n"
            "  -s <num>   Number of set index bits.\n"
            "  -E <num>   Number of lines per set.\n"
            "  -b <num>   Number of block offset bits.\n"
            "  -t <file>  Trace file.\n\n"
            "Examples:\n"
            "  linux>  ./csim-ref -s 4 -E 1 -b 4 -t traces/yi.trace\n"
            "  linux>  ./csim-ref -v -s 8 -E 2 -b 4 -t traces/yi.trace\n");
}
// 加载数据
void loadData(unsigned int addr){
    int set_index=(addr >> b) & ((-1U) >> (64 - s));
    int tag = addr>>(b+s);
    int min_stamp = INT_MAX;
    int min_stamp_index = -1;
    for(int i=0;i<E;++i){
        if(Cache[set_index][i].tag==tag){
            Cache[set_index][i].lru_stamp=timenow;
            ++hit_count;
            if(v==1)printf("hit ");
            return;
        }
    }
    // 未命中
    for(int i=0;i<E;++i){ //查看有无空位
        if(Cache[set_index][i].valid_bit==0){
            Cache[set_index][i].valid_bit=1;
            Cache[set_index][i].tag = tag;
            Cache[set_index][i].lru_stamp=timenow;
            ++miss_count;
            if(v==1)printf("miss ");
            return;
        }
    }
    if(v==1)printf("miss eviction ");
    //满了
    ++eviction_count;
    ++miss_count;
    //找出早那个
    for(int i=0;i<E;++i){
        if(Cache[set_index][i].lru_stamp<min_stamp){
            min_stamp_index=i;
            min_stamp=Cache[set_index][i].lru_stamp;
        }
    }
    Cache[set_index][min_stamp_index].tag=tag;
    Cache[set_index][min_stamp_index].lru_stamp=timenow;
    return;

}
// 释放内存
void free_Cache(){
    for(int i = 0; i < S; ++i)
		free(Cache[i]);
	free(Cache);         
}
// 解析trace的参数
void pahse_trace(){
    FILE* pFile = fopen(trace,"r");
    if(pFile==NULL){
        printf("open file error\n");
        exit(-1);
    }
    char op[10];
    unsigned int addr;
    int size;
    while(fscanf(pFile,"%s %x,%d\n",op,&addr,&size)>0){
        if(strcmp(op,"I")==0)continue;
        if(v==1)printf("%s %x %d ",op,addr,size);
        loadData(addr);
        if(strcmp(op,"M")==0){
            loadData(addr);
        }
        if(v==1)printf("\n");
        timenow++;
    }   
    fclose(pFile);
    free_Cache();
}



int main(int argc, char* argv[])
{
    h=0;
    v=0;
    hit_count=miss_count=eviction_count=0;
    int opt=-1;
    while (-1!=(opt=getopt(argc,argv,"hvs:E:b:t:")))
    {
        switch (opt)
        {
        case 'v':
            v = 1;
            break;
        case 's':
            s = atoi(optarg);
            break;
        case 'E':
            E = atoi(optarg);
            break;
        case 'b':
            b = atoi(optarg);
            break;
        case 't':
            strcpy(trace,optarg);
            break;
        case 'h':
            h = 1;
        default:
            printUsage();
            break;
        }
    }
    //检验参数
    if(s<=0 || E<=0 || b<=0 ||trace==NULL){
        printf("illegal argument\n");
        exit(-1);
    }
    S=1<<s;
    if(h==1){
        printUsage();
    }
    init_cache();
    pahse_trace();
    printSummary(hit_count, miss_count, eviction_count);
    return 0;
}

编译，并查看结果：

$ make clean;make
rm -rf *.o
rm -f *.tar
rm -f csim
rm -f test-trans tracegen
rm -f trace.all trace.f*
rm -f .csim_results .marker
gcc -g -Wall -Werror -std=c99 -m64 -o csim csim.c cachelab.c -lm
gcc -g -Wall -Werror -std=c99 -m64 -O0 -c trans.c
gcc -g -Wall -Werror -std=c99 -m64 -o test-trans test-trans.c cachelab.c trans.o
gcc -g -Wall -Werror -std=c99 -m64 -O0 -o tracegen tracegen.c trans.o cachelab.c
# Generate a handin tar file each time you compile
tar -cvf lincx-handin.tar  csim.c trans.c
csim.c
trans.c

$ ./test-csim
                        Your simulator     Reference simulator
Points (s,E,b)    Hits  Misses  Evicts    Hits  Misses  Evicts
     3 (1,1,1)       9       8       6       9       8       6  traces/yi2.trace
     3 (4,2,4)       4       5       2       4       5       2  traces/yi.trace
     3 (2,1,4)       2       3       1       2       3       1  traces/dave.trace
     3 (2,1,3)     167      71      67     167      71      67  traces/trans.trace
     3 (2,2,3)     201      37      29     201      37      29  traces/trans.trace
     3 (2,4,3)     212      26      10     212      26      10  traces/trans.trace
     3 (5,1,5)     231       7       0     231       7       0  traces/trans.trace
     6 (5,1,5)  265189   21775   21743  265189   21775   21743  traces/long.trace
     27

TEST_CSIM_RESULTS=27

全部通过。

Part B

（1）任务：

编写一个实现矩阵转置的函数。即对于给定的矩阵A[N][M]，得到矩阵B[M][N]，使得对于任意0<=i<N、0<=j<M，有B[j][i]=A[i][j]，并且使函数调用过程中对cache的不命中数miss尽可能少。
在如下函数里面编写最终代码：

char transpose_submit_desc[] = “Transpose submission”;

void transpose_submit(int M, int N, int A[N][M], int B[M][N])；

要求：
1. 最多只能定义 12 个局部变量
2. 不使用任何的数组，不调用任何类似于 malloc 的开辟内存的函数
3. 测试矩阵的规模为 32 × 32，64 × 64，61 × 67
4. 测试 cache 的组成：32 组，每组一行，每个块 32 字节。对于编写的函数，miss 个数越少越好。
（2）分析

这里可分成32x32，16x16，8x8，4x4,cache每块的字节数是32，也就是8个int，因此为了有效地利用cache的块容量，选取8x8的分块效果是最好的。

本文链接: https://lincx-911.github.io/2021/12/cache_lab/

License: CC BY 4.0 CN

lincxdeveloper

预防掉发，多敲代码