改进矩阵乘法程序
实验内容
待优化代码:
#include <iostream>
#include <time.h>
#include <string.h>
#include <math.h>
using namespace std;
#define MATRIX_SIZE 1024
int main() {
clock_t start, finish;
clock_t start_new, finish_new;
register int i, j, k;
int (*a)[MATRIX_SIZE], (*b)[MATRIX_SIZE];
a = new int[MATRIX_SIZE][MATRIX_SIZE];
b = new int[MATRIX_SIZE][MATRIX_SIZE];
for(i = 0; i < MATRIX_SIZE; i ++) {
for(j = 0; j < MATRIX_SIZE; j ++) {
a[i][j] = i % (j + 1);
b[i][j] = i / (j + 1);
}
}
int (*c)[MATRIX_SIZE], (*d)[MATRIX_SIZE];
c = new int[MATRIX_SIZE][MATRIX_SIZE];
d = new int[MATRIX_SIZE][MATRIX_SIZE];
// initialize
memset(d, 0, MATRIX_SIZE * MATRIX_SIZE * sizeof(int));
start_new = clock();
// TODO: Your optimized code:
//======================================================
for (i = 0; i < MATRIX_SIZE; i ++)
for (j = 0; j < MATRIX_SIZE; j ++)
for (k = 0; k < MATRIX_SIZE; k ++)
d[i][j] += a[i][k] * b[k][j];
// Stop here.
//======================================================
finish_new = clock();
int (*a_)[MATRIX_SIZE], (*b_)[MATRIX_SIZE];
a_ = new int[MATRIX_SIZE][MATRIX_SIZE];
b_ = new int[MATRIX_SIZE][MATRIX_SIZE];
for (i = 0; i < MATRIX_SIZE; i ++) {
for (j = 0; j < MATRIX_SIZE; j ++) {
a_[i][j] = i % (j + 1);
b_[i][j] = i / (j + 1);
}
}
// initialize
memset(c, 0, MATRIX_SIZE * MATRIX_SIZE * sizeof(int));
start = clock();
for (i = 0; i < MATRIX_SIZE; i ++)
for (j = 0; j < MATRIX_SIZE; j ++)
for (k = 0; k < MATRIX_SIZE; k ++)
c[i][j] += a_[i][k] * b_[k][j];
finish = clock();
// compare
for(i = 0; i < MATRIX_SIZE; i++) {
for(j = 0; j < MATRIX_SIZE; j++) {
if (c[i][j] != d[i][j]) {
cout << "you have got an error in algorithm modification!" << endl;
exit(1);
}
}
}
cout << "time spent for original method : " << (double)(finish - start) / CLOCKS_PER_SEC << " s" << endl;
cout << "time spent for new method : " << (double)(finish_new - start_new) / CLOCKS_PER_SEC << " s" << endl;
cout << "time ratio of performance optimization : " << (double)(finish - start) / (finish_new - start_new) << endl;
return 0;
}
优化位于 TODO 内的代码达到至少 3 倍的加速比。
实验要求
- 使用 O0 参数编译,编译时不打开任何优化开关
- 不能使用汇编指令直接优化,如内联汇编
- 不能使用并行方法优化,如 openmp,MPI,向量化等等
- 不允许修改 TODO 外的非优化部分的代码
- 要保证优化后结果正确,会进行正确性检查
- 运行时不绑定 CPU
主流商用 CPU 满足实验要求应当很容易,如果你的机器实在达不到 3 倍加速比,请在文档中解释原因或和助教沟通
方法示例
- 更改运算顺序,使得访存序列更友好
- 矩阵分块
- ……
最后更新:
2025年3月12日
作者: