
The four device-side programs must satisfy the following requirements:

  • correctly handle array lengths that are not aligned
  • pass the CPU-side verification program with a data size of n = 13773131 (a minimal sketch of what that check does appears right after this list)
  • be developed incrementally, and every program's kernel function name and interface must match the required signature
  • use the .dev.c suffix (so they can be built directly with the provided Makefile)
  • apply the following optimizations incrementally:
    • daxpy_l1.dev.c: multi-threaded parallelization
    • daxpy_l2.dev.c: multi-threaded parallelization + on-core SM caching
    • daxpy_l3.dev.c: multi-threaded parallelization + on-core AM caching + vectorization with vector intrinsics
    • daxpy_l4.dev.c: multi-threaded parallelization + on-core AM caching + vectorization with vector intrinsics + loop unrolling
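For reference, the check the CPU-side program performs is conceptually a scalar DAXPY followed by an element-wise comparison. The sketch below is not the provided daxpy.hos.c; it is a minimal stand-in (initial values, tolerance, and the way the device result is obtained are placeholders) showing the kind of verification the kernels must pass at the odd length n = 13773131.

#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <math.h>

/* Reference DAXPY: y[i] = a * x[i] + y[i]. */
static void daxpy_ref(uint64_t n, double a, const double *x, double *y){
    for (uint64_t i = 0; i < n; ++i)
        y[i] = a * x[i] + y[i];
}

int main(void){
    uint64_t n = 13773131;               /* deliberately not a multiple of any vector width */
    double a = 3.14;
    double *x = malloc(n * sizeof(double));
    double *y_ref = malloc(n * sizeof(double));
    double *y_dev = malloc(n * sizeof(double));   /* would hold the device kernel's result */
    for (uint64_t i = 0; i < n; ++i){
        x[i] = (double)(i % 1024);
        y_ref[i] = y_dev[i] = 1.0;
    }

    daxpy_ref(n, a, x, y_ref);
    daxpy_ref(n, a, x, y_dev);           /* placeholder for the device run */

    uint64_t mismatches = 0;
    for (uint64_t i = 0; i < n; ++i)
        if (fabs(y_ref[i] - y_dev[i]) > 1e-9 * fabs(y_ref[i]))
            mismatches++;
    printf("%llu mismatches out of %llu\n",
           (unsigned long long)mismatches, (unsigned long long)n);

    free(x); free(y_ref); free(y_dev);
    return mismatches != 0;
}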

daxpy.dev.c

  • Directory tree

hnu_ydy@ln6:~/daxpy_expr$ tree
.
├── daxpy.dev.c   # device-side program to be optimized
├── daxpy.hos.c   # CPU-side verification program
├── make.conf     # build environment configuration
├── Makefile      # build file for the CPU-side and device-side programs
└── README.txt
  • Set the environment variable

export LD_LIBRARY_PATH=/vol8/appsoftware/mt3000_programming_env-inbox/mt3000_programming_env-20230315/third-party-lib/:$LD_LIBRARY_PATH
  • Device side: the baseline kernel

__global__
void daxpy_kernel(uint64_t n, double a, double *x, double *y){
    for (uint64_t i = 0; i < n; ++i){
        y[i] = a * x[i] + y[i];
    }
}
  • Run and verify

yhrun -p TNG ./daxpy.hos 0 13773131 3.14 ./daxpy.dev.dat

[figure: daxpy.png]

daxpy_l1.dev.c

Multi-threaded parallelization

  • The host (CPU) side launches multiple device threads

int threadId = hthread_group_create(clusterId,\
        8, "daxpy_threads", 1, 3, args);
  • The device side partitions the data and computes (a worked example of the partition follows the code)

__global__
void daxpy_threads(uint64_t n, double a, double *x, double *y){
    int threadId = get_thread_id();
    int threadsNum = get_group_size();
    uint64_t n_p = n / threadsNum;
    uint64_t extras = n % threadsNum;
    uint64_t offset;
    if(threadId < extras){
        n_p++;
        offset = threadId * n_p;
    }else{
        offset = threadId * (n_p + 1) - (threadId - extras);
    }
    daxpy_kernel(n_p, a, x + offset, y + offset);
}
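To make the partition concrete: with n = 13773131 and 8 device threads, n_p = 1721641 and extras = 3, so threads 0-2 each get 1721642 elements and threads 3-7 get 1721641. The small host-side check below (plain C, no hthread calls; the thread count is the 8 passed to hthread_group_create above) replays the same offset arithmetic and confirms the chunks tile the whole range with no gaps or overlaps.

#include <stdint.h>
#include <stdio.h>

int main(void){
    uint64_t n = 13773131;
    int threadsNum = 8;
    uint64_t expected_offset = 0;
    for (int threadId = 0; threadId < threadsNum; ++threadId){
        uint64_t n_p = n / threadsNum;
        uint64_t extras = n % threadsNum;
        uint64_t offset;
        if ((uint64_t)threadId < extras){
            n_p++;
            offset = threadId * n_p;
        } else {
            offset = threadId * (n_p + 1) - (threadId - extras);
        }
        printf("thread %d: offset %llu, count %llu\n",
               threadId, (unsigned long long)offset, (unsigned long long)n_p);
        if (offset != expected_offset) printf("  gap or overlap!\n");
        expected_offset = offset + n_p;
    }
    printf("covered %llu of %llu elements\n",
           (unsigned long long)expected_offset, (unsigned long long)n);
    return 0;
}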

[figure: daxpy_l1.png]

daxpy_l2.dev.c

Multi-threaded parallelization + on-core SM caching

  • Cache the data in SM and move it with DMA (a plain-C sketch of the same tiling pattern follows the code)

void daxpy_singe(uint64_t n, double a, double *x, double *y){
    double x_cache[M_MT_CacheNum];
    double y_cache[M_MT_CacheNum];
    uint64_t n_cache = M_MT_CacheNum;
    uint64_t n_actual = 0;
    uint64_t i = 0;
    for(i = 0; i < n; i += n_actual){
        n_actual = (n - i) >= n_cache ? n_cache : (n - i);
        uint64_t size_actual = n_actual * sizeof(double);
        scalar_load(x + i, x_cache, size_actual);
        scalar_load(y + i, y_cache, size_actual);
        daxpy_kernel(n_actual, a, x_cache, y_cache);
        scalar_store(y_cache, y + i, size_actual);
    }
}
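The same tiling logic can be exercised off the device. In the sketch below, memcpy stands in for scalar_load/scalar_store (which on the device move data between main memory and the on-core SM), so the chunking, including the short last tile, can be checked against a plain loop on any machine. M_MT_CacheNum = 2048 is taken from the reference code further down; everything else is illustrative.

#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define M_MT_CacheNum 2048

static void daxpy_plain(uint64_t n, double a, const double *x, double *y){
    for (uint64_t i = 0; i < n; ++i) y[i] = a * x[i] + y[i];
}

/* Same chunking as daxpy_singe, with memcpy standing in for scalar_load/scalar_store. */
static void daxpy_tiled(uint64_t n, double a, const double *x, double *y){
    double x_cache[M_MT_CacheNum];
    double y_cache[M_MT_CacheNum];
    uint64_t n_cache = M_MT_CacheNum;
    uint64_t n_actual = 0;
    for (uint64_t i = 0; i < n; i += n_actual){
        n_actual = (n - i) >= n_cache ? n_cache : (n - i);   /* last tile may be short */
        uint64_t size_actual = n_actual * sizeof(double);
        memcpy(x_cache, x + i, size_actual);
        memcpy(y_cache, y + i, size_actual);
        daxpy_plain(n_actual, a, x_cache, y_cache);
        memcpy(y + i, y_cache, size_actual);
    }
}

int main(void){
    enum { N = 5000 };                 /* not a multiple of the tile size */
    static double x[N], y[N], y_ref[N];
    for (int i = 0; i < N; ++i){ x[i] = i; y[i] = y_ref[i] = 1.0; }
    daxpy_tiled(N, 3.14, x, y);
    daxpy_plain(N, 3.14, x, y_ref);
    int ok = 1;
    for (int i = 0; i < N; ++i) if (y[i] != y_ref[i]) ok = 0;
    puts(ok ? "tiling matches reference" : "mismatch");
    return !ok;
}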

[figure: daxpy_l2.png]

daxpy_l3.dev.c

Multi-threaded parallelization + on-core AM caching + vectorization with vector intrinsics

  • Cache the data in the on-core AM space
  • Accelerate daxpy with vector intrinsics (a scalar-equivalent sketch of vec_mula follows the second listing)

__global__
void daxpy_singe(uint64_t n, double a, double *x, double *y){
    uint64_t n_cache = M_MT_CacheNum * M_MT_VecLen;
    lvector double *x_cache = (lvector double *)vector_malloc(n_cache * sizeof(double));
    lvector double *y_cache = (lvector double *)vector_malloc(n_cache * sizeof(double));
    uint64_t n_actual = 0;
    lvector double a_vec = vec_svbcast(a);
    for(uint64_t i = 0; i < n; i += n_actual){
        n_actual = (n - i) >= n_cache ? n_cache : (n - i);
        uint64_t size_actual = n_actual * sizeof(double);
        vector_load(x + i, x_cache, size_actual);
        vector_load(y + i, y_cache, size_actual);
        uint64_t vLen = n_actual / M_MT_VecLen;
        vLen = ((n_actual % M_MT_VecLen) == 0) ? vLen : (vLen + 1);
        daxpy_kernel_vector(vLen, a_vec, x_cache, y_cache);
        vector_store(y_cache, y + i, size_actual);
    }
    vector_free(x_cache);
    vector_free(y_cache);
}
__global__
void daxpy_kernel_vector(uint64_t n, lvector double a, lvector double *x, lvector double *y)
{
    for (uint64_t i = 0; i < n; ++i){
        lvector double x_vec = vec_ld(0, x + i);
        lvector double y_vec = vec_ld(0, y + i);
        y_vec = vec_mula(x_vec, a, y_vec);
        vec_st(y_vec, 0, y + i);
    }
}
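For intuition, each lvector double holds M_MT_VecLen = 16 doubles (the width used in the reference code below), and vec_mula performs an element-wise multiply-add across those lanes. A plain scalar sketch of what daxpy_kernel_vector computes, with the vector unrolled into an array, might look like this:

#include <stdint.h>

#define M_MT_VecLen 16   /* lanes per lvector double, per the reference code */

/* Scalar equivalent of one daxpy_kernel_vector iteration:
 * vec_ld   -> read 16 consecutive doubles,
 * vec_mula -> y = x * a + y, lane by lane,
 * vec_st   -> write the 16 doubles back. */
static void daxpy_one_vector(double a, const double *x, double *y){
    for (int lane = 0; lane < M_MT_VecLen; ++lane)
        y[lane] = x[lane] * a + y[lane];
}

/* daxpy over n vectors (n * 16 doubles), mirroring daxpy_kernel_vector. */
static void daxpy_vectors_scalar(uint64_t n, double a, const double *x, double *y){
    for (uint64_t i = 0; i < n; ++i)
        daxpy_one_vector(a, x + i * M_MT_VecLen, y + i * M_MT_VecLen);
}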

[figure: daxpy_l3.png]

daxpy_l4.dev.c

Multi-threaded parallelization + on-core AM caching + vectorization with vector intrinsics + loop unrolling

__global__
void daxpy_threads(uint64_t n, double a, double *x, double *y){
    int threadId = get_thread_id();
    int threadsNum = get_group_size();
    uint64_t n_p = n / threadsNum;
    uint64_t extras = n % threadsNum;
    uint64_t offset;
    if(threadId < extras){
        n_p++;
        offset = threadId * n_p;
    }else{
        offset = threadId * (n_p + 1) - (threadId - extras);
    }
    //daxpy_kernel(n_p, a, x + offset, y + offset);
    daxpy_singe(n_p, a, x + offset, y + offset);
}
__global__
void daxpy_singe(uint64_t n, double a, double *x, double *y){
    uint64_t n_cache = M_MT_CacheNum * M_MT_VecLen;
    lvector double *x_cache = (lvector double *)vector_malloc(n_cache * sizeof(double));
    lvector double *y_cache = (lvector double *)vector_malloc(n_cache * sizeof(double));
    uint64_t n_actual = 0;
    lvector double a_vec = vec_svbcast(a);
    for(uint64_t i = 0; i < n; i += n_actual){
        n_actual = (n - i) >= n_cache ? n_cache : (n - i);
        uint64_t size_actual = n_actual * sizeof(double);
        vector_load(x + i, x_cache, size_actual);
        vector_load(y + i, y_cache, size_actual);
        uint64_t vLen = n_actual / M_MT_VecLen;
        vLen = ((n_actual % M_MT_VecLen) == 0) ? vLen : (vLen + 1);
        daxpy_cache(vLen, a_vec, x_cache, y_cache);
        vector_store(y_cache, y + i, size_actual);
    }
    vector_free(x_cache);
    vector_free(y_cache);
}
__global__
void daxpy_cache(uint64_t n, lvector double a, lvector double *x, lvector double *y)
{
    uint64_t i = 0;
    for(i = 0; i < n; i += 2){
        lvector double x_vec_0 = vec_ld(0, x + i + 0);
        lvector double x_vec_1 = vec_ld(0, x + i + 1);
        lvector double y_vec_0 = vec_ld(0, y + i + 0);
        lvector double y_vec_1 = vec_ld(0, y + i + 1);
        y_vec_0 = vec_mula(x_vec_0, a, y_vec_0);
        y_vec_1 = vec_mula(x_vec_1, a, y_vec_1);
        vec_st(y_vec_0, 0, y + i + 0);
        vec_st(y_vec_1, 0, y + i + 1);
    }
    //TODO:tail
}

The two-level loop unrolling in this C code does not gain much; it is better to unroll more deeply, as the reference code below does. A short sketch of the unrolling pattern, including the tail loop that the //TODO above refers to, follows.
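As an illustration of the pattern only (plain scalar C rather than the device intrinsics): a 4-way unrolled DAXPY whose main loop stops while four full elements remain, with a remainder loop covering the tail. The same structure carries over to the lvector version above, where each element is a 16-double vector.

#include <stdint.h>

/* 4-way unrolled scalar daxpy with an explicit tail loop. */
static void daxpy_unroll4(uint64_t n, double a, const double *x, double *y){
    uint64_t i = 0;
    /* main unrolled body: iterate while at least 4 elements remain */
    for (; i + 4 <= n; i += 4){
        y[i + 0] = a * x[i + 0] + y[i + 0];
        y[i + 1] = a * x[i + 1] + y[i + 1];
        y[i + 2] = a * x[i + 2] + y[i + 2];
        y[i + 3] = a * x[i + 3] + y[i + 3];
    }
    /* tail: at most 3 leftover elements */
    for (; i < n; ++i)
        y[i] = a * x[i] + y[i];
}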

[figure: daxpy_l4.png]

Reference code

Multi-threaded parallelization

__global__
void daxpy_kernel(uint64_t n, double a, double *x, double *y){
    int threadId = get_thread_id();
    int threadsNum = get_group_size();
    if(threadId >= threadsNum) return;
    uint64_t n_p = n / threadsNum;
    uint64_t extras = n % threadsNum;
    uint64_t offset;
    if(threadId < extras) {
        n_p++;
        offset = threadId * n_p;
    }else{
        offset = threadId * (n_p + 1) - (threadId - extras);
    }
    // daxpy_single: per-thread body (not shown in this listing; in this first
    // variant it is presumably just the plain scalar loop from daxpy.dev.c)
    daxpy_single(n_p, a, x + offset, y + offset);
}

Multi-threaded parallelization + on-core SM caching

#define M_MT_CacheNum 2048 // (2048 * 8) / 1024 = 16KB

static inline
void daxpy_single(uint64_t n, double a, double *x, double *y){
    //malloc cache memory
    double x_cache[M_MT_CacheNum];
    double y_cache[M_MT_CacheNum];
    uint64_t n_cache = M_MT_CacheNum;
    uint64_t n_actual = 0;
    uint64_t i = 0;
    for( i = 0; i < n; i += n_actual){
        n_actual = (n - i) >= n_cache ? n_cache : (n - i);
        uint64_t size_actual = n_actual * sizeof(double);
        scalar_load(x + i, x_cache, size_actual);
        scalar_load(y + i, y_cache, size_actual);
        // daxpy_cache: scalar loop over the SM-resident tile (not shown in this listing)
        daxpy_cache(n_actual, a, x_cache, y_cache);
        scalar_store(y_cache, y + i, size_actual);
    }
}

__global__
void daxpy_kernel(uint64_t n, double a, double *x, double *y){
    int threadId = get_thread_id();
    int threadsNum = get_group_size();
    if(threadId >= threadsNum) return;
    uint64_t n_p = n / threadsNum;
    uint64_t extras = n % threadsNum;
    uint64_t offset;
    if(threadId < extras) {
        n_p++;
        offset = threadId * n_p;
    }else{
        offset = threadId * (n_p + 1) - (threadId - extras);
    }
    daxpy_single(n_p, a, x + offset, y + offset);
}

Multi-threaded parallelization + on-core AM caching + vectorization with vector intrinsics

static inline
void daxpy_cache(uint64_t n, \
                 lvector double a, \
                 lvector double *x, \
                 lvector double *y){
    for (uint64_t i = 0; i < n; ++i){
        lvector double x_vec = vec_ld(0, x + i);
        lvector double y_vec = vec_ld(0, y + i);
        y_vec = vec_mula(x_vec, a, y_vec);
        vec_st(y_vec, 0, y + i);
    }
}

//vector align
#define M_MT_CacheNum (2048) // (2048 * 16 * 8) / 1024 = 256KB
#define M_MT_VecLen (16)

static inline
void daxpy_single(uint64_t n, double a, double *x, double *y){
    uint64_t n_cache = M_MT_CacheNum * M_MT_VecLen;
    //malloc cache memory
    lvector double *x_cache = (lvector double *)vector_malloc(\
            n_cache * sizeof(double));
    lvector double *y_cache = (lvector double *)vector_malloc(\
            n_cache * sizeof(double));
    //
    uint64_t n_actual = 0;
    uint64_t i = 0;
    for( i = 0; i < n; i += n_actual){
        n_actual = (n - i) >= n_cache ? n_cache : (n - i);
        uint64_t size_actual = n_actual * sizeof(double);
        vector_load(x + i, x_cache, size_actual);
        vector_load(y + i, y_cache, size_actual);
        uint64_t vLen = n_actual / M_MT_VecLen;
        vLen = ((n_actual % M_MT_VecLen) == 0) ? vLen : (vLen + 1);
        lvector double a_vec = vec_svbcast(a);
        daxpy_cache(vLen, a_vec, x_cache, y_cache);
        vector_store(y_cache, y + i, size_actual);
    }
    // free cache memory
    vector_free(x_cache);
    vector_free(y_cache);
}

__global__
void daxpy_kernel(uint64_t n, double a, double *x, double *y){
    int threadId = get_thread_id();
    int threadsNum = get_group_size();
    if(threadId >= threadsNum) return;
    uint64_t n_p = n / threadsNum;
    uint64_t extras = n % threadsNum;
    uint64_t offset;
    if(threadId < extras) {
        n_p++;
        offset = threadId * n_p;
    }else{
        offset = threadId * (n_p + 1) - (threadId - extras);
    }
    daxpy_single(n_p, a, x + offset, y + offset);
}
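One detail of this version worth spelling out: vLen is rounded up when n_actual is not a multiple of M_MT_VecLen, so the last vector of a chunk processes a few lanes past the valid data. That appears to be harmless here because both caches are allocated for the full n_cache doubles and vector_store writes back only size_actual bytes. The short host-side calculation below shows how large the tail chunk and the padding are, using thread 0's share (1721642 elements) from the 8-thread split above; the numbers are specific to this experiment.

#include <stdint.h>
#include <stdio.h>

#define M_MT_CacheNum 2048
#define M_MT_VecLen   16

int main(void){
    uint64_t n_p = 1721642;                          /* elements handled by thread 0 */
    uint64_t n_cache = M_MT_CacheNum * M_MT_VecLen;  /* 32768 doubles per chunk */
    uint64_t full_chunks = n_p / n_cache;
    uint64_t tail = n_p % n_cache;                   /* last, shorter chunk */
    uint64_t vLen = tail / M_MT_VecLen + ((tail % M_MT_VecLen) ? 1 : 0);
    printf("full chunks: %llu, tail elements: %llu, tail vLen: %llu (%llu padded lanes)\n",
           (unsigned long long)full_chunks, (unsigned long long)tail,
           (unsigned long long)vLen,
           (unsigned long long)(vLen * M_MT_VecLen - tail));
    return 0;
}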

Multi-threaded parallelization + on-core AM caching + vectorization with vector intrinsics + loop unrolling

//static inline
void daxpy_cache(uint64_t n, \
                 lvector double a, \
                 lvector double *x, \
                 lvector double *y){
    uint64_t i = 0;
#if 0
    // 2-way unroll (disabled)
    for (i = 0; i < n; i += 2){
        lvector double x_vec_0 = vec_ld(0, x + i + 0);
        lvector double x_vec_1 = vec_ld(0, x + i + 1);
        lvector double y_vec_0 = vec_ld(0, y + i + 0);
        lvector double y_vec_1 = vec_ld(0, y + i + 1);
        y_vec_0 = vec_mula(x_vec_0, a, y_vec_0);
        y_vec_1 = vec_mula(x_vec_1, a, y_vec_1);
        vec_st(y_vec_0, 0, y + i + 0);
        vec_st(y_vec_1, 0, y + i + 1);
    }
#endif
#if 0
    // 4-way unroll (disabled)
    for (i = 0; i < n; i += 4){
        lvector double x_vec_0 = vec_ld(0, x + i + 0);
        lvector double x_vec_1 = vec_ld(0, x + i + 1);
        lvector double x_vec_2 = vec_ld(0, x + i + 2);
        lvector double x_vec_3 = vec_ld(0, x + i + 3);
        lvector double y_vec_0 = vec_ld(0, y + i + 0);
        lvector double y_vec_1 = vec_ld(0, y + i + 1);
        lvector double y_vec_2 = vec_ld(0, y + i + 2);
        lvector double y_vec_3 = vec_ld(0, y + i + 3);
        y_vec_0 = vec_mula(x_vec_0, a, y_vec_0);
        y_vec_1 = vec_mula(x_vec_1, a, y_vec_1);
        y_vec_2 = vec_mula(x_vec_2, a, y_vec_2);
        y_vec_3 = vec_mula(x_vec_3, a, y_vec_3);
        vec_st(y_vec_0, 0, y + i + 0);
        vec_st(y_vec_1, 0, y + i + 1);
        vec_st(y_vec_2, 0, y + i + 2);
        vec_st(y_vec_3, 0, y + i + 3);
    }
#endif
#if 0
    // 6-way unroll (disabled; stride and bound fixed so the six vectors per
    // iteration stay in range, with the tail loop below covering the rest)
    for (i = 0; i + 6 <= n; i += 6){
        lvector double x_vec_0 = vec_ld(0, x + i + 0);
        lvector double x_vec_1 = vec_ld(0, x + i + 1);
        lvector double x_vec_2 = vec_ld(0, x + i + 2);
        lvector double x_vec_3 = vec_ld(0, x + i + 3);
        lvector double x_vec_4 = vec_ld(0, x + i + 4);
        lvector double x_vec_5 = vec_ld(0, x + i + 5);

        lvector double y_vec_0 = vec_ld(0, y + i + 0);
        lvector double y_vec_1 = vec_ld(0, y + i + 1);
        lvector double y_vec_2 = vec_ld(0, y + i + 2);
        lvector double y_vec_3 = vec_ld(0, y + i + 3);
        lvector double y_vec_4 = vec_ld(0, y + i + 4);
        lvector double y_vec_5 = vec_ld(0, y + i + 5);
        y_vec_0 = vec_mula(x_vec_0, a, y_vec_0);
        y_vec_1 = vec_mula(x_vec_1, a, y_vec_1);
        y_vec_2 = vec_mula(x_vec_2, a, y_vec_2);
        y_vec_3 = vec_mula(x_vec_3, a, y_vec_3);
        y_vec_4 = vec_mula(x_vec_4, a, y_vec_4);
        y_vec_5 = vec_mula(x_vec_5, a, y_vec_5);
        vec_st(y_vec_0, 0, y + i + 0);
        vec_st(y_vec_1, 0, y + i + 1);
        vec_st(y_vec_2, 0, y + i + 2);
        vec_st(y_vec_3, 0, y + i + 3);
        vec_st(y_vec_4, 0, y + i + 4);
        vec_st(y_vec_5, 0, y + i + 5);
    }
#endif
#if 1
    // 8-way unroll (active)
    for (i = 0; i < n; i += 8){
        lvector double x_vec_0 = vec_ld(0, x + i + 0);
        lvector double x_vec_1 = vec_ld(0, x + i + 1);
        lvector double x_vec_2 = vec_ld(0, x + i + 2);
        lvector double x_vec_3 = vec_ld(0, x + i + 3);
        lvector double x_vec_4 = vec_ld(0, x + i + 4);
        lvector double x_vec_5 = vec_ld(0, x + i + 5);
        lvector double x_vec_6 = vec_ld(0, x + i + 6);
        lvector double x_vec_7 = vec_ld(0, x + i + 7);

        lvector double y_vec_0 = vec_ld(0, y + i + 0);
        lvector double y_vec_1 = vec_ld(0, y + i + 1);
        lvector double y_vec_2 = vec_ld(0, y + i + 2);
        lvector double y_vec_3 = vec_ld(0, y + i + 3);
        lvector double y_vec_4 = vec_ld(0, y + i + 4);
        lvector double y_vec_5 = vec_ld(0, y + i + 5);
        lvector double y_vec_6 = vec_ld(0, y + i + 6);
        lvector double y_vec_7 = vec_ld(0, y + i + 7);
        y_vec_0 = vec_mula(x_vec_0, a, y_vec_0);
        y_vec_1 = vec_mula(x_vec_1, a, y_vec_1);
        y_vec_2 = vec_mula(x_vec_2, a, y_vec_2);
        y_vec_3 = vec_mula(x_vec_3, a, y_vec_3);
        y_vec_4 = vec_mula(x_vec_4, a, y_vec_4);
        y_vec_5 = vec_mula(x_vec_5, a, y_vec_5);
        y_vec_6 = vec_mula(x_vec_6, a, y_vec_6);
        y_vec_7 = vec_mula(x_vec_7, a, y_vec_7);
        vec_st(y_vec_0, 0, y + i + 0);
        vec_st(y_vec_1, 0, y + i + 1);
        vec_st(y_vec_2, 0, y + i + 2);
        vec_st(y_vec_3, 0, y + i + 3);
        vec_st(y_vec_4, 0, y + i + 4);
        vec_st(y_vec_5, 0, y + i + 5);
        vec_st(y_vec_6, 0, y + i + 6);
        vec_st(y_vec_7, 0, y + i + 7);
    }
#endif
    // leftover elements (with the i += 8 loop above, control only reaches here
    // once i >= n; the overshoot stays inside the n_cache-sized buffers)
    for (; i < n; i += 1){
        lvector double x_vec = vec_ld(0, x + i);
        lvector double y_vec = vec_ld(0, y + i);
        y_vec = vec_mula(x_vec, a, y_vec);
        vec_st(y_vec, 0, y + i);
    }
}

//vector align
#define M_MT_CacheNum (2048) // (2048 * 16 * 8) / 1024 = 256KB
#define M_MT_VecLen (16)

static inline
void daxpy_single(uint64_t n, double a, double *x, double *y){
    uint64_t n_cache = M_MT_CacheNum * M_MT_VecLen;
    //malloc cache memory
    lvector double *x_cache = (lvector double *)vector_malloc(\
            n_cache * sizeof(double));
    lvector double *y_cache = (lvector double *)vector_malloc(\
            n_cache * sizeof(double));
    //
    uint64_t n_actual = 0;
    uint64_t i = 0;
    for( i = 0; i < n; i += n_actual){
        n_actual = (n - i) >= n_cache ? n_cache : (n - i);
        uint64_t size_actual = n_actual * sizeof(double);
        vector_load(x + i, x_cache, size_actual);
        vector_load(y + i, y_cache, size_actual);
        uint64_t vLen = n_actual / M_MT_VecLen;
        vLen = ((n_actual % M_MT_VecLen) == 0) ? vLen : (vLen + 1);
        lvector double a_vec = vec_svbcast(a);
        daxpy_cache(vLen, a_vec, x_cache, y_cache);
        vector_store(y_cache, y + i, size_actual);
    }
    // free cache memory
    vector_free(x_cache);
    vector_free(y_cache);
}

__global__
void daxpy_kernel(uint64_t n, double a, double *x, double *y){
    int threadId = get_thread_id();
    int threadsNum = get_group_size();
    if(threadId >= threadsNum) return;
    uint64_t n_p = n / threadsNum;
    uint64_t extras = n % threadsNum;
    uint64_t offset;
    if(threadId < extras) {
        n_p++;
        offset = threadId * n_p;
    }else{
        offset = threadId * (n_p + 1) - (threadId - extras);
    }
    daxpy_single(n_p, a, x + offset, y + offset);
}