编写MT3000设备端代码,为下面的kernel函数接口进行编程实现
__global__ void kernel_evaluSin(uint64_t len,uint64_t coreNum,\
double *optBuf,double *resBuf)
- 使用AM缓存数据
- 使用向量Intrinsic编程
- 调用libvm向量sin函数
lvector double vm_sind16_u18(lvector double);
- 完成设备端代码的编译/链接/dat文件生成
- 只需使用一个设备端线程
- 要求代码能处理数组长度不对齐的情况
makefile
之前对 makefile 的理解模模糊糊,这次从头完整地写了一遍 makefile,感觉慢慢就熟悉了。
- host端
1
2
3
4
5
6
7
8
9
10
11
# Host-side build: compiles ${EXE}.c with gcc and links it against the
# hthreads host library.
MT3k_ENV=/vol8/appsoftware/mt3000_programming_env-inbox/mt3000_programming_env-20230315
ENV_ROOT=${MT3k_ENV}/hthreads
MT_LIBVM=/vol8/home/hnu_ydy/libvm_expr/libvm_mt_public
EXE=helloSin.hos

# ALL and clean name actions, not files, so a stray file called "ALL" or
# "clean" must not suppress them.
.PHONY: ALL clean

ALL:
	gcc -O2 ${EXE}.c -std=c99 -I./ -I$(ENV_ROOT)/include -I$(MT_LIBVM)/include -I$(MT_LIBVM)/lib $(ENV_ROOT)/lib/libhthread_host.a -lpthread -fopenmp -lm -o ${EXE}

# -f keeps clean from failing when the executable has not been built yet.
clean:
	rm -f ${EXE}
- device端
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# Device-side build: compile helloSin.dev.c with the MT-3000 toolchain, link it
# against libvm and the hthreads device library, then run makedat on the result.
MT3k_ENV=/vol8/appsoftware/mt3000_programming_env-inbox/mt3000_programming_env-20230315
ENV_ROOT=${MT3k_ENV}/hthreads
GCCROOT=${MT3k_ENV}/dsp_compiler
MT_LIBVM=/vol8/home/hnu_ydy/libvm_expr/libvm_mt_public
CC=MT-3000-gcc
AR=MT-3000-ar
LD=MT-3000-ld
AS=MT-3000-as
OB=MT-3000-objdump
# Tool name kept in its own variable: DAT is reserved below for the output
# file, and assigning both to DAT silently dropped the tool name.
MAKEDAT=MT-3000-makedat
# ":=" is required because the value refers to the variable's own previous
# (environment) value; "$(LD_LIBRARY_PATH)" is used because a bare
# "$LD_LIBRARY_PATH" is parsed by make as "$(L)D_LIBRARY_PATH".
export LD_LIBRARY_PATH := $(MT3k_ENV)/third-party-lib/:$(LD_LIBRARY_PATH)
CFLAGS=-c -O2 -g -gdwarf-2 -fenable-m3000 -ffunction-sections -flax-vector-conversions -I./ -I$(ENV_ROOT)/include -I$(GCCROOT)/include/
LDFLAGS= -L$(ENV_ROOT)/lib --gc-sections -Tdsp.lds
SRC=helloSin.dev.c
OBJ=helloSin.dev.o
EXE=helloSin.dev.out
DAT=helloSin.dev.dat

# ALL and clean name actions, not files.
.PHONY: ALL clean

# Link the device executable, then run makedat on it.
ALL: $(EXE)
	$(GCCROOT)/bin/$(MAKEDAT) -J $(EXE)

$(OBJ): $(SRC)
	$(GCCROOT)/bin/$(CC) -I$(MT_LIBVM)/include $(CFLAGS) $(SRC) -o $(OBJ)

$(EXE): $(OBJ)
	$(GCCROOT)/bin/$(LD) $(LDFLAGS) $(OBJ) $(MT_LIBVM)/lib/libvm.a $(ENV_ROOT)/lib/libhthread_device.a $(GCCROOT)/lib/vlib3000.a $(GCCROOT)/lib/slib3000.a -o $(EXE)

# -f keeps clean from failing when some outputs do not exist yet.
clean:
	rm -f $(EXE) $(OBJ) $(DAT)
设备端
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
__global__ void kernel_evaluSin(uint64_t len,uint64_t coreNum,\
double *optBuf,double *resBuf)
{
int core_id = get_thread_id();
uint64_t offset = core_id * len;
double *optBuf_fix = &optBuf[offset];
double *resBuf_fix = &resBuf[offset];
size_t dataNum = (16 * 1000 + 345 ) * 24 + 13;
//lvector double *cache=(lvector double *)vector_malloc(cacheSize);
lvector double * src1 = vector_malloc(len*sizeof(double));
lvector double * src2 = vector_malloc(len*sizeof(double));
vector_load(optBuf_fix,src1,len*sizeof(double));
long i = 0;
for(i = 0; i < dataNum/16; i++){
src1[i] = (double)((i % 10000) - 5000);
src2[i] = vm_asind16_u10(src1[i]);
}
vector_store(src2,resBuf_fix,len*sizeof(double));
vector_free(src1);
vector_free(src2);
}主机端
1 | //check ulp |
结果
