/* Vectorized DAXPY over one staged tile: y = a * x + y.
 * n counts lvector elements; a is the scalar already broadcast into an
 * lvector. Earlier unroll factors (2, 4, 6) are kept under #if 0 for
 * comparison; the 8-way unrolled variant is the one enabled. */
void daxpy_cache(uint64_t n,
                 lvector double a,
                 lvector double *x,
                 lvector double *y)
{
    uint64_t i = 0;

#if 0   /* 2-way unrolled variant (disabled) */
    for (i = 0; i < n; i += 2) {
        lvector double x_vec_0 = vec_ld(0, x + i + 0);
        lvector double x_vec_1 = vec_ld(0, x + i + 1);

        lvector double y_vec_0 = vec_ld(0, y + i + 0);
        lvector double y_vec_1 = vec_ld(0, y + i + 1);

        y_vec_0 = vec_mula(x_vec_0, a, y_vec_0);
        y_vec_1 = vec_mula(x_vec_1, a, y_vec_1);

        vec_st(y_vec_0, 0, y + i + 0);
        vec_st(y_vec_1, 0, y + i + 1);
    }
#endif

#if 0   /* 4-way unrolled variant (disabled) */
    for (i = 0; i < n; i += 4) {
        lvector double x_vec_0 = vec_ld(0, x + i + 0);
        lvector double x_vec_1 = vec_ld(0, x + i + 1);
        lvector double x_vec_2 = vec_ld(0, x + i + 2);
        lvector double x_vec_3 = vec_ld(0, x + i + 3);

        lvector double y_vec_0 = vec_ld(0, y + i + 0);
        lvector double y_vec_1 = vec_ld(0, y + i + 1);
        lvector double y_vec_2 = vec_ld(0, y + i + 2);
        lvector double y_vec_3 = vec_ld(0, y + i + 3);

        y_vec_0 = vec_mula(x_vec_0, a, y_vec_0);
        y_vec_1 = vec_mula(x_vec_1, a, y_vec_1);
        y_vec_2 = vec_mula(x_vec_2, a, y_vec_2);
        y_vec_3 = vec_mula(x_vec_3, a, y_vec_3);

        vec_st(y_vec_0, 0, y + i + 0);
        vec_st(y_vec_1, 0, y + i + 1);
        vec_st(y_vec_2, 0, y + i + 2);
        vec_st(y_vec_3, 0, y + i + 3);
    }
#endif

#if 0   /* 6-way unrolled variant (disabled); the increment must match the
         * unroll factor, so it is 6 here rather than the 8 in the original
         * listing */
    for (i = 0; i < n; i += 6) {
        lvector double x_vec_0 = vec_ld(0, x + i + 0);
        lvector double x_vec_1 = vec_ld(0, x + i + 1);
        lvector double x_vec_2 = vec_ld(0, x + i + 2);
        lvector double x_vec_3 = vec_ld(0, x + i + 3);
        lvector double x_vec_4 = vec_ld(0, x + i + 4);
        lvector double x_vec_5 = vec_ld(0, x + i + 5);

        lvector double y_vec_0 = vec_ld(0, y + i + 0);
        lvector double y_vec_1 = vec_ld(0, y + i + 1);
        lvector double y_vec_2 = vec_ld(0, y + i + 2);
        lvector double y_vec_3 = vec_ld(0, y + i + 3);
        lvector double y_vec_4 = vec_ld(0, y + i + 4);
        lvector double y_vec_5 = vec_ld(0, y + i + 5);

        y_vec_0 = vec_mula(x_vec_0, a, y_vec_0);
        y_vec_1 = vec_mula(x_vec_1, a, y_vec_1);
        y_vec_2 = vec_mula(x_vec_2, a, y_vec_2);
        y_vec_3 = vec_mula(x_vec_3, a, y_vec_3);
        y_vec_4 = vec_mula(x_vec_4, a, y_vec_4);
        y_vec_5 = vec_mula(x_vec_5, a, y_vec_5);

        vec_st(y_vec_0, 0, y + i + 0);
        vec_st(y_vec_1, 0, y + i + 1);
        vec_st(y_vec_2, 0, y + i + 2);
        vec_st(y_vec_3, 0, y + i + 3);
        vec_st(y_vec_4, 0, y + i + 4);
        vec_st(y_vec_5, 0, y + i + 5);
    }
#endif

#if 1   /* 8-way unrolled variant (enabled) */
    for (i = 0; i < n; i += 8) {
        lvector double x_vec_0 = vec_ld(0, x + i + 0);
        lvector double x_vec_1 = vec_ld(0, x + i + 1);
        lvector double x_vec_2 = vec_ld(0, x + i + 2);
        lvector double x_vec_3 = vec_ld(0, x + i + 3);
        lvector double x_vec_4 = vec_ld(0, x + i + 4);
        lvector double x_vec_5 = vec_ld(0, x + i + 5);
        lvector double x_vec_6 = vec_ld(0, x + i + 6);
        lvector double x_vec_7 = vec_ld(0, x + i + 7);

        lvector double y_vec_0 = vec_ld(0, y + i + 0);
        lvector double y_vec_1 = vec_ld(0, y + i + 1);
        lvector double y_vec_2 = vec_ld(0, y + i + 2);
        lvector double y_vec_3 = vec_ld(0, y + i + 3);
        lvector double y_vec_4 = vec_ld(0, y + i + 4);
        lvector double y_vec_5 = vec_ld(0, y + i + 5);
        lvector double y_vec_6 = vec_ld(0, y + i + 6);
        lvector double y_vec_7 = vec_ld(0, y + i + 7);

        y_vec_0 = vec_mula(x_vec_0, a, y_vec_0);
        y_vec_1 = vec_mula(x_vec_1, a, y_vec_1);
        y_vec_2 = vec_mula(x_vec_2, a, y_vec_2);
        y_vec_3 = vec_mula(x_vec_3, a, y_vec_3);
        y_vec_4 = vec_mula(x_vec_4, a, y_vec_4);
        y_vec_5 = vec_mula(x_vec_5, a, y_vec_5);
        y_vec_6 = vec_mula(x_vec_6, a, y_vec_6);
        y_vec_7 = vec_mula(x_vec_7, a, y_vec_7);

        vec_st(y_vec_0, 0, y + i + 0);
        vec_st(y_vec_1, 0, y + i + 1);
        vec_st(y_vec_2, 0, y + i + 2);
        vec_st(y_vec_3, 0, y + i + 3);
        vec_st(y_vec_4, 0, y + i + 4);
        vec_st(y_vec_5, 0, y + i + 5);
        vec_st(y_vec_6, 0, y + i + 6);
        vec_st(y_vec_7, 0, y + i + 7);
    }
#endif

    /* Remainder loop: process any lvector elements left over by the
     * unrolled loop, one vector per iteration. */
    for (; i < n; i += 1) {
        lvector double x_vec = vec_ld(0, x + i);
        lvector double y_vec = vec_ld(0, y + i);
        y_vec = vec_mula(x_vec, a, y_vec);
        vec_st(y_vec, 0, y + i);
    }
}
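For reference, every variant above computes the standard DAXPY update y[i] = a * x[i] + y[i], with vec_mula performing the per-element multiply-add on one lvector at a time. A minimal scalar sketch that can serve as a host-side check of the vectorized path (the name daxpy_ref is ours and is not part of the original code):

#include <stdint.h>

/* Scalar reference DAXPY, for validating the vectorized tile kernel:
 * y[i] = a * x[i] + y[i] for i in [0, n). Compiles on any host. */
static void daxpy_ref(uint64_t n, double a, const double *x, double *y)
{
    for (uint64_t i = 0; i < n; i++) {
        y[i] = a * x[i] + y[i];
    }
}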
/* Tile geometry for staging DAXPY through the buffers allocated with
 * vector_malloc: M_MT_VecLen doubles per lvector element and M_MT_CacheNum
 * lvector elements per tile, i.e. 2048 * 16 = 32768 doubles (256 KiB)
 * per buffer. */
#define M_MT_CacheNum (2048)
#define M_MT_VecLen   (16)
/* Per-core DAXPY: process n doubles in tiles of n_cache doubles, staging
 * each tile through vector buffers before calling daxpy_cache on it. */
static inline void daxpy_single(uint64_t n, double a, double *x, double *y)
{
    uint64_t n_cache = M_MT_CacheNum * M_MT_VecLen;
    lvector double *x_cache =
        (lvector double *)vector_malloc(n_cache * sizeof(double));
    lvector double *y_cache =
        (lvector double *)vector_malloc(n_cache * sizeof(double));

    uint64_t n_actual = 0;
    uint64_t i = 0;
    for (i = 0; i < n; i += n_actual) {
        /* Size of the current tile; the last tile may be shorter. */
        n_actual = (n - i) >= n_cache ? n_cache : (n - i);
        uint64_t size_actual = n_actual * sizeof(double);

        /* Stage the x and y tiles into the vector buffers. */
        vector_load(x + i, x_cache, size_actual);
        vector_load(y + i, y_cache, size_actual);

        /* Number of lvector elements in the tile, rounded up. Only
         * size_actual bytes are stored back, so any extra vector processed
         * inside the padded buffer never reaches memory. */
        uint64_t vLen = n_actual / M_MT_VecLen;
        vLen = ((n_actual % M_MT_VecLen) == 0) ? vLen : (vLen + 1);

        lvector double a_vec = vec_svbcast(a);
        daxpy_cache(vLen, a_vec, x_cache, y_cache);

        /* Write the updated y tile back. */
        vector_store(y_cache, y + i, size_actual);
    }

    vector_free(x_cache);
    vector_free(y_cache);
}
__global__ void daxpy_kernel(uint64_t n, double a, double *x, double *y)
{
    int threadId = get_thread_id();
    int threadsNum = get_group_size();
    if (threadId >= threadsNum)
        return;

    /* Block-distribute the n elements: every thread gets n / threadsNum
     * elements, and the first n % threadsNum threads take one extra. */
    uint64_t n_p = n / threadsNum;
    uint64_t extras = n % threadsNum;
    uint64_t offset;
    if (threadId < extras) {
        n_p++;
        offset = threadId * n_p;
    } else {
        offset = threadId * (n_p + 1) - (threadId - extras);
    }

    daxpy_single(n_p, a, x + offset, y + offset);
}
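The offset arithmetic above keeps each thread's block contiguous while handing one extra element to each of the first n % threadsNum threads. The following host-side sketch (check_partition and its test values are hypothetical, not part of the kernel code) replays the same arithmetic and asserts that the per-thread blocks tile [0, n) with no gaps or overlap:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Replays the per-thread partitioning used in daxpy_kernel and checks that
 * consecutive blocks cover [0, n) exactly. Hypothetical test helper. */
static void check_partition(uint64_t n, int threadsNum)
{
    uint64_t covered = 0;
    for (int threadId = 0; threadId < threadsNum; threadId++) {
        uint64_t n_p = n / threadsNum;
        uint64_t extras = n % threadsNum;
        uint64_t offset;
        if ((uint64_t)threadId < extras) {
            n_p++;
            offset = (uint64_t)threadId * n_p;
        } else {
            offset = (uint64_t)threadId * (n_p + 1)
                   - ((uint64_t)threadId - extras);
        }
        assert(offset == covered);   /* blocks are contiguous */
        covered += n_p;
    }
    assert(covered == n);            /* blocks cover all n elements */
}

int main(void)
{
    check_partition(1000003, 24);    /* e.g. 24 cores, n not divisible */
    printf("partitioning covers [0, n) with no gaps or overlap\n");
    return 0;
}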