lrh12580 committed on
Commit
5cb6c4b
·
1 Parent(s): 4f29f93

first commit

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. README.md +1 -0
  2. data.zip +3 -0
  3. env.sh +1 -0
  4. workloads/common/Makefile +12 -0
  5. workloads/common/cpu_timestamps.cpp +38 -0
  6. workloads/common/cpu_timestamps.h +22 -0
  7. workloads/common/cpu_timestamps.o +0 -0
  8. workloads/common/cupti_add.cpp +242 -0
  9. workloads/common/cupti_add.h +56 -0
  10. workloads/common/cupti_add.o +0 -0
  11. workloads/common/make.config +10 -0
  12. workloads/micro/async/2DCONV/2DConvolution.cu +392 -0
  13. workloads/micro/async/2DCONV/Makefile +12 -0
  14. workloads/micro/async/2DCONV/run.sh +2 -0
  15. workloads/micro/async/2DCONV/run_large.sh +1 -0
  16. workloads/micro/async/2DCONV/run_medium.sh +1 -0
  17. workloads/micro/async/2DCONV/run_mega.sh +1 -0
  18. workloads/micro/async/2DCONV/run_small.sh +1 -0
  19. workloads/micro/async/2DCONV/run_super.sh +1 -0
  20. workloads/micro/async/2DCONV/run_tiny.sh +1 -0
  21. workloads/micro/async/3DCONV/3DConvolution.cu +449 -0
  22. workloads/micro/async/3DCONV/Makefile +12 -0
  23. workloads/micro/async/3DCONV/run.sh +2 -0
  24. workloads/micro/async/3DCONV/run_large.sh +1 -0
  25. workloads/micro/async/3DCONV/run_medium.sh +1 -0
  26. workloads/micro/async/3DCONV/run_mega.sh +1 -0
  27. workloads/micro/async/3DCONV/run_small.sh +1 -0
  28. workloads/micro/async/3DCONV/run_super.sh +1 -0
  29. workloads/micro/async/3DCONV/run_tiny.sh +1 -0
  30. workloads/micro/async/gemm/Makefile +12 -0
  31. workloads/micro/async/gemm/gemm.cu +277 -0
  32. workloads/micro/async/gemm/run.sh +3 -0
  33. workloads/micro/async/gemm/run_large.sh +1 -0
  34. workloads/micro/async/gemm/run_medium.sh +1 -0
  35. workloads/micro/async/gemm/run_mega.sh +1 -0
  36. workloads/micro/async/gemm/run_small.sh +1 -0
  37. workloads/micro/async/gemm/run_super.sh +1 -0
  38. workloads/micro/async/gemm/run_tiny.sh +1 -0
  39. workloads/micro/async/gemm_perf/Makefile +12 -0
  40. workloads/micro/async/gemm_perf/gemm +0 -0
  41. workloads/micro/async/gemm_perf/gemm.cu +277 -0
  42. workloads/micro/async/gemm_perf/run.sh +3 -0
  43. workloads/micro/async/gemm_perf/run_large.sh +1 -0
  44. workloads/micro/async/gemm_perf/run_medium.sh +1 -0
  45. workloads/micro/async/gemm_perf/run_mega.sh +1 -0
  46. workloads/micro/async/gemm_perf/run_small.sh +1 -0
  47. workloads/micro/async/gemm_perf/run_super.sh +1 -0
  48. workloads/micro/async/gemm_perf/run_tiny.sh +1 -0
  49. workloads/micro/async/gemv/Makefile +12 -0
  50. workloads/micro/async/gemv/gemv.cu +269 -0
README.md ADDED
@@ -0,0 +1 @@
 
 
1
+ # uvm_async_bench
data.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24dcfa78fe0a79ebf4b189d3b2c8449842f15a9f30b7b781c400c96ba525756a
3
+ size 8411674489
env.sh ADDED
@@ -0,0 +1 @@
 
 
1
+ export UVMAsyncBench_BASE=$(pwd)
workloads/common/Makefile ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
include ./make.config

NVCC = $(CUDA_DIR)/bin/nvcc
NVCC_FLAGS = -I$(CUDA_DIR)/include -I$(CUPTI_INCLUDE)

# Build both helper objects used by every benchmark.
# FIX: the targets were previously spelled "cpu_timestapms.o" (typo), so the
# rule never matched the file it produced (cpu_timestamps.o) and make
# recompiled it on every invocation.
all: cpu_timestamps.o cupti_add.o

cpu_timestamps.o: cpu_timestamps.cpp
	$(NVCC) $(NVCC_FLAGS) -c cpu_timestamps.cpp

cupti_add.o: cupti_add.cpp
	$(NVCC) $(NVCC_FLAGS) -c cupti_add.cpp
workloads/common/cpu_timestamps.cpp ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "cpu_timestamps.h"
2
+
3
+ void startCPU() {
4
+ struct timespec tv;
5
+ if(clock_gettime(CLOCK_REALTIME, &tv))
6
+ printf("error clock_gettime\n");
7
+ startCPUTime = (tv.tv_sec * 1.0e9 + tv.tv_nsec);
8
+ }
9
+
10
+
11
+
12
+ void endCPU() {
13
+ struct timespec tv;
14
+ if(clock_gettime(CLOCK_REALTIME, &tv))
15
+ printf("error clock_gettime\n");
16
+
17
+ endCPUTime = (tv.tv_sec * 1.0e9 + tv.tv_nsec);
18
+ //endCPUTimestamp1 = std::chrono::system_clock::now();
19
+ printf("CPU_Times,%lu,%lu,%lu\n", startCPUTime, endCPUTime, endCPUTime-startCPUTime);
20
+ printf("Overlap_Times,%lu,%lu,%lu\n", overlapStartCPUTime, overlapEndCPUTime, overlapEndCPUTime - overlapStartCPUTime);
21
+ }
22
+
23
+ void overlapStartCPU()
24
+ {
25
+ struct timespec tv;
26
+ if (clock_gettime(CLOCK_REALTIME, &tv))
27
+ printf("error clock_gettime\n");
28
+ overlapStartCPUTime = (tv.tv_sec * 1.0e9 + tv.tv_nsec);
29
+ }
30
+
31
+ void overlapEndCPU()
32
+ {
33
+ struct timespec tv;
34
+ if (clock_gettime(CLOCK_REALTIME, &tv))
35
+ printf("error clock_gettime\n");
36
+
37
+ overlapEndCPUTime = (tv.tv_sec * 1.0e9 + tv.tv_nsec);
38
+ }
workloads/common/cpu_timestamps.h ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#ifndef CPU_TIMESTAMP_
#define CPU_TIMESTAMP_

#include <time.h>
#include <sys/time.h>
#include <stdio.h>
#include <stdint.h>
#include <error.h>

// Wall-clock timestamps in nanoseconds (CLOCK_REALTIME).
// NOTE(review): these are `static`, so every translation unit including this
// header gets its own private copies; only the copies compiled into
// cpu_timestamps.cpp are read/written by the functions below. Consider
// `extern` declarations with definitions in the .cpp instead — confirm no
// other file relies on its private copy.
static uint64_t startCPUTime;
static uint64_t endCPUTime;

static uint64_t overlapStartCPUTime = 0;
static uint64_t overlapEndCPUTime = 0;

// Record the start/end of the measured region; endCPU() prints CSV rows.
void startCPU();
void endCPU();

// Record the start/end of the overlap window reported by endCPU().
void overlapStartCPU();
void overlapEndCPU();

#endif
workloads/common/cpu_timestamps.o ADDED
Binary file (4.15 kB). View file
 
workloads/common/cupti_add.cpp ADDED
@@ -0,0 +1,242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#include <stdlib.h>  /* free(): release the buffer returned by __cxa_demangle */

#include "cupti_add.h"
2
+
3
+ static const char *
4
+ getMemcpyKindString(CUpti_ActivityMemcpyKind kind)
5
+ {
6
+ switch (kind)
7
+ {
8
+ case CUPTI_ACTIVITY_MEMCPY_KIND_HTOD:
9
+ return "HtoD";
10
+ case CUPTI_ACTIVITY_MEMCPY_KIND_DTOH:
11
+ return "DtoH";
12
+ case CUPTI_ACTIVITY_MEMCPY_KIND_HTOA:
13
+ return "HtoA";
14
+ case CUPTI_ACTIVITY_MEMCPY_KIND_ATOH:
15
+ return "AtoH";
16
+ case CUPTI_ACTIVITY_MEMCPY_KIND_ATOA:
17
+ return "AtoA";
18
+ case CUPTI_ACTIVITY_MEMCPY_KIND_ATOD:
19
+ return "AtoD";
20
+ case CUPTI_ACTIVITY_MEMCPY_KIND_DTOA:
21
+ return "DtoA";
22
+ case CUPTI_ACTIVITY_MEMCPY_KIND_DTOD:
23
+ return "DtoD";
24
+ case CUPTI_ACTIVITY_MEMCPY_KIND_HTOH:
25
+ return "HtoH";
26
+ default:
27
+ break;
28
+ }
29
+
30
+ return "<unknown>";
31
+ }
32
+
33
+ static const char *
34
+ getUvmCounterKindString(CUpti_ActivityUnifiedMemoryCounterKind kind)
35
+ {
36
+ switch (kind)
37
+ {
38
+ case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD:
39
+ return "BYTES_TRANSFER_HTOD";
40
+ case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH:
41
+ return "BYTES_TRANSFER_DTOH";
42
+ case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT:
43
+ return "CPU_PAGE_FAULTS";
44
+ case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT:
45
+ return "GPU_PAGE_FAULTS";
46
+ default:
47
+ break;
48
+ }
49
+ return "<unknown>";
50
+ }
51
+
52
+ static void
53
+ printActivity(CUpti_Activity *record)
54
+ {
55
+ switch (record->kind)
56
+ {
57
+ case CUPTI_ACTIVITY_KIND_KERNEL:
58
+ {
59
+ int status;
60
+ CUpti_ActivityKernel4 *kernel = (CUpti_ActivityKernel4 *)record;
61
+ printf("KERNEL %s, %llu, %llu, %llu\n",
62
+ abi::__cxa_demangle(kernel->name, 0, 0, &status),
63
+ (unsigned long long)(kernel->start),
64
+ (unsigned long long)(kernel->end),
65
+ (unsigned long long)(kernel->end) - (kernel->start));
66
+ break;
67
+ }
68
+ case CUPTI_ACTIVITY_KIND_RUNTIME:
69
+ {
70
+ CUpti_ActivityAPI *api = (CUpti_ActivityAPI *)record;
71
+ const char *callback_name;
72
+ cuptiGetCallbackName(CUPTI_CB_DOMAIN_RUNTIME_API, api->cbid, &callback_name);
73
+ // printf("RUNTIME %s (cbid=%u) [ %llu - %llu ] process %u, thread %u, correlation %u\n",
74
+ // callback_name, api->cbid,
75
+ // (unsigned long long)(api->start - startTimestamp),
76
+ // (unsigned long long)(api->end - startTimestamp),
77
+ // api->processId, api->threadId, api->correlationId);
78
+ printf("RUNTIME %s (cbid=%u), %llu,%llu,%llu, process %u, thread %u, correlation %u\n",
79
+ callback_name, api->cbid,
80
+ (unsigned long long)(api->start),
81
+ (unsigned long long)(api->end),
82
+ (unsigned long long)(api->end - api->start),
83
+ api->processId, api->threadId, api->correlationId);
84
+ break;
85
+ }
86
+ case CUPTI_ACTIVITY_KIND_MEMCPY:
87
+ {
88
+ CUpti_ActivityMemcpy4 *memcpy = (CUpti_ActivityMemcpy4 *)record;
89
+ printf("MEMCPY %s, size %llu, %llu, %llu, %llu\n",
90
+ getMemcpyKindString((CUpti_ActivityMemcpyKind)memcpy->copyKind),
91
+ (unsigned long long)memcpy->bytes,
92
+ (unsigned long long)(memcpy->start),
93
+ (unsigned long long)(memcpy->end),
94
+ (unsigned long long)(memcpy->end) - (memcpy->start));
95
+ break;
96
+ }
97
+ case CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER:
98
+ {
99
+ CUpti_ActivityUnifiedMemoryCounter2 *uvm = (CUpti_ActivityUnifiedMemoryCounter2 *)record;
100
+ printf("UVM MEMCPY %s, size %llu, %llu, %llu, %llu \n",
101
+ getUvmCounterKindString(uvm->counterKind),
102
+ (unsigned long long)uvm->value,
103
+ (unsigned long long)(uvm->start),
104
+ (unsigned long long)(uvm->end),
105
+ (unsigned long long)(uvm->end - uvm->start));
106
+ break;
107
+ }
108
+ }
109
+ }
110
+
111
+ void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size, size_t *maxNumRecords)
112
+ {
113
+ uint8_t *bfr = (uint8_t *)malloc(BUF_SIZE + ALIGN_SIZE);
114
+ if (bfr == NULL)
115
+ {
116
+ printf("Error: out of memory\n");
117
+ exit(-1);
118
+ }
119
+
120
+ *size = BUF_SIZE;
121
+ *buffer = ALIGN_BUFFER(bfr, ALIGN_SIZE);
122
+ *maxNumRecords = 0;
123
+ }
124
+
125
+ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, size_t size, size_t validSize)
126
+ {
127
+ CUptiResult status;
128
+ CUpti_Activity *record = NULL;
129
+ if (validSize > 0)
130
+ {
131
+ do
132
+ {
133
+ status = cuptiActivityGetNextRecord(buffer, validSize, &record);
134
+ if (status == CUPTI_SUCCESS)
135
+ {
136
+ printActivity(record);
137
+ }
138
+ else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED)
139
+ break;
140
+ else
141
+ {
142
+ CUPTI_CALL(status);
143
+ }
144
+ } while (1);
145
+
146
+ // report any records dropped from the queue
147
+ size_t dropped;
148
+ CUPTI_CALL(cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped));
149
+ if (dropped != 0)
150
+ {
151
+ printf("Dropped %u activity records\n", (unsigned int)dropped);
152
+ }
153
+ }
154
+
155
+ free(buffer);
156
+ }
157
+
158
+ // void initTrace() {
159
+ // return;
160
+ // }
161
+
162
+ // void finiTrace() {
163
+ // return;
164
+ // }
165
+
166
+
167
+ void initTrace()
168
+ {
169
+ size_t attrValue = 0, attrValueSize = sizeof(size_t);
170
+
171
+ CUpti_ActivityUnifiedMemoryCounterConfig config[2];
172
+
173
+ // configure unified memory counters
174
+ config[0].scope = CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_PROCESS_SINGLE_DEVICE;
175
+ config[0].kind = CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD;
176
+ config[0].deviceId = 0;
177
+ config[0].enable = 1;
178
+
179
+ config[1].scope = CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_PROCESS_SINGLE_DEVICE;
180
+ config[1].kind = CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH;
181
+ config[1].deviceId = 0;
182
+ config[1].enable = 1;
183
+
184
+ CUptiResult res = cuptiActivityConfigureUnifiedMemoryCounter(config, 2);
185
+ if (res == CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED)
186
+ {
187
+ printf("Test is waived, unified memory is not supported on the underlying platform.\n");
188
+ }
189
+ else if (res == CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED_ON_DEVICE)
190
+ {
191
+ printf("Test is waived, unified memory is not supported on the device.\n");
192
+ }
193
+ else if (res == CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED_ON_NON_P2P_DEVICES)
194
+ {
195
+ printf("Test is waived, unified memory is not supported on the non-P2P multi-gpu setup.\n");
196
+ }
197
+ else
198
+ {
199
+ CUPTI_CALL(res);
200
+ }
201
+
202
+ CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL));
203
+ CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME));
204
+ CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY));
205
+ CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER));
206
+ // CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_COUNT));
207
+
208
+ // CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT
209
+ // CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT
210
+
211
+ // Register callbacks for buffer requests and for buffers completed by CUPTI.
212
+ CUPTI_CALL(cuptiActivityRegisterCallbacks(bufferRequested, bufferCompleted));
213
+
214
+ // Optionally get and set activity attributes.
215
+ // Attributes can be set by the CUPTI client to change behavior of the activity API.
216
+ // Some attributes require to be set before any CUDA context is created to be effective,
217
+ // e.g. to be applied to all device buffer allocations (see documentation).
218
+ CUPTI_CALL(cuptiActivityGetAttribute(CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE, &attrValueSize, &attrValue));
219
+ printf("%s = %llu B\n", "CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE", (long long unsigned)attrValue);
220
+ attrValue *= 2;
221
+ CUPTI_CALL(cuptiActivitySetAttribute(CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE, &attrValueSize, &attrValue));
222
+
223
+ CUPTI_CALL(cuptiActivityGetAttribute(CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT, &attrValueSize, &attrValue));
224
+ printf("%s = %llu\n", "CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT", (long long unsigned)attrValue);
225
+ attrValue *= 2;
226
+ CUPTI_CALL(cuptiActivitySetAttribute(CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT, &attrValueSize, &attrValue));
227
+
228
+ CUPTI_CALL(cuptiGetTimestamp(&startTimestamp));
229
+ }
230
+
231
+ void finiTrace()
232
+ {
233
+ // Force flush any remaining activity buffers before termination of the application
234
+ CUPTI_CALL(cuptiActivityFlushAll(1));
235
+ }
236
+
237
+ void GPU_argv_init() {
238
+ cudaDeviceProp deviceProp;
239
+ cudaGetDeviceProperties(&deviceProp, GPU_DEVICE);
240
+ printf("setting device %d with name %s\n", GPU_DEVICE, deviceProp.name);
241
+ cudaSetDevice(GPU_DEVICE);
242
+ }
workloads/common/cupti_add.h ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <cupti.h>
2
+ #include <stdio.h>
3
+ #include <cxxabi.h>
4
+ #include <time.h>
5
+
6
+ #define GPU_DEVICE 7
7
+
8
+ #define BUF_SIZE (32 * 1024)
9
+ #define ALIGN_SIZE (8)
10
+ #define ALIGN_BUFFER(buffer, align) \
11
+ (((uintptr_t) (buffer) & ((align)-1)) ? ((buffer) + (align) - ((uintptr_t) (buffer) & ((align)-1))) : (buffer))
12
+
13
+ static uint64_t startTimestamp;
14
+ // Timestamp at trace initialization time. Used to normalized other
15
+ // timestamps
16
+
17
+ #define CUPTI_CALL(call) \
18
+ do { \
19
+ CUptiResult _status = call; \
20
+ if (_status != CUPTI_SUCCESS) { \
21
+ const char *errstr; \
22
+ cuptiGetResultString(_status, &errstr); \
23
+ fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \
24
+ __FILE__, __LINE__, #call, errstr); \
25
+ if(_status == CUPTI_ERROR_LEGACY_PROFILER_NOT_SUPPORTED) \
26
+ exit(0); \
27
+ else \
28
+ exit(-1); \
29
+ } \
30
+ } while (0)
31
+
32
+
33
+ extern inline __attribute__((always_inline)) unsigned long rdtsc() {
34
+ unsigned long a, d;
35
+
36
+ __asm__ volatile("rdtsc" : "=a"(a), "=d"(d));
37
+
38
+ return (a | (d << 32));
39
+ }
40
+
41
+ extern inline __attribute__((always_inline)) unsigned long rdtsp() {
42
+ struct timespec tms;
43
+ if (clock_gettime(CLOCK_REALTIME, &tms)) {
44
+ return -1;
45
+ }
46
+ unsigned long ns = tms.tv_sec * 1000000000;
47
+ ns += tms.tv_nsec;
48
+ return ns;
49
+ }
50
+
51
+ void initTrace();
52
+ void finiTrace();
53
+ void GPU_argv_init();
54
+ void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size, size_t *maxNumRecords);
55
+ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, size_t size, size_t validSize);
56
+ static void printActivity(CUpti_Activity *record);
workloads/common/cupti_add.o ADDED
Binary file (13.8 kB). View file
 
workloads/common/make.config ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ CUDA_DIR = /apps/cuda-11.4
2
+
3
+ COMPUTE = compute_80 #61
4
+ SM_ARCH = sm_80 #61
5
+
6
+ CUDA_LIB_DIR := $(CUDA_DIR)/lib64
7
+ CUPTI_LIB_DIR := $(CUDA_DIR)/extras/CUPTI/lib64/
8
+ CUPTI_INCLUDE := $(CUDA_DIR)/extras/CUPTI/include/
9
+
10
+ CUPTI_ADD_COMMON = $(UVMAsyncBench_BASE)/workloads/common/
workloads/micro/async/2DCONV/2DConvolution.cu ADDED
@@ -0,0 +1,392 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * 2DConvolution.cu: This file is part of the PolyBench/GPU 1.0 test suite.
3
+ *
4
+ *
5
+ * Contact: Scott Grauer-Gray <sgrauerg@gmail.com>
6
+ * Louis-Noel Pouchet <pouchet@cse.ohio-state.edu>
7
+ * Web address: http://www.cse.ohio-state.edu/~pouchet/software/polybench/GPU
8
+ */
9
+
10
+ #include <unistd.h>
11
+ #include <stdio.h>
12
+ #include <time.h>
13
+ #include <sys/time.h>
14
+ #include <stdlib.h>
15
+ #include <stdarg.h>
16
+ #include <string.h>
17
+ #include <cuda.h>
18
+
19
+ #include "../../../common/cupti_add.h"
20
+ #include "../../../common/cpu_timestamps.h"
21
+
22
+ #include <cooperative_groups.h>
23
+ #include <cooperative_groups/memcpy_async.h>
24
+
25
+ using namespace nvcuda::experimental;
26
+
27
+ #define PREFETCH_COUNT 2
28
+
29
+ #define SMALL_FLOAT_VAL 0.00000001f
30
+
// Return the current wall-clock time in seconds (microsecond resolution).
double rtclock()
{
    struct timezone Tzp;
    struct timeval Tp;
    // FIX: gettimeofday() returns int; the original stored it in a
    // uint64_t and printed it with %d (format/type mismatch, UB on
    // platforms where they differ in size).
    int stat = gettimeofday(&Tp, &Tzp);
    if (stat != 0)
        printf("Error return from gettimeofday: %d", stat);
    return (Tp.tv_sec + Tp.tv_usec * 1.0e-6);
}
41
+
// Absolute value of a single-precision float.
float absVal(float a)
{
    return (a < 0) ? (a * -1) : a;
}
53
+
// Relative difference of val1 and val2 as a percentage of val1.
// Two near-zero values (|v| < 0.01) are treated as identical so the
// division cannot blow up.
float percentDiff(double val1, double val2)
{
    if ((absVal(val1) < 0.01) && (absVal(val2) < 0.01))
        return 0.0f;
    return 100.0f * (absVal(absVal(val1 - val2) / absVal(val1 + SMALL_FLOAT_VAL)));
}
66
+
67
+ //define the error threshold for the results "not matching"
68
+ #define PERCENT_DIFF_ERROR_THRESHOLD 0.05
69
+
70
+ /* Problem size */
71
+ #define SIZE 4096
72
+ #define NBLOCKS 32
73
+ #define BATCH_SIZE 4
74
+
75
+ uint64_t NI;
76
+ uint64_t NJ;
77
+ uint64_t nblocks;
78
+
79
+
80
+ /* Thread block dimensions */
81
+ #define KERNEL 3
82
+ #define DIM_THREAD_BLOCK 8
83
+
84
+ /* Can switch DATA_TYPE between float and double */
85
+ typedef float DATA_TYPE;
86
+
// CPU reference: apply the fixed 3x3 convolution stencil over the interior
// of the NI x NJ image A, writing results into B (borders untouched).
void conv2D(DATA_TYPE* A, DATA_TYPE* B)
{
    const DATA_TYPE c11 = +0.2, c21 = +0.5, c31 = -0.8;
    const DATA_TYPE c12 = -0.3, c22 = +0.6, c32 = -0.9;
    const DATA_TYPE c13 = +0.4, c23 = +0.7, c33 = +0.10;

    for (uint64_t i = 1; i < NI - 1; ++i)
    {
        for (uint64_t j = 1; j < NJ - 1; ++j)
        {
            B[i*NJ + j] = c11 * A[(i - 1)*NJ + (j - 1)] + c12 * A[(i + 0)*NJ + (j - 1)] + c13 * A[(i + 1)*NJ + (j - 1)]
                        + c21 * A[(i - 1)*NJ + (j + 0)] + c22 * A[(i + 0)*NJ + (j + 0)] + c23 * A[(i + 1)*NJ + (j + 0)]
                        + c31 * A[(i - 1)*NJ + (j + 1)] + c32 * A[(i + 0)*NJ + (j + 1)] + c33 * A[(i + 1)*NJ + (j + 1)];
        }
    }
}
106
+
107
+
// Fill the (host-visible) GPU input buffer with the deterministic pattern
// A[i][j] = i*j / NI.
void initGPU(DATA_TYPE* A_gpu)
{
    for (uint64_t row = 0; row < NI; ++row) {
        for (uint64_t col = 0; col < NJ; ++col) {
            A_gpu[row * NJ + col] = ((DATA_TYPE)row * col) / NI;
        }
    }
}
118
+
// Fill the host input buffer with the same deterministic pattern as
// initGPU: A[i][j] = i*j / NI.
void initCPU(DATA_TYPE* A)
{
    for (uint64_t row = 0; row < NI; ++row) {
        for (uint64_t col = 0; col < NJ; ++col) {
            A[row * NJ + col] = ((DATA_TYPE)row * col) / NI;
        }
    }
}
129
+
130
+
// Compare the CPU reference output B against the GPU result over the
// interior cells (the kernel does not write the border) and report how
// many cells differ by more than PERCENT_DIFF_ERROR_THRESHOLD percent.
void compareResults(DATA_TYPE* B, DATA_TYPE* B_outputFromGpu)
{
    uint64_t i, j, fail;
    fail = 0;

    for (i = 1; i < (NI - 1); i++)
    {
        for (j = 1; j < (NJ - 1); j++)
        {
            if (percentDiff(B[i*NJ + j], B_outputFromGpu[i*NJ + j]) > PERCENT_DIFF_ERROR_THRESHOLD)
            {
                // FIX: i, j, fail are uint64_t — printing them with %d is
                // undefined behavior; use %llu with explicit casts.
                printf("%llu, %llu, CPU is %f, GPU is %f.\n",
                       (unsigned long long)i, (unsigned long long)j,
                       B[i * NJ + j], B_outputFromGpu[i * NJ + j]);
                fail++;
            }
        }
    }

    printf("Non-Matching CPU-GPU Outputs Beyond Error Threshold of %4.2f Percent: %llu\n",
           PERCENT_DIFF_ERROR_THRESHOLD, (unsigned long long)fail);
}
153
+
// Tiled 2D 3x3 convolution with software pipelining: each thread block
// walks `tiles_this_block` tiles, prefetching the next tile's input into
// a PREFETCH_COUNT-deep shared-memory ring via the legacy
// nvcuda::experimental memcpy_async/pipeline API while computing the
// current one. Launched with (DIM_THREAD_BLOCK x DIM_THREAD_BLOCK)
// threads; each thread owns a BATCH_SIZE x BATCH_SIZE patch of the tile.
// `block_size` is the number of matrix columns assigned to one block
// (assumed to be a multiple of DIM_THREAD_BLOCK * BATCH_SIZE — TODO
// confirm for non-power-of-two problem sizes).
__global__ void Convolution2D_kernel(DATA_TYPE *A, DATA_TYPE *B, uint64_t NI, uint64_t NJ, uint64_t block_size)
{
    cooperative_groups::thread_block block = cooperative_groups::this_thread_block();
    pipeline pipe;  // per-thread async-copy pipeline (nvcuda::experimental)
    // Tiles per matrix dimension (each tile covers DIM_THREAD_BLOCK*BATCH_SIZE columns).
    uint64_t tile_dim_x = (NJ + DIM_THREAD_BLOCK - 1) / (DIM_THREAD_BLOCK * BATCH_SIZE);

    // Double-buffered input tile (with KERNEL-1 halo) and one output tile.
    __shared__ DATA_TYPE tmp_A[PREFETCH_COUNT][DIM_THREAD_BLOCK * BATCH_SIZE + KERNEL - 1][DIM_THREAD_BLOCK * BATCH_SIZE + KERNEL - 1];
    __shared__ DATA_TYPE tmp_B[DIM_THREAD_BLOCK * BATCH_SIZE][DIM_THREAD_BLOCK * BATCH_SIZE];

    uint64_t total_tiles = tile_dim_x * tile_dim_x;  // currently unused

    // How many tiles (per dimension and total) this block processes.
    uint64_t tiles_this_block_x = (block_size / (DIM_THREAD_BLOCK * BATCH_SIZE));
    uint64_t tiles_this_block = tiles_this_block_x * tiles_this_block_x;

    // 3x3 stencil coefficients. NOTE(review): the compute loop below reads
    // c[n][m] with m = row offset, n = column offset — verify this
    // orientation against the CPU reference conv2D's c11..c33 layout.
    DATA_TYPE c[KERNEL][KERNEL];

    c[0][0] = +0.2;
    c[1][0] = +0.5;
    c[2][0] = -0.8;
    c[0][1] = -0.3;
    c[1][1] = +0.6;
    c[2][1] = -0.9;
    c[0][2] = +0.4;
    c[1][2] = +0.7;
    c[2][2] = +0.10;

    // Contiguous tile range [base_tile, end_tile) owned by this block.
    uint64_t base_tile = (blockIdx.y * gridDim.x + blockIdx.x) * tiles_this_block;
    uint64_t fetch = base_tile;
    uint64_t end_tile = fetch + tiles_this_block;

    for (uint64_t compute = fetch; compute < end_tile; compute++)
    {
        // Stage: issue async copies for up to PREFETCH_COUNT tiles ahead
        // of the tile currently being computed.
        for (; fetch < end_tile && fetch < compute + PREFETCH_COUNT; fetch++)
        {
            // Map the linear tile index back to tile coordinates (bx, by).
            uint64_t offset = fetch - base_tile;
            uint64_t block_id = fetch / tiles_this_block;
            uint64_t bx = block_id % gridDim.x * tiles_this_block_x + offset % tiles_this_block_x;
            uint64_t by = block_id / gridDim.x * tiles_this_block_x + offset / tiles_this_block_x;

            uint64_t batch_size = DIM_THREAD_BLOCK * BATCH_SIZE;

            // thread id
            uint64_t tx = threadIdx.x;
            uint64_t ty = threadIdx.y;

            // Global coordinates of this thread's patch and the tile's
            // start/end rows and columns (used for the halo fetches).
            uint64_t index_A_y = DIM_THREAD_BLOCK * BATCH_SIZE * by + BATCH_SIZE * ty;
            uint64_t index_A_x = DIM_THREAD_BLOCK * BATCH_SIZE * bx + BATCH_SIZE * tx;

            uint64_t index_A_y_start = DIM_THREAD_BLOCK * BATCH_SIZE * by;
            uint64_t index_A_x_start = DIM_THREAD_BLOCK * BATCH_SIZE * bx;

            uint64_t index_A_y_bound = DIM_THREAD_BLOCK * BATCH_SIZE * by + BATCH_SIZE * DIM_THREAD_BLOCK;
            uint64_t index_A_x_bound = DIM_THREAD_BLOCK * BATCH_SIZE * bx + BATCH_SIZE * DIM_THREAD_BLOCK;

            // Fetch the core of the tile; bounds-checked against NI/NJ.
            for (uint64_t i = 0; i < BATCH_SIZE; i++)
            {
                for (uint64_t j = 0; j < BATCH_SIZE; j++)
                {
                    if ((index_A_y + i) < NI && (index_A_x + j) < NJ)
                    {
                        memcpy_async(tmp_A[fetch % PREFETCH_COUNT][ty * BATCH_SIZE + i][tx * BATCH_SIZE + j], A[(index_A_y + i) * NJ + index_A_x + j], pipe);
                        tmp_B[ty * BATCH_SIZE + i][tx * BATCH_SIZE + j] = 0;
                    }
                }
            }

            // Fetch the bottom halo rows (KERNEL-1 extra rows).
            for (uint64_t i = 0; i < KERNEL - 1; i++)
            {
                for (uint64_t j = 0; j < BATCH_SIZE * DIM_THREAD_BLOCK + KERNEL - 1; j++)
                {
                    if ((index_A_y_bound + i) < NI && (index_A_x_start + j) < NJ)
                    {
                        memcpy_async(tmp_A[fetch % PREFETCH_COUNT][DIM_THREAD_BLOCK * BATCH_SIZE + i][j], A[(index_A_y_bound + i) * NJ + index_A_x_start + j], pipe);
                    }
                }
            }

            // Fetch the right halo columns (KERNEL-1 extra columns).
            for (uint64_t i = 0; i < BATCH_SIZE * DIM_THREAD_BLOCK + KERNEL - 1; i++)
            {
                for (uint64_t j = 0; j < KERNEL - 1; j++)
                {
                    if ((index_A_y_start + i) < NI && (index_A_x_bound + j) < NJ)
                    {
                        memcpy_async(tmp_A[fetch % PREFETCH_COUNT][i][DIM_THREAD_BLOCK * BATCH_SIZE + j], A[(index_A_y_start + i) * NJ + index_A_x_bound + j], pipe);
                    }
                }
            }
            pipe.commit();  // one pipeline stage per prefetched tile
        }
        // All tiles issued: push empty stages so wait_prior below still
        // lines up with PREFETCH_COUNT outstanding commits.
        if (fetch == end_tile)
        {
            for (uint64_t i = 0; i < PREFETCH_COUNT - 1; ++i)
            {
                pipe.commit();
            }
            ++fetch;
        }
        // Wait until the oldest prefetch (the tile to compute) has landed.
        pipe.wait_prior<PREFETCH_COUNT - 1>();
        block.sync();

        // Recompute this tile's coordinates for the compute/store phase.
        uint64_t offset = compute - base_tile;
        uint64_t block_id = compute / tiles_this_block;
        uint64_t bx = block_id % gridDim.x * tiles_this_block_x + offset % tiles_this_block_x;
        uint64_t by = block_id / gridDim.x * tiles_this_block_x + offset / tiles_this_block_x;

        // thread id
        uint64_t tx = threadIdx.x;
        uint64_t ty = threadIdx.y;

        // Output coordinates are shifted by +1: the stencil is centered, so
        // the first writable row/column is index 1.
        uint64_t index_B_y = DIM_THREAD_BLOCK * BATCH_SIZE * by + BATCH_SIZE * ty + 1;
        uint64_t index_B_x = DIM_THREAD_BLOCK * BATCH_SIZE * bx + BATCH_SIZE * tx + 1;

        // Clear this thread's accumulator patch before accumulating.
        for (uint64_t i = 0; i < BATCH_SIZE; i++)
        {
            for (uint64_t j = 0; j < BATCH_SIZE; j++)
            {
                tmp_B[ty * BATCH_SIZE + i][tx * BATCH_SIZE + j] = 0;
            }
        }
        block.sync();

        // Apply the KERNEL x KERNEL stencil from the prefetched tile buffer.
        for (uint64_t i = 0; i < BATCH_SIZE; i++)
        {
            for (uint64_t j = 0; j < BATCH_SIZE; j++)
            {
                for (uint64_t m = 0; m < KERNEL; m++)
                {
                    for (uint64_t n = 0; n < KERNEL; n++)
                    {
                        tmp_B[ty * BATCH_SIZE + i][tx * BATCH_SIZE + j] += tmp_A[compute % PREFETCH_COUNT][ty * BATCH_SIZE + i + m][tx * BATCH_SIZE + j + n] * c[n][m];
                    }
                }
            }
        }
        block.sync();

        // Write the tile's results back to global memory (bounds-checked).
        for (uint64_t i = 0; i < BATCH_SIZE; i++)
        {
            for (uint64_t j = 0; j < BATCH_SIZE; j++)
            {
                if ((index_B_y + i) < NI && (index_B_x + j) < NJ)
                {
                    B[(index_B_y + i) * NJ + index_B_x + j] = tmp_B[ty * BATCH_SIZE + i][tx * BATCH_SIZE + j];
                }
            }
        }
        block.sync();  // tmp_B is reused by the next iteration
    }
}
316
+
// Host wrapper: copy A to the device, launch the pipelined convolution
// kernel on an nblocks x nblocks grid, wait for completion, and copy the
// result back into B. A_gpu/B_gpu are preallocated device buffers of
// NI*NJ elements.
// FIX: removed unused locals (t_start, t_end, output_width, output_height).
void convolution2DCuda(DATA_TYPE *A, DATA_TYPE *B, DATA_TYPE *A_gpu, DATA_TYPE *B_gpu)
{
    dim3 block(DIM_THREAD_BLOCK, DIM_THREAD_BLOCK);
    dim3 grid(nblocks, nblocks);

    // Columns of the matrix each block is responsible for (ceiling divide).
    uint64_t block_size = (NJ + (nblocks - 1)) / nblocks;

    cudaMemcpy(A_gpu, A, sizeof(DATA_TYPE) * NI * NJ, cudaMemcpyHostToDevice);
    Convolution2D_kernel<<<grid, block>>>(A_gpu, B_gpu, NI, NJ, block_size);
    cudaDeviceSynchronize();
    cudaMemcpy(B, B_gpu, sizeof(DATA_TYPE) * NI * NJ, cudaMemcpyDeviceToHost);
}
339
+
// Entry point. Usage: ./2DConvolution [NI NJ nblocks]; defaults to
// SIZE x SIZE with NBLOCKS. Runs the GPU convolution inside a
// CUPTI-traced, CPU-timed window.
// FIX: B_ref was allocated but never freed (leak); removed unused
// t_start/t_end locals.
int main(int argc, char *argv[])
{
    uint64_t start_tsc = rdtsc();
    uint64_t start_tsp = rdtsp();
    printf("start_tsc %lu start_tsp %lu\n", start_tsc, start_tsp);

    if (argc >= 4) {
        NI = atoll(argv[1]);
        NJ = atoll(argv[2]);
        nblocks = atoi(argv[3]);
    } else {
        NI = SIZE;
        NJ = SIZE;
        nblocks = NBLOCKS;
    }

    DATA_TYPE *A = (DATA_TYPE *)malloc(NI * NJ * sizeof(DATA_TYPE));
    DATA_TYPE *B = (DATA_TYPE *)malloc(NI * NJ * sizeof(DATA_TYPE));
    DATA_TYPE *B_ref = (DATA_TYPE *)malloc(NI * NJ * sizeof(DATA_TYPE));
    DATA_TYPE *A_gpu;
    DATA_TYPE *B_gpu;

    initCPU(A);
    GPU_argv_init();

    // Measure (and trace) only the device alloc + transfer + kernel window.
    initTrace();
    startCPU();

    cudaMalloc(&A_gpu, sizeof(DATA_TYPE) * NI * NJ);
    cudaMalloc(&B_gpu, sizeof(DATA_TYPE) * NI * NJ);

    convolution2DCuda(A, B, A_gpu, B_gpu);

    cudaFree(A_gpu);
    cudaFree(B_gpu);
    endCPU();
    finiTrace();

    // Optional CPU reference + verification, disabled as in the original:
    // conv2D(A, B_ref);
    // compareResults(B, B_ref);

    free(A);
    free(B);
    free(B_ref);  // was leaked in the original

    return 0;
}
workloads/micro/async/2DCONV/Makefile ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
include ../../../common/make.config

# nvcc plus the CUPTI include/library paths; the tracing helpers are
# compiled directly into the benchmark binary.
NVCC = $(CUDA_DIR)/bin/nvcc
NVCCCFLAGS = -I$(CUPTI_INCLUDE) -L$(CUPTI_LIB_DIR) -std=c++11 -lcuda -lcupti -arch=sm_80 -O3

EXECUTABLE := 2DConvolution
CUFILES := 2DConvolution.cu $(CUPTI_ADD_COMMON)/cupti_add.cpp $(CUPTI_ADD_COMMON)/cpu_timestamps.cpp

all:
	$(NVCC) ${NVCCCFLAGS} ${CUFILES} ${DEF} -o ${EXECUTABLE}
clean:
	rm -f *.o 2DConvolution
workloads/micro/async/2DCONV/run.sh ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # ./2DConvolution 16384 16384 32
2
+ ./2DConvolution 32768 32768 32
workloads/micro/async/2DCONV/run_large.sh ADDED
@@ -0,0 +1 @@
 
 
1
+ ./2DConvolution 8192 8192 32
workloads/micro/async/2DCONV/run_medium.sh ADDED
@@ -0,0 +1 @@
 
 
1
+ ./2DConvolution 4096 4096 32
workloads/micro/async/2DCONV/run_mega.sh ADDED
@@ -0,0 +1 @@
 
 
1
+ ./2DConvolution 65536 65536 32
workloads/micro/async/2DCONV/run_small.sh ADDED
@@ -0,0 +1 @@
 
 
1
+ ./2DConvolution 1024 1024 8
workloads/micro/async/2DCONV/run_super.sh ADDED
@@ -0,0 +1 @@
 
 
1
+ ./2DConvolution 32768 32768 32
workloads/micro/async/2DCONV/run_tiny.sh ADDED
@@ -0,0 +1 @@
 
 
1
+ ./2DConvolution 512 512 4
workloads/micro/async/3DCONV/3DConvolution.cu ADDED
@@ -0,0 +1,449 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * 3DConvolution.cu: This file is part of the PolyBench/GPU 1.0 test suite.
3
+ *
4
+ *
5
+ * Contact: Scott Grauer-Gray <sgrauerg@gmail.com>
6
+ * Louis-Noel Pouchet <pouchet@cse.ohio-state.edu>
7
+ * Web address: http://www.cse.ohio-state.edu/~pouchet/software/polybench/GPU
8
+ */
9
+
10
+ #include <unistd.h>
11
+ #include <stdio.h>
12
+ #include <time.h>
13
+ #include <sys/time.h>
14
+ #include <stdlib.h>
15
+ #include <stdarg.h>
16
+ #include <string.h>
17
+ #include <cuda.h>
18
+
19
+ #include "../../../common/cupti_add.h"
20
+ #include "../../../common/cpu_timestamps.h"
21
+
22
+ #include <cooperative_groups.h>
23
+ #include <cooperative_groups/memcpy_async.h>
24
+
25
+ using namespace nvcuda::experimental;
26
+
27
+ #define PREFETCH_COUNT 2
28
+
29
+ #define SMALL_FLOAT_VAL 0.00000001f
30
+
31
/**
 * Wall-clock time in seconds (microsecond resolution), for coarse
 * host-side interval timing (end - start).
 */
double rtclock()
{
    struct timezone Tzp;
    struct timeval Tp;
    // gettimeofday returns an int status (0 on success). The original stored
    // it in a uint64_t and printed it with %d -- a format-specifier mismatch.
    int stat = gettimeofday(&Tp, &Tzp);
    if (stat != 0)
        printf("Error return from gettimeofday: %d", stat);
    return (Tp.tv_sec + Tp.tv_usec * 1.0e-6);
}
41
+
42
/* Absolute value of a float (ternary form of the original if/else). */
float absVal(float a)
{
    return (a < 0) ? -a : a;
}
53
+
54
/*
 * Relative difference between val1 and val2, as a percentage.
 * Pairs whose magnitudes are both below 0.01 are treated as equal (0%).
 * SMALL_FLOAT_VAL in the denominator guards against division by zero.
 */
float percentDiff(double val1, double val2)
{
    if (absVal(val1) >= 0.01 || absVal(val2) >= 0.01)
    {
        return 100.0f * (absVal(absVal(val1 - val2) / absVal(val1 + SMALL_FLOAT_VAL)));
    }
    return 0.0f;
}
66
+
67
+ // define the error threshold for the results "not matching"
68
+ #define PERCENT_DIFF_ERROR_THRESHOLD 0.05
69
+
70
+ /* Problem size */
71
+ #define SIZE 4096
72
+ #define NBLOCKS 2
73
+ #define BATCH_SIZE 3
74
+
75
+ uint64_t NI;
76
+ uint64_t NJ;
77
+ uint64_t NK;
78
+ uint64_t nblocks;
79
+
80
+ /* Thread block dimensions */
81
+ #define DIM_THREAD_BLOCK 4
82
+
83
+ #define KERNEL 3
84
+
85
+ /* Can switch DATA_TYPE between float and double */
86
+ typedef float DATA_TYPE;
87
+
88
+
89
+
90
/*
 * CPU reference 3D convolution: applies a 3x3x3 stencil with the constant
 * PolyBench coefficients c11..c33 to A (NI x NJ x NK, row-major with NK the
 * fastest axis) and writes interior points of B. Boundary cells (index 0 and
 * N-1 on every axis) are never written.
 *
 * NOTE(review): the first three coefficient pairs (c11/c13, c21/c23, c31/c33)
 * all read the same (j - 1, k - 1) offset -- this matches the PolyBench/GPU
 * original but does not index symmetrically; confirm against the GPU kernel
 * before using this as ground truth (it is currently disabled in main).
 */
void conv3D(DATA_TYPE* A, DATA_TYPE* B)
{
    uint64_t i, j, k;
    DATA_TYPE c11, c12, c13, c21, c22, c23, c31, c32, c33;

    c11 = +2; c21 = +5; c31 = -8;
    c12 = -3; c22 = +6; c32 = -9;
    c13 = +4; c23 = +7; c33 = +10;

    for (i = 1; i < NI - 1; ++i) // 0
    {
        for (j = 1; j < NJ - 1; ++j) // 1
        {
            for (k = 1; k < NK -1; ++k) // 2
            {
                B[i*(NK * NJ) + j*NK + k] = c11 * A[(i - 1)*(NK * NJ) + (j - 1)*NK + (k - 1)] + c13 * A[(i + 1)*(NK * NJ) + (j - 1)*NK + (k - 1)]
                    + c21 * A[(i - 1)*(NK * NJ) + (j - 1)*NK + (k - 1)] + c23 * A[(i + 1)*(NK * NJ) + (j - 1)*NK + (k - 1)]
                    + c31 * A[(i - 1)*(NK * NJ) + (j - 1)*NK + (k - 1)] + c33 * A[(i + 1)*(NK * NJ) + (j - 1)*NK + (k - 1)]
                    + c12 * A[(i + 0)*(NK * NJ) + (j - 1)*NK + (k + 0)] + c22 * A[(i + 0)*(NK * NJ) + (j + 0)*NK + (k + 0)]
                    + c32 * A[(i + 0)*(NK * NJ) + (j + 1)*NK + (k + 0)] + c11 * A[(i - 1)*(NK * NJ) + (j - 1)*NK + (k + 1)]
                    + c13 * A[(i + 1)*(NK * NJ) + (j - 1)*NK + (k + 1)] + c21 * A[(i - 1)*(NK * NJ) + (j + 0)*NK + (k + 1)]
                    + c23 * A[(i + 1)*(NK * NJ) + (j + 0)*NK + (k + 1)] + c31 * A[(i - 1)*(NK * NJ) + (j + 1)*NK + (k + 1)]
                    + c33 * A[(i + 1)*(NK * NJ) + (j + 1)*NK + (k + 1)];
            }
        }
    }
}
117
+
118
/*
 * Fill the NI x NJ x NK volume at A_gpu with the deterministic pattern
 * i%12 + 2*(j%7) + 3*(k%13) -- the same generator as initCPU, so a buffer
 * initialized on either side holds identical contents.
 */
void initGPU(DATA_TYPE *A_gpu)
{
    for (uint64_t z = 0; z < NI; ++z)
    {
        uint64_t plane = z * (NK * NJ);          // offset of z-th NJ x NK plane
        for (uint64_t y = 0; y < NJ; ++y)
        {
            uint64_t row = plane + y * NK;       // offset of y-th row in plane
            for (uint64_t x = 0; x < NK; ++x)
            {
                A_gpu[row + x] = z % 12 + 2 * (y % 7) + 3 * (x % 13);
            }
        }
    }
}
133
+
134
/*
 * Fill host buffer A (NI x NJ x NK) with the deterministic pattern
 * i%12 + 2*(j%7) + 3*(k%13); identical generator to initGPU.
 */
void initCPU(DATA_TYPE *A)
{
    for (uint64_t z = 0; z < NI; ++z)
    {
        uint64_t plane = z * (NK * NJ);          // offset of z-th NJ x NK plane
        for (uint64_t y = 0; y < NJ; ++y)
        {
            uint64_t row = plane + y * NK;       // offset of y-th row in plane
            for (uint64_t x = 0; x < NK; ++x)
            {
                A[row + x] = z % 12 + 2 * (y % 7) + 3 * (x % 13);
            }
        }
    }
}
149
+
150
+
151
/*
 * Compare the CPU (B) and GPU (B_outputFromGpu) volumes over interior points
 * only (boundaries are never written by either path) and report every cell
 * whose relative difference exceeds PERCENT_DIFF_ERROR_THRESHOLD.
 */
void compareResults(DATA_TYPE* B, DATA_TYPE* B_outputFromGpu)
{
    uint64_t i, j, k, fail;
    fail = 0;

    // Compare result from cpu and gpu...
    for (i = 1; i < NI - 1; ++i) // 0
    {
        for (j = 1; j < NJ - 1; ++j) // 1
        {
            for (k = 1; k < NK - 1; ++k) // 2
            {
                if (percentDiff(B[i*(NK * NJ) + j*NK + k], B_outputFromGpu[i*(NK * NJ) + j*NK + k]) > PERCENT_DIFF_ERROR_THRESHOLD)
                {
                    // %lu matches the uint64_t indices; the original used %d,
                    // a format mismatch that garbles output on LP64 targets.
                    printf("%lu, %lu, %lu, CPU is %f, GPU is %f.\n", i, j, k, B[i * (NK * NJ) + j * NK + k], B_outputFromGpu[i * (NK * NJ) + j * NK + k]);
                    fail++;
                }
            }
        }
    }

    // Print results (fail is uint64_t, so %lu, not %d)
    printf("Non-Matching CPU-GPU Outputs Beyond Error Threshold of %4.2f Percent: %lu\n", PERCENT_DIFF_ERROR_THRESHOLD, fail);
}
175
+
176
/*
 * Tiled 3D convolution kernel using the SM80 async-copy pipeline
 * (nvcuda::experimental::pipeline / memcpy_async) to double-buffer input
 * tiles in shared memory (PREFETCH_COUNT = 2 stages).
 *
 * Expected launch: nblocks^3 grid of DIM_THREAD_BLOCK^3 thread blocks (see
 * convolution3DCuda). Each block walks tiles_this_block tiles derived from
 * block_size (the per-block slab of each axis), and each thread handles a
 * BATCH_SIZE^3 sub-cube per tile.
 *
 * Shared memory: tmp_A is a double-buffered input tile padded with a
 * (KERNEL - 1)-cell halo on every axis; tmp_B stages the per-tile output.
 *
 * NOTE(review): tile_dim_x is computed but never used. The guard on the
 * store uses (index_B_* + i + 1) < N while writing index_B_* + i -- presumably
 * to skip the last halo row; confirm intended boundary handling.
 */
__global__ void convolution3D_kernel(DATA_TYPE *A, DATA_TYPE *B, uint64_t NI, uint64_t NJ, uint64_t NK, uint64_t block_size)
{
    cooperative_groups::thread_block block = cooperative_groups::this_thread_block();
    pipeline pipe;
    // 3x3x3 stencil coefficients (PolyBench constants).
    DATA_TYPE c11, c12, c13, c21, c22, c23, c31, c32, c33;

    c11 = +2;
    c21 = +5;
    c31 = -8;
    c12 = -3;
    c22 = +6;
    c32 = -9;
    c13 = +4;
    c23 = +7;
    c33 = +10;

    uint64_t tile_dim_x = (NJ + DIM_THREAD_BLOCK - 1) / (DIM_THREAD_BLOCK * BATCH_SIZE);

    // Double-buffered, halo-padded input tile and per-tile output staging.
    __shared__ DATA_TYPE tmp_A[PREFETCH_COUNT][DIM_THREAD_BLOCK * BATCH_SIZE + KERNEL - 1][DIM_THREAD_BLOCK * BATCH_SIZE + KERNEL - 1][DIM_THREAD_BLOCK * BATCH_SIZE + KERNEL - 1];
    __shared__ DATA_TYPE tmp_B[DIM_THREAD_BLOCK * BATCH_SIZE][DIM_THREAD_BLOCK * BATCH_SIZE][DIM_THREAD_BLOCK * BATCH_SIZE];

    // uint64_t total_tiles = tile_dim_x * tile_dim_x * tile_dim_x;

    // Number of tiles along one axis that this block is responsible for,
    // and the flat range [base_tile, end_tile) of tiles it will process.
    uint64_t tiles_this_block_x = (block_size / (DIM_THREAD_BLOCK * BATCH_SIZE));
    uint64_t tiles_this_block = tiles_this_block_x * tiles_this_block_x * tiles_this_block_x;

    uint64_t base_tile = (blockIdx.z * gridDim.y * gridDim.x + blockIdx.y * gridDim.x + blockIdx.x) * tiles_this_block;
    uint64_t fetch = base_tile;
    uint64_t end_tile = fetch + tiles_this_block;

    // printf("block_size is %d, tile_dim_x is %d, tiles_this_block_x is %d.\n", block_size, tile_dim_x, tiles_this_block_x);

    for (uint64_t compute = fetch; compute < end_tile; compute++)
    {
        // Keep up to PREFETCH_COUNT tiles in flight ahead of the compute tile.
        for (; fetch < end_tile && fetch < compute + PREFETCH_COUNT; fetch++)
        {
            // block id: recover 3-D tile coordinates (bz, by, bx) from the
            // flat fetch index and this block's position in the grid.
            uint64_t offset = fetch - base_tile;
            uint64_t block_id = fetch / tiles_this_block;

            uint64_t bz = block_id / (gridDim.y * gridDim.x) * tiles_this_block_x + offset / (tiles_this_block_x * tiles_this_block_x);
            uint64_t by = block_id % (gridDim.y * gridDim.x) / gridDim.x * tiles_this_block_x + offset % (tiles_this_block_x * tiles_this_block_x) / tiles_this_block_x;
            uint64_t bx = block_id % (gridDim.y * gridDim.x) % gridDim.x * tiles_this_block_x + offset % (tiles_this_block_x * tiles_this_block_x) % tiles_this_block_x;

            // thread id
            uint64_t tx = threadIdx.x;
            uint64_t ty = threadIdx.y;
            uint64_t tz = threadIdx.z;

            // Per-thread start of its BATCH_SIZE^3 sub-cube in A.
            uint64_t index_A_z = DIM_THREAD_BLOCK * BATCH_SIZE * bz + BATCH_SIZE * tz;
            uint64_t index_A_y = DIM_THREAD_BLOCK * BATCH_SIZE * by + BATCH_SIZE * ty;
            uint64_t index_A_x = DIM_THREAD_BLOCK * BATCH_SIZE * bx + BATCH_SIZE * tx;

            // Tile origin and first-past-the-tile bound, used by the halo loops.
            uint64_t index_A_z_start = DIM_THREAD_BLOCK * BATCH_SIZE * bz;
            uint64_t index_A_y_start = DIM_THREAD_BLOCK * BATCH_SIZE * by;
            uint64_t index_A_x_start = DIM_THREAD_BLOCK * BATCH_SIZE * bx;

            uint64_t index_A_z_bound = DIM_THREAD_BLOCK * BATCH_SIZE * bz + BATCH_SIZE * DIM_THREAD_BLOCK;
            uint64_t index_A_y_bound = DIM_THREAD_BLOCK * BATCH_SIZE * by + BATCH_SIZE * DIM_THREAD_BLOCK;
            uint64_t index_A_x_bound = DIM_THREAD_BLOCK * BATCH_SIZE * bx + BATCH_SIZE * DIM_THREAD_BLOCK;

            // fetch A: async-copy this thread's BATCH_SIZE^3 interior sub-cube.
            for (uint64_t i = 0; i < BATCH_SIZE; i++)
            {
                for (uint64_t j = 0; j < BATCH_SIZE; j++)
                {
                    for (uint64_t k = 0; k < BATCH_SIZE; k++)
                    {
                        if ((index_A_z + i) < NI && (index_A_y + j) < NJ && (index_A_x + k) < NK)
                        {
                            memcpy_async(tmp_A[fetch % PREFETCH_COUNT][tz * BATCH_SIZE + i][ty * BATCH_SIZE + j][tx * BATCH_SIZE + k], A[(index_A_z + i) * NJ * NK + (index_A_y + j) * NK + index_A_x + k], pipe);
                        }
                    }
                }
            }

            // fetch A -- padding: halo slabs past the tile's z extent.
            for (uint64_t i = 0; i < KERNEL - 1; i++)
            {
                for (uint64_t j = 0; j < BATCH_SIZE * DIM_THREAD_BLOCK + KERNEL - 1; j++)
                {
                    for (uint64_t k = 0; k < BATCH_SIZE * DIM_THREAD_BLOCK + KERNEL - 1; k++)
                    {
                        if ((index_A_z_bound + i) < NI && (index_A_y_start + j) < NJ && (index_A_x_start + k) < NK)
                        {
                            memcpy_async(tmp_A[fetch % PREFETCH_COUNT][DIM_THREAD_BLOCK * BATCH_SIZE + i][j][k], A[(index_A_z_bound + i) * NJ * NK + (index_A_y_start + j) * NK + index_A_x_start + k], pipe);
                        }
                    }
                }
            }

            // fetch A -- padding: halo slabs past the tile's y extent.
            for (uint64_t i = 0; i < BATCH_SIZE * DIM_THREAD_BLOCK + KERNEL - 1; i++)
            {
                for (uint64_t j = 0; j < KERNEL - 1; j++)
                {
                    for (uint64_t k = 0; k < BATCH_SIZE * DIM_THREAD_BLOCK + KERNEL - 1; k++)
                    {
                        if ((index_A_z_start + i) < NI && (index_A_y_bound + j) < NJ && (index_A_x_start + k) < NK)
                        {
                            memcpy_async(tmp_A[fetch % PREFETCH_COUNT][i][DIM_THREAD_BLOCK * BATCH_SIZE + j][k], A[(index_A_z_start + i) * NJ * NK + (index_A_y_bound + j) * NK + index_A_x_start + k], pipe);
                        }
                    }
                }
            }

            // fetch A -- padding: halo slabs past the tile's x extent.
            for (uint64_t i = 0; i < BATCH_SIZE * DIM_THREAD_BLOCK + KERNEL - 1; i++)
            {
                for (uint64_t j = 0; j < BATCH_SIZE * DIM_THREAD_BLOCK + KERNEL - 1; j++)
                {
                    for (uint64_t k = 0; k < KERNEL - 1; k++)
                    {
                        if ((index_A_z_start + i) < NI && (index_A_y_start + j) < NJ && (index_A_x_bound + k) < NK)
                        {
                            memcpy_async(tmp_A[fetch % PREFETCH_COUNT][i][j][DIM_THREAD_BLOCK * BATCH_SIZE + k], A[(index_A_z_start + i) * NJ * NK + (index_A_y_start + j) * NK + index_A_x_bound + k], pipe);
                        }
                    }
                }
            }
            pipe.commit();  // close this tile's pipeline stage
        }
        // Drain: pad with empty commits so wait_prior<> below stays valid
        // once there is nothing left to prefetch.
        if (fetch == end_tile)
        {
            for (uint64_t i = 0; i < PREFETCH_COUNT - 1; ++i)
            {
                pipe.commit();
            }
            ++fetch;
        }
        pipe.wait_prior<PREFETCH_COUNT - 1>();  // wait for the compute tile's copy
        block.sync();

        // block id: recompute (bz, by, bx) for the tile being computed.
        uint64_t offset = compute - base_tile;
        uint64_t block_id = compute / tiles_this_block;

        uint64_t bz = block_id / (gridDim.y * gridDim.x) * tiles_this_block_x + offset / (tiles_this_block_x * tiles_this_block_x);
        uint64_t by = block_id % (gridDim.y * gridDim.x) / gridDim.x * tiles_this_block_x + offset % (tiles_this_block_x * tiles_this_block_x) / tiles_this_block_x;
        uint64_t bx = block_id % (gridDim.y * gridDim.x) % gridDim.x * tiles_this_block_x + offset % (tiles_this_block_x * tiles_this_block_x) % tiles_this_block_x;

        // thread id
        uint64_t tx = threadIdx.x;
        uint64_t ty = threadIdx.y;
        uint64_t tz = threadIdx.z;

        // +1: output indices are shifted one cell in from the tile origin.
        uint64_t index_B_z = DIM_THREAD_BLOCK * BATCH_SIZE * bz + BATCH_SIZE * tz + 1;
        uint64_t index_B_y = DIM_THREAD_BLOCK * BATCH_SIZE * by + BATCH_SIZE * ty + 1;
        uint64_t index_B_x = DIM_THREAD_BLOCK * BATCH_SIZE * bx + BATCH_SIZE * tx + 1;


        // Computation: zero this thread's slice of the output staging buffer.
        for (uint64_t i = 0; i < BATCH_SIZE; i++)
        {
            for (uint64_t j = 0; j < BATCH_SIZE; j++)
            {
                for (uint64_t k = 0; k < BATCH_SIZE; k++)
                {
                    tmp_B[tz * BATCH_SIZE + i][ty * BATCH_SIZE + j][tx * BATCH_SIZE + k] = 0;
                }
            }
        }
        block.sync();

        // Apply the 3x3x3 stencil entirely out of shared memory.
        for (uint64_t i = 0; i < BATCH_SIZE; i++)
        {
            for (uint64_t j = 0; j < BATCH_SIZE; j++)
            {
                for (uint64_t k = 0; k < BATCH_SIZE; k++)
                {
                    tmp_B[tz * BATCH_SIZE + i][ty * BATCH_SIZE + j][tx * BATCH_SIZE + k] =
                        c11 * tmp_A[compute % PREFETCH_COUNT][tz * BATCH_SIZE + i][ty * BATCH_SIZE + j][tx * BATCH_SIZE + k] + c13 * tmp_A[compute % PREFETCH_COUNT][tz * BATCH_SIZE + i + 2][ty * BATCH_SIZE + j][tx * BATCH_SIZE + k] + c21 * tmp_A[compute % PREFETCH_COUNT][tz * BATCH_SIZE + i][ty * BATCH_SIZE + j][tx * BATCH_SIZE + k] + c23 * tmp_A[compute % PREFETCH_COUNT][tz * BATCH_SIZE + i + 2][ty * BATCH_SIZE + j][tx * BATCH_SIZE + k] + c31 * tmp_A[compute % PREFETCH_COUNT][tz * BATCH_SIZE + i][ty * BATCH_SIZE + j][tx * BATCH_SIZE + k] + c33 * tmp_A[compute % PREFETCH_COUNT][tz * BATCH_SIZE + i + 2][ty * BATCH_SIZE + j][tx * BATCH_SIZE + k] + c12 * tmp_A[compute % PREFETCH_COUNT][tz * BATCH_SIZE + i + 1][ty * BATCH_SIZE + j][tx * BATCH_SIZE + k + 1] + c22 * tmp_A[compute % PREFETCH_COUNT][tz * BATCH_SIZE + i + 1][ty * BATCH_SIZE + j + 1][tx * BATCH_SIZE + k + 1] + c32 * tmp_A[compute % PREFETCH_COUNT][tz * BATCH_SIZE + i + 1][ty * BATCH_SIZE + j + 2][tx * BATCH_SIZE + k + 1] + c11 * tmp_A[compute % PREFETCH_COUNT][tz * BATCH_SIZE + i][ty * BATCH_SIZE + j][tx * BATCH_SIZE + k + 2] + c13 * tmp_A[compute % PREFETCH_COUNT][tz * BATCH_SIZE + i + 2][ty * BATCH_SIZE + j][tx * BATCH_SIZE + k + 2] + c21 * tmp_A[compute % PREFETCH_COUNT][tz * BATCH_SIZE + i][ty * BATCH_SIZE + j + 1][tx * BATCH_SIZE + k + 2] + c23 * tmp_A[compute % PREFETCH_COUNT][tz * BATCH_SIZE + i + 2][ty * BATCH_SIZE + j + 1][tx * BATCH_SIZE + k + 2] + c31 * tmp_A[compute % PREFETCH_COUNT][tz * BATCH_SIZE + i][ty * BATCH_SIZE + j + 2][tx * BATCH_SIZE + k + 2] + c33 * tmp_A[compute % PREFETCH_COUNT][tz * BATCH_SIZE + i + 2][ty * BATCH_SIZE + j + 2][tx * BATCH_SIZE + k + 2];
                }
            }
        }
        block.sync();

        // Store B: write the staged tile back to global memory (guarded).
        for (uint64_t i = 0; i < BATCH_SIZE; i++)
        {
            for (uint64_t j = 0; j < BATCH_SIZE; j++)
            {
                for (uint64_t k = 0; k < BATCH_SIZE; k++)
                {
                    if ((index_B_z + i + 1) < NI && (index_B_y + j + 1) < NJ && (index_B_x + k + 1) < NK)
                    {
                        B[(index_B_z + i) * NJ * NK + (index_B_y + j) * NK + index_B_x + k] = tmp_B[tz * BATCH_SIZE + i][ty * BATCH_SIZE + j][tx * BATCH_SIZE + k];
                    }
                }
            }
        }
        block.sync();
    }
}
370
+
371
+
372
+
373
/*
 * Host wrapper: copy A to the device, launch convolution3D_kernel on an
 * nblocks^3 grid of DIM_THREAD_BLOCK^3 thread blocks, and copy B back.
 * block_size is the per-block slab of the volume axis, ceil-divided so the
 * whole extent is covered. Device buffers are allocated/freed by the caller.
 */
void convolution3DCuda(DATA_TYPE* A, DATA_TYPE* B, DATA_TYPE* A_gpu, DATA_TYPE* B_gpu)
{
    dim3 block(DIM_THREAD_BLOCK, DIM_THREAD_BLOCK, DIM_THREAD_BLOCK);
    dim3 grid(nblocks, nblocks, nblocks);

    uint64_t block_size = (NI + (nblocks - 1)) / nblocks;   // ceil(NI / nblocks)

    cudaMemcpy(A_gpu, A, sizeof(DATA_TYPE) * NI * NJ * NK, cudaMemcpyHostToDevice);
    convolution3D_kernel<<<grid, block>>>(A_gpu, B_gpu, NI, NJ, NK, block_size);
    // Surface launch-configuration errors; in-kernel faults show at the sync.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        fprintf(stderr, "convolution3D_kernel launch failed: %s\n", cudaGetErrorString(err));
    cudaDeviceSynchronize();
    cudaMemcpy(B, B_gpu, sizeof(DATA_TYPE) * NI * NJ * NK, cudaMemcpyDeviceToHost);
}
393
+
394
/*
 * Entry point: parse the problem size (NI NJ NK nblocks) from argv, run the
 * CUPTI-traced GPU 3D convolution, and release all resources. The CPU
 * reference run and result comparison remain present but disabled.
 */
int main(int argc, char *argv[])
{
    // Record raw CPU timestamps before any setup so they bracket everything.
    uint64_t start_tsc = rdtsc();
    uint64_t start_tsp = rdtsp();
    printf("start_tsc %lu start_tsp %lu\n", start_tsc, start_tsp);
    if (argc >= 5) {
        NI = atoll(argv[1]);
        NJ = atoll(argv[2]);
        NK = atoll(argv[3]);
        nblocks = atoi(argv[4]);
    } else {
        NI = SIZE;
        NJ = SIZE;
        NK = SIZE;
        nblocks = NBLOCKS;
    }

    DATA_TYPE* A;
    DATA_TYPE* B;
    DATA_TYPE *B_ref;
    DATA_TYPE *A_gpu;
    DATA_TYPE *B_gpu;

    A = (DATA_TYPE*)malloc(NI*NJ*NK*sizeof(DATA_TYPE));
    B = (DATA_TYPE*)malloc(NI*NJ*NK*sizeof(DATA_TYPE));
    B_ref = (DATA_TYPE*)malloc(NI*NJ*NK*sizeof(DATA_TYPE));
    // Large volumes can exhaust host memory; fail loudly instead of crashing.
    if (A == NULL || B == NULL || B_ref == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }
    initCPU(A);
    GPU_argv_init();

    // Tracing window covers device allocation, transfer, and the kernel.
    initTrace();
    startCPU();

    cudaMalloc(&A_gpu, sizeof(DATA_TYPE) * NI * NJ * NK);
    cudaMalloc(&B_gpu, sizeof(DATA_TYPE) * NI * NJ * NK);

    // initGPU(A_gpu);

    convolution3DCuda(A, B, A_gpu, B_gpu);

    cudaFree(A_gpu);
    cudaFree(B_gpu);
    endCPU();
    finiTrace();

    // CPU validation path (enable to cross-check the GPU output):
    // conv3D(A, B_ref);
    // compareResults(B, B_ref);

    free(A);
    free(B);
    free(B_ref);   // was leaked in the original

    return 0;
}
workloads/micro/async/3DCONV/Makefile ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Build the CUPTI-instrumented 3DConvolution benchmark.
include ../../../common/make.config

# CUPTI include/lib paths come from make.config; -arch=sm_80 targets Ampere.
NVCCCFLAGS = -I$(CUPTI_INCLUDE) -L$(CUPTI_LIB_DIR) -std=c++11 -lcuda -lcupti -arch=sm_80 -O3
NVCC = $(CUDA_DIR)/bin/nvcc

EXECUTABLE := 3DConvolution
# Benchmark source plus the shared CUPTI tracing / CPU timestamp helpers.
CUFILES := 3DConvolution.cu $(CUPTI_ADD_COMMON)/cupti_add.cpp $(CUPTI_ADD_COMMON)/cpu_timestamps.cpp

# Single-step compile + link of all sources into the benchmark binary.
all:
	$(NVCC) ${NVCCCFLAGS} ${CUFILES} ${DEF} -o ${EXECUTABLE}
clean:
	rm -f *.o 3DConvolution
workloads/micro/async/3DCONV/run.sh ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ #./3DConvolution 768 768 768 8
2
+ ./3DConvolution 1536 1536 1536 8
workloads/micro/async/3DCONV/run_large.sh ADDED
@@ -0,0 +1 @@
 
 
1
+ ./3DConvolution 384 384 384 8
workloads/micro/async/3DCONV/run_medium.sh ADDED
@@ -0,0 +1 @@
 
 
1
+ ./3DConvolution 192 192 192 4
workloads/micro/async/3DCONV/run_mega.sh ADDED
@@ -0,0 +1 @@
 
 
1
+ ./3DConvolution 1536 1536 1536 8
workloads/micro/async/3DCONV/run_small.sh ADDED
@@ -0,0 +1 @@
 
 
1
+ ./3DConvolution 96 96 96 4
workloads/micro/async/3DCONV/run_super.sh ADDED
@@ -0,0 +1 @@
 
 
1
+ ./3DConvolution 768 768 768 8
workloads/micro/async/3DCONV/run_tiny.sh ADDED
@@ -0,0 +1 @@
 
 
1
+ ./3DConvolution 48 48 48 2
workloads/micro/async/gemm/Makefile ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Build the CUPTI-instrumented gemm benchmark.
include ../../../common/make.config

# CUPTI include/lib paths come from make.config; -arch=sm_80 targets Ampere.
NVCCCFLAGS = -I$(CUPTI_INCLUDE) -L$(CUPTI_LIB_DIR) -std=c++11 -lcuda -lcupti -arch=sm_80 -O3
NVCC = $(CUDA_DIR)/bin/nvcc

EXECUTABLE := gemm
# Benchmark source plus the shared CUPTI tracing / CPU timestamp helpers.
CUFILES := gemm.cu $(CUPTI_ADD_COMMON)/cupti_add.cpp $(CUPTI_ADD_COMMON)/cpu_timestamps.cpp

# Single-step compile + link of all sources into the benchmark binary.
all:
	$(NVCC) ${NVCCCFLAGS} ${CUFILES} ${DEF} -o ${EXECUTABLE}
clean:
	rm -f *.o gemm
workloads/micro/async/gemm/gemm.cu ADDED
@@ -0,0 +1,277 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * gemm.cu: This file is part of the PolyBench/GPU 1.0 test suite.
3
+ *
4
+ *
5
+ * Contact: Scott Grauer-Gray <sgrauerg@gmail.com>
6
+ * Louis-Noel Pouchet <pouchet@cse.ohio-state.edu>
7
+ * Web address: http://www.cse.ohio-state.edu/~pouchet/software/polybench/GPU
8
+ */
9
+
10
+ #include <unistd.h>
11
+ #include <stdio.h>
12
+ #include <time.h>
13
+ #include <sys/time.h>
14
+ #include <stdlib.h>
15
+ #include <stdarg.h>
16
+ #include <string.h>
17
+ #include <cuda.h>
18
+ #include "../../../common/cupti_add.h"
19
+ #include "../../../common/cpu_timestamps.h"
20
+
21
+ #include <cooperative_groups.h>
22
+ #include <cooperative_groups/memcpy_async.h>
23
+
24
+ using namespace nvcuda::experimental;
25
+
26
+ #define PREFETCH_COUNT 2
27
+
28
+ #define SMALL_FLOAT_VAL 0.00000001f
29
+
30
/**
 * Wall-clock time in seconds (microsecond resolution), for coarse
 * host-side interval timing (end - start).
 */
double rtclock()
{
    struct timezone Tzp;
    struct timeval Tp;
    // gettimeofday returns an int status (0 on success). The original stored
    // it in a uint64_t and printed it with %d -- a format-specifier mismatch.
    int stat = gettimeofday(&Tp, &Tzp);
    if (stat != 0)
        printf("Error return from gettimeofday: %d", stat);
    return (Tp.tv_sec + Tp.tv_usec * 1.0e-6);
}
40
+
41
/* Absolute value of a float (ternary form of the original if/else). */
float absVal(float a)
{
    return (a < 0) ? -a : a;
}
52
+
53
/*
 * Relative difference between val1 and val2, as a percentage.
 * Pairs whose magnitudes are both below 0.01 are treated as equal (0%).
 * SMALL_FLOAT_VAL in the denominator guards against division by zero.
 */
float percentDiff(double val1, double val2)
{
    if (absVal(val1) >= 0.01 || absVal(val2) >= 0.01)
    {
        return 100.0f * (absVal(absVal(val1 - val2) / absVal(val1 + SMALL_FLOAT_VAL)));
    }
    return 0.0f;
}
65
+
66
+ //define the error threshold for the results "not matching"
67
+ #define PERCENT_DIFF_ERROR_THRESHOLD 0.05
68
+
69
+ /* Problem size */
70
+ #define SIZE 4096
71
+ uint64_t NI;
72
+ uint64_t NJ;
73
+ uint64_t NK;
74
+
75
+ /* Thread block dimensions */
76
+ #define DIM_THREAD_BLOCK_X 32
77
+ #define DIM_THREAD_BLOCK_Y 32
78
+
79
+
80
+ /* Declared constant values for ALPHA and BETA (same as values in PolyBench 2.0) */
81
+ #define ALPHA 1.1f
82
+ #define BETA 1.1f
83
+
84
+ /* Can switch DATA_TYPE between float and double */
85
+ typedef float DATA_TYPE;
86
+ // typedef uint64_t DATA_TYPE;
87
+
88
/*
 * CPU reference GEMM: C = ALPHA * A * B + BETA * C (row-major),
 * with A of shape NI x NK, B of shape NK x NJ, C of shape NI x NJ.
 * Same float operation order as the original, so results are bit-identical.
 */
void gemm(DATA_TYPE *A, DATA_TYPE *B, DATA_TYPE *C)
{
    for (uint64_t row = 0; row < NI; row++) {
        for (uint64_t col = 0; col < NJ; col++) {
            DATA_TYPE *cell = &C[row * NJ + col];
            *cell *= BETA;
            for (uint64_t kk = 0; kk < NK; ++kk) {
                *cell += ALPHA * A[row * NK + kk] * B[kk * NJ + col];
            }
        }
    }
}
101
+
102
/*
 * Deterministically initialize A (NI x NK), B (NK x NJ), and both output
 * buffers C / C_ref (NI x NJ) with the PolyBench value patterns.
 * C and C_ref start identical so the GPU/CPU paths are comparable.
 */
void init(DATA_TYPE *A, DATA_TYPE *B, DATA_TYPE *C, DATA_TYPE *C_ref)
{
    uint64_t r, c;

    for (r = 0; r < NI; r++)
        for (c = 0; c < NK; c++)
            A[r*NK + c] = ((DATA_TYPE) r*c) / NI;

    for (r = 0; r < NK; r++)
        for (c = 0; c < NJ; c++)
            B[r*NJ + c] = ((DATA_TYPE) r*c + 1) / NJ;

    for (r = 0; r < NI; r++) {
        for (c = 0; c < NJ; c++) {
            DATA_TYPE v = ((DATA_TYPE)r * c + 2) / NJ;
            C[r * NJ + c] = v;
            C_ref[r * NJ + c] = v;
        }
    }
}
122
+
123
+
124
/*
 * Compare the two C matrices element-wise and report every entry whose
 * relative difference exceeds PERCENT_DIFF_ERROR_THRESHOLD.
 */
void compareResults(DATA_TYPE* C, DATA_TYPE* C_outputFromGpu)
{
    uint64_t i, j, fail;
    fail = 0;

    // Compare C1 and C2
    for (i=0; i < NI; i++)
    {
        for (j=0; j < NJ; j++)
        {
            if (percentDiff(C[i*NJ + j], C_outputFromGpu[i*NJ + j]) > PERCENT_DIFF_ERROR_THRESHOLD)
            {
                // %lu matches the uint64_t indices; the original used %d,
                // a format mismatch that garbles output on LP64 targets.
                printf("%lu, %lu, GPU is %f, CPU is %f.\n", i, j, C[i*NJ + j], C_outputFromGpu[i*NJ + j]);
                fail++;
            }
        }
    }

    // Print results (fail is uint64_t, so %lu, not %d)
    printf("Non-Matching CPU-GPU Outputs Beyond Error Threshold of %4.2f Percent: %lu\n", PERCENT_DIFF_ERROR_THRESHOLD, fail);
}
146
+
147
/*
 * Tiled GEMM kernel (C = ALPHA*A*B + BETA*C) using the SM80 async-copy
 * pipeline (nvcuda::experimental::pipeline / memcpy_async) to double-buffer
 * 32x32 tiles of A and B in shared memory (PREFETCH_COUNT = 2 stages).
 *
 * Expected launch: 2-D grid of DIM_THREAD_BLOCK_X x DIM_THREAD_BLOCK_Y
 * blocks covering the NI x NJ output; one thread per C element.
 *
 * NOTE(review): there is no `row < NI` / `col < NJ` guard and the tile loop
 * assumes NK is a multiple of blockDim.x -- dimensions must be multiples of
 * 32 (the provided run scripts satisfy this); confirm before reusing with
 * arbitrary sizes.
 */
__global__ void gemm_kernel(DATA_TYPE *a, DATA_TYPE *b, DATA_TYPE *c, uint64_t NI, uint64_t NK, uint64_t NJ)
{
    cooperative_groups::thread_block block = cooperative_groups::this_thread_block();
    pipeline pipe;

    uint64_t row = blockIdx.y * blockDim.y + threadIdx.y;
    uint64_t col = blockIdx.x * blockDim.x + threadIdx.x;

    // Double-buffered shared tiles of A and B (PREFETCH_COUNT stages each).
    __shared__ DATA_TYPE s_a[DIM_THREAD_BLOCK_X * DIM_THREAD_BLOCK_Y * PREFETCH_COUNT];
    __shared__ DATA_TYPE s_b[DIM_THREAD_BLOCK_X * DIM_THREAD_BLOCK_Y * PREFETCH_COUNT];

    // Seed the accumulator with the scaled existing C value (the BETA term).
    DATA_TYPE tmp = BETA * c[row * NJ + col];

    uint64_t base_tiles = 0;
    uint64_t end_tile = base_tiles + NK / blockDim.x;   // tiles along K

    uint64_t fetch = base_tiles;
    uint64_t tile_size = DIM_THREAD_BLOCK_X;
    uint64_t mem_size = DIM_THREAD_BLOCK_X * DIM_THREAD_BLOCK_Y;

    for (uint64_t compute = fetch; compute < end_tile; compute++)
    {
        // Keep up to PREFETCH_COUNT tiles in flight ahead of the compute tile.
        for (; fetch < end_tile && fetch < compute + PREFETCH_COUNT; fetch++)
        {
            memcpy_async(s_a[(fetch % PREFETCH_COUNT) * mem_size + (threadIdx.y * blockDim.x + threadIdx.x)], a[row * NK + fetch * tile_size + threadIdx.x], pipe);
            memcpy_async(s_b[(fetch % PREFETCH_COUNT) * mem_size + (threadIdx.y * blockDim.x + threadIdx.x)], b[(fetch * tile_size + threadIdx.y) * NJ + col], pipe);

            pipe.commit();
        }
        // Drain: pad with empty commits so wait_prior<> stays valid at the tail.
        if (fetch == end_tile) {
            for (uint64_t i = 0; i < PREFETCH_COUNT-1; ++i) { pipe.commit(); }
            ++fetch;
        }
        pipe.wait_prior<PREFETCH_COUNT - 1>();  // wait for the compute tile's copy
        block.sync();

        // Inner product over this K-tile, entirely out of shared memory.
        for (uint64_t k = 0; k < blockDim.x; k++)
        {
            tmp += ALPHA * s_a[(compute % PREFETCH_COUNT) * mem_size + (threadIdx.y * blockDim.x + k)] * s_b[(compute % PREFETCH_COUNT) * mem_size + (k * blockDim.x + threadIdx.x)];
        }
        block.sync();
    }

    c[row * NJ + col] = tmp;
}
192
+
193
/*
 * Host wrapper: copy A, B, C to the device, launch gemm_kernel on a grid of
 * DIM_THREAD_BLOCK_X x DIM_THREAD_BLOCK_Y blocks covering the NI x NJ output,
 * and copy C back. Device buffers are allocated/freed by the caller.
 */
void gemmCuda(DATA_TYPE *A, DATA_TYPE *B, DATA_TYPE *C, DATA_TYPE *A_gpu, DATA_TYPE *B_gpu, DATA_TYPE *C_gpu)
{
    dim3 block(DIM_THREAD_BLOCK_X, DIM_THREAD_BLOCK_Y);
    dim3 grid((size_t)(ceil( ((float)NI)/ ((float)block.x) )),(size_t)(ceil( ((float)NJ)/ ((float)block.y) )));

    cudaMemcpy(A_gpu, A, sizeof(DATA_TYPE) * NI * NK, cudaMemcpyHostToDevice);
    cudaMemcpy(B_gpu, B, sizeof(DATA_TYPE) * NK * NJ, cudaMemcpyHostToDevice);
    cudaMemcpy(C_gpu, C, sizeof(DATA_TYPE) * NI * NJ, cudaMemcpyHostToDevice);
    gemm_kernel<<< grid, block >>>(A_gpu, B_gpu, C_gpu, NI, NK, NJ);
    // Surface launch-configuration errors; in-kernel faults show at the sync.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        fprintf(stderr, "gemm_kernel launch failed: %s\n", cudaGetErrorString(err));
    cudaDeviceSynchronize();
    cudaMemcpy(C, C_gpu, sizeof(DATA_TYPE) * NI * NJ, cudaMemcpyDeviceToHost);
}
211
+
212
/*
 * Entry point: parse the problem size (NI NK NJ) from argv, run the
 * CUPTI-traced GPU GEMM, and release all resources. The CPU reference run
 * and result comparison remain present but disabled.
 */
int main(int argc, char *argv[])
{
    // Record raw CPU timestamps before any setup so they bracket everything.
    uint64_t start_tsc = rdtsc();
    uint64_t start_tsp = rdtsp();
    printf("start_tsc %lu start_tsp %lu\n", start_tsc, start_tsp);
    if (argc >= 4) {
        NI = atoll(argv[1]);
        NK = atoll(argv[2]);
        NJ = atoll(argv[3]);
    } else {
        NI = SIZE;
        NK = SIZE;
        NJ = SIZE;
    }

    DATA_TYPE* A;
    DATA_TYPE* B;
    DATA_TYPE* C;
    DATA_TYPE *C_ref;

    DATA_TYPE *A_gpu;
    DATA_TYPE *B_gpu;
    DATA_TYPE *C_gpu;

    A = (DATA_TYPE*)malloc(NI*NK*sizeof(DATA_TYPE));
    B = (DATA_TYPE*)malloc(NK*NJ*sizeof(DATA_TYPE));
    C = (DATA_TYPE*)malloc(NI*NJ*sizeof(DATA_TYPE));
    C_ref = (DATA_TYPE *)malloc(NI * NJ * sizeof(DATA_TYPE));
    // Large matrices can exhaust host memory; fail loudly instead of crashing.
    if (A == NULL || B == NULL || C == NULL || C_ref == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }

    init(A, B, C, C_ref);

    GPU_argv_init();

    // Tracing window covers device allocation, transfer, and the kernel.
    initTrace();
    startCPU();

    cudaMalloc(&A_gpu, sizeof(DATA_TYPE) * NI * NK);
    cudaMalloc(&B_gpu, sizeof(DATA_TYPE) * NK * NJ);
    cudaMalloc(&C_gpu, sizeof(DATA_TYPE) * NI * NJ);

    gemmCuda(A, B, C, A_gpu, B_gpu, C_gpu);

    cudaFree(A_gpu);
    cudaFree(B_gpu);
    cudaFree(C_gpu);
    endCPU();
    finiTrace();

    // CPU validation path (enable to cross-check the GPU output):
    // gemm(A, B, C_ref);
    // compareResults(C, C_ref);

    free(A);
    free(B);
    free(C);
    free(C_ref);
    return 0;
}
277
+
workloads/micro/async/gemm/run.sh ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ #./gemm 1024 1024 1024
2
+ ./gemm 32768 32768 32768
3
+ #./gemm 512 512 512
workloads/micro/async/gemm/run_large.sh ADDED
@@ -0,0 +1 @@
 
 
1
+ ./gemm 8192 8192 8192
workloads/micro/async/gemm/run_medium.sh ADDED
@@ -0,0 +1 @@
 
 
1
+ ./gemm 4096 4096 4096
workloads/micro/async/gemm/run_mega.sh ADDED
@@ -0,0 +1 @@
 
 
1
+ ./gemm 32768 32768 32768
workloads/micro/async/gemm/run_small.sh ADDED
@@ -0,0 +1 @@
 
 
1
+ ./gemm 1024 1024 1024
workloads/micro/async/gemm/run_super.sh ADDED
@@ -0,0 +1 @@
 
 
1
+ ./gemm 16384 16384 16384
workloads/micro/async/gemm/run_tiny.sh ADDED
@@ -0,0 +1 @@
 
 
1
+ ./gemm 512 512 512
workloads/micro/async/gemm_perf/Makefile ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Build the CUPTI-instrumented gemm_perf benchmark (same layout as gemm).
include ../../../common/make.config

# CUPTI include/lib paths come from make.config; -arch=sm_80 targets Ampere.
NVCCCFLAGS = -I$(CUPTI_INCLUDE) -L$(CUPTI_LIB_DIR) -std=c++11 -lcuda -lcupti -arch=sm_80 -O3
NVCC = $(CUDA_DIR)/bin/nvcc

EXECUTABLE := gemm
# Benchmark source plus the shared CUPTI tracing / CPU timestamp helpers.
CUFILES := gemm.cu $(CUPTI_ADD_COMMON)/cupti_add.cpp $(CUPTI_ADD_COMMON)/cpu_timestamps.cpp

# Single-step compile + link of all sources into the benchmark binary.
all:
	$(NVCC) ${NVCCCFLAGS} ${CUFILES} ${DEF} -o ${EXECUTABLE}
clean:
	rm -f *.o gemm
workloads/micro/async/gemm_perf/gemm ADDED
Binary file (801 kB). View file
 
workloads/micro/async/gemm_perf/gemm.cu ADDED
@@ -0,0 +1,277 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * gemm.cu: This file is part of the PolyBench/GPU 1.0 test suite.
3
+ *
4
+ *
5
+ * Contact: Scott Grauer-Gray <sgrauerg@gmail.com>
6
+ * Louis-Noel Pouchet <pouchet@cse.ohio-state.edu>
7
+ * Web address: http://www.cse.ohio-state.edu/~pouchet/software/polybench/GPU
8
+ */
9
+
10
+ #include <unistd.h>
11
+ #include <stdio.h>
12
+ #include <time.h>
13
+ #include <sys/time.h>
14
+ #include <stdlib.h>
15
+ #include <stdarg.h>
16
+ #include <string.h>
17
+ #include <cuda.h>
18
+ #include "../../../common/cupti_add.h"
19
+ #include "../../../common/cpu_timestamps.h"
20
+
21
+ #include <cooperative_groups.h>
22
+ #include <cooperative_groups/memcpy_async.h>
23
+
24
+ using namespace nvcuda::experimental;
25
+
26
+ #define PREFETCH_COUNT 2
27
+
28
+ #define SMALL_FLOAT_VAL 0.00000001f
29
+
30
/**
 * Wall-clock time in seconds (microsecond resolution), for coarse
 * host-side interval timing (end - start).
 */
double rtclock()
{
    struct timezone Tzp;
    struct timeval Tp;
    // gettimeofday returns an int status (0 on success). The original stored
    // it in a uint64_t and printed it with %d -- a format-specifier mismatch.
    int stat = gettimeofday(&Tp, &Tzp);
    if (stat != 0)
        printf("Error return from gettimeofday: %d", stat);
    return (Tp.tv_sec + Tp.tv_usec * 1.0e-6);
}
40
+
41
/* Absolute value of a float (ternary form of the original if/else). */
float absVal(float a)
{
    return (a < 0) ? -a : a;
}
52
+
53
/*
 * Relative difference between val1 and val2, as a percentage.
 * Pairs whose magnitudes are both below 0.01 are treated as equal (0%).
 * SMALL_FLOAT_VAL in the denominator guards against division by zero.
 */
float percentDiff(double val1, double val2)
{
    if (absVal(val1) >= 0.01 || absVal(val2) >= 0.01)
    {
        return 100.0f * (absVal(absVal(val1 - val2) / absVal(val1 + SMALL_FLOAT_VAL)));
    }
    return 0.0f;
}
65
+
66
+ //define the error threshold for the results "not matching"
67
+ #define PERCENT_DIFF_ERROR_THRESHOLD 0.05
68
+
69
+ /* Problem size */
70
+ #define SIZE 4096
71
+ uint64_t NI;
72
+ uint64_t NJ;
73
+ uint64_t NK;
74
+
75
+ /* Thread block dimensions */
76
+ #define DIM_THREAD_BLOCK_X 32
77
+ #define DIM_THREAD_BLOCK_Y 32
78
+
79
+
80
+ /* Declared constant values for ALPHA and BETA (same as values in PolyBench 2.0) */
81
+ #define ALPHA 1.1f
82
+ #define BETA 1.1f
83
+
84
+ /* Can switch DATA_TYPE between float and double */
85
+ typedef float DATA_TYPE;
86
+ // typedef uint64_t DATA_TYPE;
87
+
88
/*
 * CPU reference GEMM: C = ALPHA * A * B + BETA * C (row-major),
 * with A of shape NI x NK, B of shape NK x NJ, C of shape NI x NJ.
 * Same float operation order as the original, so results are bit-identical.
 */
void gemm(DATA_TYPE *A, DATA_TYPE *B, DATA_TYPE *C)
{
    for (uint64_t row = 0; row < NI; row++) {
        for (uint64_t col = 0; col < NJ; col++) {
            DATA_TYPE *cell = &C[row * NJ + col];
            *cell *= BETA;
            for (uint64_t kk = 0; kk < NK; ++kk) {
                *cell += ALPHA * A[row * NK + kk] * B[kk * NJ + col];
            }
        }
    }
}
101
+
102
/*
 * Deterministically initialize A (NI x NK), B (NK x NJ), and both output
 * buffers C / C_ref (NI x NJ) with the PolyBench value patterns.
 * C and C_ref start identical so the GPU/CPU paths are comparable.
 */
void init(DATA_TYPE *A, DATA_TYPE *B, DATA_TYPE *C, DATA_TYPE *C_ref)
{
    uint64_t r, c;

    for (r = 0; r < NI; r++)
        for (c = 0; c < NK; c++)
            A[r*NK + c] = ((DATA_TYPE) r*c) / NI;

    for (r = 0; r < NK; r++)
        for (c = 0; c < NJ; c++)
            B[r*NJ + c] = ((DATA_TYPE) r*c + 1) / NJ;

    for (r = 0; r < NI; r++) {
        for (c = 0; c < NJ; c++) {
            DATA_TYPE v = ((DATA_TYPE)r * c + 2) / NJ;
            C[r * NJ + c] = v;
            C_ref[r * NJ + c] = v;
        }
    }
}
122
+
123
+
124
/*
 * Count elementwise mismatches between the two output buffers that exceed
 * PERCENT_DIFF_ERROR_THRESHOLD, printing each offending element and a final
 * summary count.
 *
 * Fix: i, j, and fail are uint64_t; the original printed them with %d,
 * which is undefined behavior for 64-bit arguments. %lu matches the
 * convention already used in main()'s printf of uint64_t values.
 */
void compareResults(DATA_TYPE* C, DATA_TYPE* C_outputFromGpu)
{
	uint64_t i, j, fail;
	fail = 0;

	// Compare the two result matrices element by element.
	for (i=0; i < NI; i++)
	{
		for (j=0; j < NJ; j++)
		{
			if (percentDiff(C[i*NJ + j], C_outputFromGpu[i*NJ + j]) > PERCENT_DIFF_ERROR_THRESHOLD)
			{
				printf("%lu, %lu, GPU is %f, CPU is %f.\n", i, j, C[i*NJ + j], C_outputFromGpu[i*NJ + j]);
				fail++;
			}
		}
	}

	// Print results
	printf("Non-Matching CPU-GPU Outputs Beyond Error Threshold of %4.2f Percent: %lu\n", PERCENT_DIFF_ERROR_THRESHOLD, fail);
}
146
+
147
/*
 * Tiled GEMM device kernel: C = ALPHA * A * B + BETA * C.
 * Each thread computes one element of C. Tiles of A and B are staged into
 * shared memory via nvcuda::experimental memcpy_async through a per-thread
 * pipeline, prefetching up to PREFETCH_COUNT tiles ahead of the tile being
 * consumed (double buffering, since PREFETCH_COUNT == 2).
 *
 * NOTE(review): there is no bounds check on row/col, and end_tile is
 * NK / blockDim.x — correctness assumes NI and NJ are multiples of the block
 * dimensions and NK is a multiple of blockDim.x; TODO confirm all launch
 * configurations guarantee this.
 * Expects blockDim == (DIM_THREAD_BLOCK_X, DIM_THREAD_BLOCK_Y), since the
 * shared buffers are sized from those macros.
 */
__global__ void gemm_kernel(DATA_TYPE *a, DATA_TYPE *b, DATA_TYPE *c, uint64_t NI, uint64_t NK, uint64_t NJ)
{
	cooperative_groups::thread_block block = cooperative_groups::this_thread_block();
	pipeline pipe;

	// Global row/column of the C element this thread owns.
	uint64_t row = blockIdx.y * blockDim.y + threadIdx.y;
	uint64_t col = blockIdx.x * blockDim.x + threadIdx.x;

	// Double-buffered shared staging: PREFETCH_COUNT tiles of A and of B.
	__shared__ DATA_TYPE s_a[DIM_THREAD_BLOCK_X * DIM_THREAD_BLOCK_Y * PREFETCH_COUNT];
	__shared__ DATA_TYPE s_b[DIM_THREAD_BLOCK_X * DIM_THREAD_BLOCK_Y * PREFETCH_COUNT];

	// Apply BETA to the existing C value once; accumulate the dot product in tmp.
	DATA_TYPE tmp = BETA * c[row * NJ + col];

	uint64_t base_tiles = 0;
	uint64_t end_tile = base_tiles + NK / blockDim.x;  // number of K tiles

	uint64_t fetch = base_tiles;
	uint64_t tile_size = DIM_THREAD_BLOCK_X;
	uint64_t mem_size = DIM_THREAD_BLOCK_X * DIM_THREAD_BLOCK_Y;

	for (uint64_t compute = fetch; compute < end_tile; compute++)
	{
		// Issue async copies for up to PREFETCH_COUNT tiles ahead of `compute`;
		// each thread copies one element of the A tile and one of the B tile.
		for (; fetch < end_tile && fetch < compute + PREFETCH_COUNT; fetch++)
		{
			memcpy_async(s_a[(fetch % PREFETCH_COUNT) * mem_size + (threadIdx.y * blockDim.x + threadIdx.x)], a[row * NK + fetch * tile_size + threadIdx.x], pipe);
			memcpy_async(s_b[(fetch % PREFETCH_COUNT) * mem_size + (threadIdx.y * blockDim.x + threadIdx.x)], b[(fetch * tile_size + threadIdx.y) * NJ + col], pipe);

			pipe.commit();
		}
		// After the last real tile is issued, pad the pipeline with empty
		// commits so the fixed-depth wait_prior below stays aligned.
		if (fetch == end_tile) {
			for (uint64_t i = 0; i < PREFETCH_COUNT-1; ++i) { pipe.commit(); }
			++fetch;
		}
		// Block until the tile for this `compute` iteration has landed, then
		// barrier so every thread sees the staged data.
		pipe.wait_prior<PREFETCH_COUNT - 1>();
		block.sync();

		// Multiply the staged A row slice against the staged B column slice.
		for (uint64_t k = 0; k < blockDim.x; k++)
		{
			tmp += ALPHA * s_a[(compute % PREFETCH_COUNT) * mem_size + (threadIdx.y * blockDim.x + k)] * s_b[(compute % PREFETCH_COUNT) * mem_size + (k * blockDim.x + threadIdx.x)];
		}
		// Keep the buffer alive until every thread has consumed it.
		block.sync();
	}

	c[row * NJ + col] = tmp;
}
192
+
193
/*
 * Host wrapper: copy A/B/C to the device, run gemm_kernel, copy C back.
 * Device buffers are allocated and freed by the caller.
 *
 * Fix: the grid was built as (ceil(NI/block.x), ceil(NJ/block.y)), but the
 * kernel maps blockIdx.x to columns (NJ) and blockIdx.y to rows (NI), so for
 * NI != NJ the launch covered the wrong region. The grid is now
 * (ceil(NJ/block.x), ceil(NI/block.y)) to match the kernel's indexing; for
 * the default square sizes the two forms coincide.
 */
void gemmCuda(DATA_TYPE *A, DATA_TYPE *B, DATA_TYPE *C, DATA_TYPE *A_gpu, DATA_TYPE *B_gpu, DATA_TYPE *C_gpu)
{
	dim3 block(DIM_THREAD_BLOCK_X, DIM_THREAD_BLOCK_Y);
	// grid.x covers the column dimension (NJ), grid.y the row dimension (NI).
	dim3 grid((size_t)(ceil( ((float)NJ)/ ((float)block.x) )),(size_t)(ceil( ((float)NI)/ ((float)block.y) )));

	cudaMemcpy(A_gpu, A, sizeof(DATA_TYPE) * NI * NK, cudaMemcpyHostToDevice);
	cudaMemcpy(B_gpu, B, sizeof(DATA_TYPE) * NK * NJ, cudaMemcpyHostToDevice);
	cudaMemcpy(C_gpu, C, sizeof(DATA_TYPE) * NI * NJ, cudaMemcpyHostToDevice);
	gemm_kernel<<< grid, block >>>(A_gpu, B_gpu, C_gpu, NI, NK, NJ);
	cudaDeviceSynchronize();
	cudaMemcpy(C, C_gpu, sizeof(DATA_TYPE) * NI * NJ, cudaMemcpyDeviceToHost);
}
211
+
212
/*
 * Driver: parse optional NI NK NJ from argv (default SIZE = 4096 each),
 * allocate host/device buffers, run the GPU GEMM between CPU timestamp
 * markers, and clean up. The CPU reference run and the result comparison
 * are left commented out.
 */
int main(int argc, char *argv[])
{
	// rdtsc/rdtsp are declared in ../../../common/cpu_timestamps.h —
	// presumably raw TSC-style counters used to correlate CPU and GPU
	// timelines (TODO confirm).
	uint64_t start_tsc = rdtsc();
	uint64_t start_tsp = rdtsp();
	printf("start_tsc %lu start_tsp %lu\n", start_tsc, start_tsp);
	// Matrix dimensions: C (NI x NJ) = A (NI x NK) * B (NK x NJ).
	if (argc >= 4) {
		NI = atoll(argv[1]);
		NK = atoll(argv[2]);
		NJ = atoll(argv[3]);
	} else {
		NI = SIZE;
		NK = SIZE;
		NJ = SIZE;
	}

	double t_start, t_end;  // only used by the commented-out CPU timing below

	DATA_TYPE* A;
	DATA_TYPE* B;
	DATA_TYPE* C;
	DATA_TYPE *C_ref;  // reference output for the (disabled) CPU comparison

	DATA_TYPE *A_gpu;
	DATA_TYPE *B_gpu;
	DATA_TYPE *C_gpu;

	A = (DATA_TYPE*)malloc(NI*NK*sizeof(DATA_TYPE));
	B = (DATA_TYPE*)malloc(NK*NJ*sizeof(DATA_TYPE));
	C = (DATA_TYPE*)malloc(NI*NJ*sizeof(DATA_TYPE));
	C_ref = (DATA_TYPE *)malloc(NI * NJ * sizeof(DATA_TYPE));

	//cudaMallocManaged(&A_gpu, sizeof(DATA_TYPE) * NI * NK);
	//cudaMallocManaged(&B_gpu, sizeof(DATA_TYPE) * NK * NJ);
	//cudaMallocManaged(&C_gpu, sizeof(DATA_TYPE) * NI * NJ);
	init(A, B, C, C_ref);

	// GPU_argv_init/startCPU/endCPU come from the shared cupti_add /
	// cpu_timestamps helpers — presumably device setup and CPU-side
	// timestamping around the measured region (TODO confirm semantics).
	GPU_argv_init();

	//initTrace();
	startCPU();

	// Device allocation is deliberately inside the timed region.
	cudaMalloc(&A_gpu, sizeof(DATA_TYPE) * NI * NK);
	cudaMalloc(&B_gpu, sizeof(DATA_TYPE) * NK * NJ);
	cudaMalloc(&C_gpu, sizeof(DATA_TYPE) * NI * NJ);

	gemmCuda(A, B, C, A_gpu, B_gpu, C_gpu);

	cudaFree(A_gpu);
	cudaFree(B_gpu);
	cudaFree(C_gpu);
	endCPU();
	//finiTrace();

	// t_start = rtclock();
	// gemm(A, B, C_ref);
	// t_end = rtclock();
	// fprintf(stdout, "CPU Runtime: %0.6lfs\n", t_end - t_start);

	// compareResults(C, C_ref);
	free(A);
	free(B);
	free(C);
	free(C_ref);
	return 0;
}
277
+
workloads/micro/async/gemm_perf/run.sh ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ #./gemm 1024 1024 1024
2
+ ./gemm 32768 32768 32768
3
+ #./gemm 512 512 512
workloads/micro/async/gemm_perf/run_large.sh ADDED
@@ -0,0 +1 @@
 
 
1
+ ./gemm 8192 8192 8192
workloads/micro/async/gemm_perf/run_medium.sh ADDED
@@ -0,0 +1 @@
 
 
1
+ ./gemm 4096 4096 4096
workloads/micro/async/gemm_perf/run_mega.sh ADDED
@@ -0,0 +1 @@
 
 
1
+ ./gemm 32768 32768 32768
workloads/micro/async/gemm_perf/run_small.sh ADDED
@@ -0,0 +1 @@
 
 
1
+ ./gemm 1024 1024 1024
workloads/micro/async/gemm_perf/run_super.sh ADDED
@@ -0,0 +1 @@
 
 
1
+ ./gemm 16384 16384 16384
workloads/micro/async/gemm_perf/run_tiny.sh ADDED
@@ -0,0 +1 @@
 
 
1
+ ./gemm 512 512 512
workloads/micro/async/gemv/Makefile ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Build the async GEMV microbenchmark together with the shared CUPTI and
# CPU-timestamp helpers from workloads/common.
include ../../../common/make.config

# CUPTI include/library paths plus CUDA driver and CUPTI libs;
# targets Ampere (sm_80) at -O3.
NVCCCFLAGS = -I$(CUPTI_INCLUDE) -L$(CUPTI_LIB_DIR) -std=c++11 -lcuda -lcupti -arch=sm_80 -O3
NVCC = $(CUDA_DIR)/bin/nvcc

EXECUTABLE := gemv
# Kernel source plus the common tracing/timestamp translation units.
CUFILES := gemv.cu $(CUPTI_ADD_COMMON)/cupti_add.cpp $(CUPTI_ADD_COMMON)/cpu_timestamps.cpp

all:
	$(NVCC) ${NVCCCFLAGS} ${CUFILES} ${DEF} -o ${EXECUTABLE}
clean:
	rm -f *.o gemv
workloads/micro/async/gemv/gemv.cu ADDED
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * gemv.cu: This file is part of the PolyBench/GPU 1.0 test suite.
3
+ *
4
+ *
5
+ * Contact: Scott Grauer-Gray <sgrauerg@gmail.com>
6
+ * Louis-Noel Pouchet <pouchet@cse.ohio-state.edu>
7
+ * Web address: http://www.cse.ohio-state.edu/~pouchet/software/polybench/GPU
8
+ */
9
+
10
+ #include <unistd.h>
11
+ #include <stdio.h>
12
+ #include <time.h>
13
+ #include <sys/time.h>
14
+ #include <stdlib.h>
15
+ #include <stdarg.h>
16
+ #include <string.h>
17
+ #include <cuda.h>
18
+ #include "../../../common/cupti_add.h"
19
+ #include "../../../common/cpu_timestamps.h"
20
+
21
+ #include <cooperative_groups.h>
22
+ #include <cooperative_groups/memcpy_async.h>
23
+
24
+ using namespace nvcuda::experimental;
25
+
26
+ #define PREFETCH_COUNT 2
27
+
28
+ #define SMALL_FLOAT_VAL 0.00000001f
29
+
30
/*
 * Return the current wall-clock time in seconds (gettimeofday resolution).
 * Used for coarse host-side timing of CPU/GPU phases.
 *
 * Fix: gettimeofday() returns int (0 on success, -1 on error); the original
 * stored it in a uint64_t and printed it with %d, which is undefined
 * behavior for a 64-bit argument and can never show -1.
 */
double rtclock()
{
	struct timezone Tzp;
	struct timeval Tp;
	int stat;
	stat = gettimeofday(&Tp, &Tzp);
	if (stat != 0)
		printf("Error return from gettimeofday: %d", stat);
	return (Tp.tv_sec + Tp.tv_usec * 1.0e-6);
}
40
+
41
/* Magnitude of a float: branchless form of the original if/else helper. */
float absVal(float a)
{
	return (a < 0) ? (a * -1) : a;
}
52
+
53
/*
 * Relative difference of val2 from val1, as a percentage.
 * Two values that are both essentially zero (magnitude < 0.01) are treated
 * as equal; SMALL_FLOAT_VAL guards the division against a zero denominator.
 */
float percentDiff(double val1, double val2)
{
	if (absVal(val1) >= 0.01 || absVal(val2) >= 0.01)
	{
		return 100.0f * (absVal(absVal(val1 - val2) / absVal(val1 + SMALL_FLOAT_VAL)));
	}
	return 0.0f;
}
65
+
66
// Error threshold (percent) beyond which CPU/GPU outputs count as "not matching".
#define PERCENT_DIFF_ERROR_THRESHOLD 0.05

/* Problem size: default for NI and NJ when not supplied on argv */
#define SIZE 40960
// Dimensions, set in main(): A is NI x NJ, b has NJ entries, c has NI entries.
uint64_t NI;
uint64_t NJ;

/* Thread block dimensions */
#define DIM_THREAD_BLOCK 256

// Elements of the b vector staged per async chunk in gemv_kernel.
#define BATCH_SIZE 16

/* Declared constant values for ALPHA and BETA (same as values in PolyBench 2.0) */
#define ALPHA 1.1f
#define BETA 1.1f

/* Can switch DATA_TYPE between float and double */
typedef float DATA_TYPE;
// typedef uint64_t DATA_TYPE;
86
+
87
/*
 * Host reference GEMV: C = ALPHA * A * B + BETA * C, with A (NI x NJ)
 * row-major, B a length-NJ vector, and C a length-NI vector.
 * Dimensions come from the file-level globals NI/NJ.
 */
void gemv(DATA_TYPE *A, DATA_TYPE *B, DATA_TYPE *C)
{
	for (uint64_t row = 0; row < NI; row++)
	{
		// Same accumulation order as the in-place original.
		DATA_TYPE acc = C[row] * BETA;
		for (uint64_t col = 0; col < NJ; col++)
		{
			acc += ALPHA * A[row * NJ + col] * B[col];
		}
		C[row] = acc;
	}
}
100
+
101
/*
 * Deterministically fill the matrix A (NI x NJ) and vector B (length NJ),
 * and give C and C_ref identical starting contents (length NI) so the GPU
 * and CPU results can be compared.
 */
void init(DATA_TYPE *A, DATA_TYPE *B, DATA_TYPE *C, DATA_TYPE *C_ref)
{
	for (uint64_t i = 0; i < NI; i++)
		for (uint64_t j = 0; j < NJ; j++)
			A[i * NJ + j] = ((DATA_TYPE)i * j) / NI;

	for (uint64_t j = 0; j < NJ; j++)
		B[j] = ((DATA_TYPE)j + 1) / NJ;

	for (uint64_t i = 0; i < NI; i++)
	{
		DATA_TYPE v = ((DATA_TYPE)i + 2) / NI;
		C[i] = v;
		C_ref[i] = v;
	}
}
118
+
119
/*
 * Count elementwise mismatches between the two output vectors that exceed
 * PERCENT_DIFF_ERROR_THRESHOLD, printing each offending element and a final
 * summary count.
 *
 * Fix: i and fail are uint64_t; the original printed them with %d, which is
 * undefined behavior for 64-bit arguments. %lu matches the convention
 * already used in main()'s printf of uint64_t values.
 */
void compareResults(DATA_TYPE *C, DATA_TYPE *C_outputFromGpu)
{
	uint64_t i, fail;
	fail = 0;

	// Compare the two result vectors element by element.
	for (i = 0; i < NI; i++)
	{
		if (percentDiff(C[i], C_outputFromGpu[i]) > PERCENT_DIFF_ERROR_THRESHOLD)
		{
			fail++;
			printf("%lu, GPU is %f, CPU is %f.\n", i, C[i], C_outputFromGpu[i]);
		}
	}

	// Print results
	printf("Non-Matching CPU-GPU Outputs Beyond Error Threshold of %4.2f Percent: %lu\n", PERCENT_DIFF_ERROR_THRESHOLD, fail);
}
137
+
138
/*
 * GEMV device kernel: c = ALPHA * A * b + BETA * c, one thread per output row.
 * Chunks of BATCH_SIZE elements of the vector b are staged into shared memory
 * with nvcuda::experimental memcpy_async through a per-thread pipeline,
 * prefetching up to PREFETCH_COUNT chunks ahead (double buffering, since
 * PREFETCH_COUNT == 2).
 *
 * NOTE(review): each thread copies its own replica of the same b chunk into
 * s_b[...][tx][...], so the staging is replicated per thread rather than
 * shared across the block — presumably intentional for this microbenchmark;
 * confirm before reusing.
 * NOTE(review): there is no bounds check on `row`, and end_tile is
 * NJ / BATCH_SIZE — assumes NI is a multiple of blockDim.x and NJ a multiple
 * of BATCH_SIZE; TODO confirm launch configurations guarantee this.
 */
__global__ void gemv_kernel(DATA_TYPE *a, DATA_TYPE *b, DATA_TYPE *c, uint64_t NI, uint64_t NJ)
{
	cooperative_groups::thread_block block = cooperative_groups::this_thread_block();
	pipeline pipe;
	uint64_t row = blockIdx.x * blockDim.x + threadIdx.x;
	uint64_t tx = threadIdx.x;

	// Per-thread, double-buffered staging for BATCH_SIZE elements of b.
	__shared__ DATA_TYPE s_b[PREFETCH_COUNT][DIM_THREAD_BLOCK][BATCH_SIZE];

	// Apply BETA to the existing output once; accumulate the dot product in tmp.
	DATA_TYPE tmp = BETA * c[row];
	__syncthreads();

	uint64_t fetch = 0;
	uint64_t end_tile = NJ / BATCH_SIZE;  // number of b chunks

	for (uint64_t compute = fetch; compute < end_tile; compute++)
	{
		// Issue async copies for up to PREFETCH_COUNT chunks ahead of `compute`.
		for (; fetch < end_tile && fetch < compute + PREFETCH_COUNT; fetch++)
		{
			uint64_t base_index = fetch * BATCH_SIZE;
			for (uint64_t k = 0; k < BATCH_SIZE; k++)
			{
				memcpy_async(s_b[fetch % PREFETCH_COUNT][tx][k], b[base_index + k], pipe);
			}
			pipe.commit();
		}
		// After the last real chunk is issued, pad the pipeline with empty
		// commits so the fixed-depth wait_prior below stays aligned.
		if (fetch == end_tile)
		{
			for (uint64_t i = 0; i < PREFETCH_COUNT - 1; ++i)
			{
				pipe.commit();
			}
			++fetch;
		}
		// Block until the chunk for this `compute` iteration has landed.
		pipe.wait_prior<PREFETCH_COUNT - 1>();
		block.sync();

		// Multiply this row's slice of A against the staged b chunk.
		uint64_t base_index = compute * BATCH_SIZE;
		for (uint64_t k = 0; k < BATCH_SIZE; k++)
		{
			tmp += ALPHA * a[row * NJ + base_index + k] * s_b[compute % PREFETCH_COUNT][tx][k];
		}
		// Keep the staged chunk alive until all threads finish with it.
		block.sync();
	}
	c[row] = tmp;
}
184
+
185
/*
 * Host wrapper: copy A/B/C to the device, launch gemv_kernel, copy C back.
 * Device buffers are allocated and freed by the caller.
 *
 * NOTE(review): the grid size is NI / DIM_THREAD_BLOCK with integer
 * truncation and the kernel has no row bounds check, so NI must be a
 * multiple of DIM_THREAD_BLOCK (256) or trailing rows are silently
 * skipped — TODO confirm callers guarantee this.
 */
void gemvCuda(DATA_TYPE *A, DATA_TYPE *B, DATA_TYPE *C, DATA_TYPE *A_gpu, DATA_TYPE *B_gpu, DATA_TYPE *C_gpu)
{
	double t_start, t_end;  // only used by the commented-out timing below

	dim3 block(DIM_THREAD_BLOCK);
	dim3 grid(NI / (DIM_THREAD_BLOCK));

	// t_start = rtclock();
	cudaMemcpy(A_gpu, A, sizeof(DATA_TYPE) * NI * NJ, cudaMemcpyHostToDevice);
	cudaMemcpy(B_gpu, B, sizeof(DATA_TYPE) * NJ, cudaMemcpyHostToDevice);
	cudaMemcpy(C_gpu, C, sizeof(DATA_TYPE) * NI, cudaMemcpyHostToDevice);
	gemv_kernel<<<grid, block>>>(A_gpu, B_gpu, C_gpu, NI, NJ);
	cudaDeviceSynchronize();
	cudaMemcpy(C, C_gpu, sizeof(DATA_TYPE) * NI, cudaMemcpyDeviceToHost);
	// t_end = rtclock();

	// fprintf(stdout, "GPU Runtime: %0.6lfs\n", t_end - t_start);
}
203
+
204
/*
 * Driver: parse optional NI NJ from argv (default SIZE = 40960 each),
 * allocate host/device buffers, run the GPU GEMV between CUPTI trace and
 * CPU timestamp markers, and clean up. The CPU reference run and the
 * result comparison are left commented out.
 */
int main(int argc, char *argv[])
{
	// rdtsc/rdtsp are declared in ../../../common/cpu_timestamps.h —
	// presumably raw TSC-style counters used to correlate CPU and GPU
	// timelines (TODO confirm).
	uint64_t start_tsc = rdtsc();
	uint64_t start_tsp = rdtsp();
	printf("start_tsc %lu start_tsp %lu\n", start_tsc, start_tsp);
	// Dimensions: A is NI x NJ, b has NJ entries, c has NI entries.
	if (argc >= 3)
	{
		NI = atoll(argv[1]);
		NJ = atoll(argv[2]);
	}
	else
	{
		NI = SIZE;
		NJ = SIZE;
	}

	double t_start, t_end;  // only used by the commented-out CPU timing below

	DATA_TYPE *A;
	DATA_TYPE *B;
	DATA_TYPE *C;
	DATA_TYPE *C_ref;  // reference output for the (disabled) CPU comparison

	DATA_TYPE *A_gpu;
	DATA_TYPE *B_gpu;
	DATA_TYPE *C_gpu;

	A = (DATA_TYPE *)malloc(NI * NJ * sizeof(DATA_TYPE));
	B = (DATA_TYPE *)malloc(NJ * sizeof(DATA_TYPE));
	C = (DATA_TYPE *)malloc(NI * sizeof(DATA_TYPE));
	C_ref = (DATA_TYPE *)malloc(NI * sizeof(DATA_TYPE));

	// cudaMallocManaged(&A_gpu, sizeof(DATA_TYPE) * NI * NK);
	// cudaMallocManaged(&B_gpu, sizeof(DATA_TYPE) * NK * NJ);
	// cudaMallocManaged(&C_gpu, sizeof(DATA_TYPE) * NI * NJ);

	init(A, B, C, C_ref);

	// GPU_argv_init/initTrace/startCPU come from the shared cupti_add /
	// cpu_timestamps helpers — presumably device setup, CUPTI activity
	// tracing, and CPU-side timestamping around the measured region
	// (TODO confirm semantics).
	GPU_argv_init();
	initTrace();
	startCPU();

	// Device allocation is deliberately inside the traced/timed region.
	cudaMalloc(&A_gpu, sizeof(DATA_TYPE) * NI * NJ);
	cudaMalloc(&B_gpu, sizeof(DATA_TYPE) * NJ);
	cudaMalloc(&C_gpu, sizeof(DATA_TYPE) * NI);

	gemvCuda(A, B, C, A_gpu, B_gpu, C_gpu);

	cudaFree(A_gpu);
	cudaFree(B_gpu);
	cudaFree(C_gpu);
	endCPU();
	finiTrace();

	// t_start = rtclock();
	// gemv(A, B, C_ref);
	// t_end = rtclock();
	// fprintf(stdout, "CPU Runtime: %0.6lfs\n", t_end - t_start);

	// compareResults(C, C_ref);
	free(A);
	free(B);
	free(C);
	free(C_ref);
	return 0;
}