lrh12580 committed on
Commit ·
5cb6c4b
1
Parent(s): 4f29f93
first commit
Browse files. This view is limited to 50 files because it contains too many changes. See the raw diff.
- README.md +1 -0
- data.zip +3 -0
- env.sh +1 -0
- workloads/common/Makefile +12 -0
- workloads/common/cpu_timestamps.cpp +38 -0
- workloads/common/cpu_timestamps.h +22 -0
- workloads/common/cpu_timestamps.o +0 -0
- workloads/common/cupti_add.cpp +242 -0
- workloads/common/cupti_add.h +56 -0
- workloads/common/cupti_add.o +0 -0
- workloads/common/make.config +10 -0
- workloads/micro/async/2DCONV/2DConvolution.cu +392 -0
- workloads/micro/async/2DCONV/Makefile +12 -0
- workloads/micro/async/2DCONV/run.sh +2 -0
- workloads/micro/async/2DCONV/run_large.sh +1 -0
- workloads/micro/async/2DCONV/run_medium.sh +1 -0
- workloads/micro/async/2DCONV/run_mega.sh +1 -0
- workloads/micro/async/2DCONV/run_small.sh +1 -0
- workloads/micro/async/2DCONV/run_super.sh +1 -0
- workloads/micro/async/2DCONV/run_tiny.sh +1 -0
- workloads/micro/async/3DCONV/3DConvolution.cu +449 -0
- workloads/micro/async/3DCONV/Makefile +12 -0
- workloads/micro/async/3DCONV/run.sh +2 -0
- workloads/micro/async/3DCONV/run_large.sh +1 -0
- workloads/micro/async/3DCONV/run_medium.sh +1 -0
- workloads/micro/async/3DCONV/run_mega.sh +1 -0
- workloads/micro/async/3DCONV/run_small.sh +1 -0
- workloads/micro/async/3DCONV/run_super.sh +1 -0
- workloads/micro/async/3DCONV/run_tiny.sh +1 -0
- workloads/micro/async/gemm/Makefile +12 -0
- workloads/micro/async/gemm/gemm.cu +277 -0
- workloads/micro/async/gemm/run.sh +3 -0
- workloads/micro/async/gemm/run_large.sh +1 -0
- workloads/micro/async/gemm/run_medium.sh +1 -0
- workloads/micro/async/gemm/run_mega.sh +1 -0
- workloads/micro/async/gemm/run_small.sh +1 -0
- workloads/micro/async/gemm/run_super.sh +1 -0
- workloads/micro/async/gemm/run_tiny.sh +1 -0
- workloads/micro/async/gemm_perf/Makefile +12 -0
- workloads/micro/async/gemm_perf/gemm +0 -0
- workloads/micro/async/gemm_perf/gemm.cu +277 -0
- workloads/micro/async/gemm_perf/run.sh +3 -0
- workloads/micro/async/gemm_perf/run_large.sh +1 -0
- workloads/micro/async/gemm_perf/run_medium.sh +1 -0
- workloads/micro/async/gemm_perf/run_mega.sh +1 -0
- workloads/micro/async/gemm_perf/run_small.sh +1 -0
- workloads/micro/async/gemm_perf/run_super.sh +1 -0
- workloads/micro/async/gemm_perf/run_tiny.sh +1 -0
- workloads/micro/async/gemv/Makefile +12 -0
- workloads/micro/async/gemv/gemv.cu +269 -0
README.md
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# uvm_async_bench
|
data.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:24dcfa78fe0a79ebf4b189d3b2c8449842f15a9f30b7b781c400c96ba525756a
|
| 3 |
+
size 8411674489
|
env.sh
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
export UVMAsyncBench_BASE=$(pwd)
|
workloads/common/Makefile
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
include ./make.config

NVCC = $(CUDA_DIR)/bin/nvcc
NVCC_FLAGS = -I$(CUDA_DIR)/include -I$(CUPTI_INCLUDE)

# Fix: the object target was misspelled "cpu_timestapms.o". It never matched
# the file the recipe actually produces (cpu_timestamps.o), so make considered
# the target perpetually out of date (recompiling on every run) and
# `make cpu_timestamps.o` did not resolve to any rule.
all: cpu_timestamps.o cupti_add.o

cpu_timestamps.o: cpu_timestamps.cpp
	$(NVCC) $(NVCC_FLAGS) -c cpu_timestamps.cpp

cupti_add.o: cupti_add.cpp
	$(NVCC) $(NVCC_FLAGS) -c cupti_add.cpp
|
workloads/common/cpu_timestamps.cpp
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "cpu_timestamps.h"
#include <inttypes.h>

/*
 * CPU wall-clock timestamp helpers. Timestamps are nanoseconds since the
 * epoch read from CLOCK_REALTIME and stored in the globals declared in
 * cpu_timestamps.h. endCPU() emits the CSV trace lines consumed by the
 * benchmark post-processing scripts.
 */

// Current CLOCK_REALTIME in integer nanoseconds.
// Fix: the original converted via double (tv_sec * 1.0e9 + tv_nsec). Epoch
// nanoseconds (~1.7e18) exceed the 53-bit double mantissa (~9e15), so results
// were quantized to a few hundred nanoseconds; integer arithmetic is exact.
static uint64_t currentTimeNs() {
    struct timespec tv;
    if (clock_gettime(CLOCK_REALTIME, &tv))
        printf("error clock_gettime\n");
    return (uint64_t)tv.tv_sec * 1000000000ULL + (uint64_t)tv.tv_nsec;
}

// Mark the start of the overall timed region.
void startCPU() {
    startCPUTime = currentTimeNs();
}

// Mark the end of the overall timed region and print both the overall and
// the overlap timing CSV lines (start, end, delta in ns).
// Fix: use PRIu64 instead of "%lu" for uint64_t (identical output on LP64,
// correct on any platform).
void endCPU() {
    endCPUTime = currentTimeNs();
    printf("CPU_Times,%" PRIu64 ",%" PRIu64 ",%" PRIu64 "\n",
           startCPUTime, endCPUTime, endCPUTime - startCPUTime);
    printf("Overlap_Times,%" PRIu64 ",%" PRIu64 ",%" PRIu64 "\n",
           overlapStartCPUTime, overlapEndCPUTime, overlapEndCPUTime - overlapStartCPUTime);
}

// Mark the start of the overlap sub-region (remains 0,0 if never called).
void overlapStartCPU()
{
    overlapStartCPUTime = currentTimeNs();
}

// Mark the end of the overlap sub-region.
void overlapEndCPU()
{
    overlapEndCPUTime = currentTimeNs();
}
|
workloads/common/cpu_timestamps.h
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#ifndef CPU_TIMESTAMP_
#define CPU_TIMESTAMP_

#include <time.h>
#include <sys/time.h>
#include <stdio.h>
#include <stdint.h>
#include <error.h>

/*
 * CPU-side wall-clock timestamp bookkeeping (nanoseconds, CLOCK_REALTIME),
 * implemented in cpu_timestamps.cpp.
 *
 * NOTE(review): these globals are declared `static` in a header, so every
 * translation unit that includes this file gets its own private copies.
 * That is only correct as long as all reads and writes go through the
 * functions below (which live in cpu_timestamps.cpp) — confirm no other
 * TU touches the variables directly.
 */

// Overall timed region (written by startCPU()/endCPU()).
static uint64_t startCPUTime;
static uint64_t endCPUTime;

// Optional overlap sub-region (written by overlapStartCPU()/overlapEndCPU()).
static uint64_t overlapStartCPUTime = 0;
static uint64_t overlapEndCPUTime = 0;

// Record the start/end of the overall region; endCPU() also prints the
// CPU_Times and Overlap_Times CSV lines.
void startCPU();
void endCPU();

// Record the start/end of the overlap sub-region.
void overlapStartCPU();
void overlapEndCPU();

#endif
|
workloads/common/cpu_timestamps.o
ADDED
|
Binary file (4.15 kB). View file
|
|
|
workloads/common/cupti_add.cpp
ADDED
|
@@ -0,0 +1,242 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "cupti_add.h"
#include <stdlib.h>  // malloc/free for activity buffers and demangled names

// Map a CUPTI memcpy kind to the short direction label used in trace lines.
static const char *
getMemcpyKindString(CUpti_ActivityMemcpyKind kind)
{
    switch (kind)
    {
    case CUPTI_ACTIVITY_MEMCPY_KIND_HTOD:
        return "HtoD";
    case CUPTI_ACTIVITY_MEMCPY_KIND_DTOH:
        return "DtoH";
    case CUPTI_ACTIVITY_MEMCPY_KIND_HTOA:
        return "HtoA";
    case CUPTI_ACTIVITY_MEMCPY_KIND_ATOH:
        return "AtoH";
    case CUPTI_ACTIVITY_MEMCPY_KIND_ATOA:
        return "AtoA";
    case CUPTI_ACTIVITY_MEMCPY_KIND_ATOD:
        return "AtoD";
    case CUPTI_ACTIVITY_MEMCPY_KIND_DTOA:
        return "DtoA";
    case CUPTI_ACTIVITY_MEMCPY_KIND_DTOD:
        return "DtoD";
    case CUPTI_ACTIVITY_MEMCPY_KIND_HTOH:
        return "HtoH";
    default:
        break;
    }

    return "<unknown>";
}

// Map a unified-memory counter kind to a printable name.
static const char *
getUvmCounterKindString(CUpti_ActivityUnifiedMemoryCounterKind kind)
{
    switch (kind)
    {
    case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD:
        return "BYTES_TRANSFER_HTOD";
    case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH:
        return "BYTES_TRANSFER_DTOH";
    case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT:
        return "CPU_PAGE_FAULTS";
    case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT:
        return "GPU_PAGE_FAULTS";
    default:
        break;
    }
    return "<unknown>";
}

// Print one CUPTI activity record as a CSV-style trace line. Timestamps are
// the raw CUPTI nanosecond values (not normalized against startTimestamp).
static void
printActivity(CUpti_Activity *record)
{
    switch (record->kind)
    {
    case CUPTI_ACTIVITY_KIND_KERNEL:
    {
        int status;
        CUpti_ActivityKernel4 *kernel = (CUpti_ActivityKernel4 *)record;
        // Fix: __cxa_demangle returns a malloc'd buffer (or NULL on failure).
        // The original leaked the buffer for every kernel record and could
        // pass NULL to printf; fall back to the mangled name and free it.
        char *demangled = abi::__cxa_demangle(kernel->name, 0, 0, &status);
        printf("KERNEL %s, %llu, %llu, %llu\n",
               demangled ? demangled : kernel->name,
               (unsigned long long)(kernel->start),
               (unsigned long long)(kernel->end),
               (unsigned long long)(kernel->end) - (kernel->start));
        free(demangled);
        break;
    }
    case CUPTI_ACTIVITY_KIND_RUNTIME:
    {
        CUpti_ActivityAPI *api = (CUpti_ActivityAPI *)record;
        const char *callback_name;
        cuptiGetCallbackName(CUPTI_CB_DOMAIN_RUNTIME_API, api->cbid, &callback_name);
        printf("RUNTIME %s (cbid=%u), %llu,%llu,%llu, process %u, thread %u, correlation %u\n",
               callback_name, api->cbid,
               (unsigned long long)(api->start),
               (unsigned long long)(api->end),
               (unsigned long long)(api->end - api->start),
               api->processId, api->threadId, api->correlationId);
        break;
    }
    case CUPTI_ACTIVITY_KIND_MEMCPY:
    {
        CUpti_ActivityMemcpy4 *memcpy = (CUpti_ActivityMemcpy4 *)record;
        printf("MEMCPY %s, size %llu, %llu, %llu, %llu\n",
               getMemcpyKindString((CUpti_ActivityMemcpyKind)memcpy->copyKind),
               (unsigned long long)memcpy->bytes,
               (unsigned long long)(memcpy->start),
               (unsigned long long)(memcpy->end),
               (unsigned long long)(memcpy->end) - (memcpy->start));
        break;
    }
    case CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER:
    {
        CUpti_ActivityUnifiedMemoryCounter2 *uvm = (CUpti_ActivityUnifiedMemoryCounter2 *)record;
        printf("UVM MEMCPY %s, size %llu, %llu, %llu, %llu \n",
               getUvmCounterKindString(uvm->counterKind),
               (unsigned long long)uvm->value,
               (unsigned long long)(uvm->start),
               (unsigned long long)(uvm->end),
               (unsigned long long)(uvm->end - uvm->start));
        break;
    }
    default:
        // Fix: explicit default — other activity kinds are not enabled in
        // initTrace() and are intentionally ignored.
        break;
    }
}

// CUPTI callback: supply a buffer for activity records. Returns an
// ALIGN_SIZE-aligned region inside a slightly oversized malloc.
void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size, size_t *maxNumRecords)
{
    uint8_t *bfr = (uint8_t *)malloc(BUF_SIZE + ALIGN_SIZE);
    if (bfr == NULL)
    {
        printf("Error: out of memory\n");
        exit(-1);
    }

    *size = BUF_SIZE;
    *buffer = ALIGN_BUFFER(bfr, ALIGN_SIZE);
    *maxNumRecords = 0; // 0 = fill with as many records as fit
}

// CUPTI callback: drain and print a completed activity buffer, report any
// dropped records, then release the buffer.
void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, size_t size, size_t validSize)
{
    CUptiResult status;
    CUpti_Activity *record = NULL;
    if (validSize > 0)
    {
        do
        {
            status = cuptiActivityGetNextRecord(buffer, validSize, &record);
            if (status == CUPTI_SUCCESS)
            {
                printActivity(record);
            }
            else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED)
                break;
            else
            {
                CUPTI_CALL(status);
            }
        } while (1);

        // report any records dropped from the queue
        size_t dropped;
        CUPTI_CALL(cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped));
        if (dropped != 0)
        {
            printf("Dropped %u activity records\n", (unsigned int)dropped);
        }
    }

    // malloc() returns memory aligned to at least ALIGN_SIZE (8), so the
    // aligned pointer handed to CUPTI equals the original allocation and
    // free(buffer) is safe.
    free(buffer);
}

// Configure unified-memory counters, enable the activity kinds this tool
// traces, register the buffer callbacks, and enlarge the device buffers.
// Must be called before the first CUDA context is created for the buffer
// attributes to apply everywhere (see CUPTI documentation).
void initTrace()
{
    size_t attrValue = 0, attrValueSize = sizeof(size_t);

    CUpti_ActivityUnifiedMemoryCounterConfig config[2];

    // configure unified memory counters (HtoD / DtoH byte transfers on device 0)
    config[0].scope = CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_PROCESS_SINGLE_DEVICE;
    config[0].kind = CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD;
    config[0].deviceId = 0;
    config[0].enable = 1;

    config[1].scope = CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_PROCESS_SINGLE_DEVICE;
    config[1].kind = CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH;
    config[1].deviceId = 0;
    config[1].enable = 1;

    CUptiResult res = cuptiActivityConfigureUnifiedMemoryCounter(config, 2);
    if (res == CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED)
    {
        printf("Test is waived, unified memory is not supported on the underlying platform.\n");
    }
    else if (res == CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED_ON_DEVICE)
    {
        printf("Test is waived, unified memory is not supported on the device.\n");
    }
    else if (res == CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED_ON_NON_P2P_DEVICES)
    {
        printf("Test is waived, unified memory is not supported on the non-P2P multi-gpu setup.\n");
    }
    else
    {
        CUPTI_CALL(res);
    }

    CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL));
    CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME));
    CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY));
    CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER));

    // Register callbacks for buffer requests and for buffers completed by CUPTI.
    CUPTI_CALL(cuptiActivityRegisterCallbacks(bufferRequested, bufferCompleted));

    // Double the device activity-buffer size and pool limit to reduce the
    // chance of dropped records on long-running workloads.
    CUPTI_CALL(cuptiActivityGetAttribute(CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE, &attrValueSize, &attrValue));
    printf("%s = %llu B\n", "CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE", (long long unsigned)attrValue);
    attrValue *= 2;
    CUPTI_CALL(cuptiActivitySetAttribute(CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE, &attrValueSize, &attrValue));

    CUPTI_CALL(cuptiActivityGetAttribute(CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT, &attrValueSize, &attrValue));
    printf("%s = %llu\n", "CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT", (long long unsigned)attrValue);
    attrValue *= 2;
    CUPTI_CALL(cuptiActivitySetAttribute(CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT, &attrValueSize, &attrValue));

    CUPTI_CALL(cuptiGetTimestamp(&startTimestamp));
}

// Flush any remaining activity buffers before the application terminates so
// no trailing records are lost.
void finiTrace()
{
    // Force flush any remaining activity buffers before termination of the application
    CUPTI_CALL(cuptiActivityFlushAll(1));
}

// Select GPU_DEVICE and print its name.
// Fix: the original ignored the cudaError_t results, so a bad device index
// silently continued with garbage properties and an unset device.
void GPU_argv_init() {
    cudaDeviceProp deviceProp;
    cudaError_t err = cudaGetDeviceProperties(&deviceProp, GPU_DEVICE);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceProperties(%d) failed: %s\n",
                GPU_DEVICE, cudaGetErrorString(err));
        exit(-1);
    }
    printf("setting device %d with name %s\n", GPU_DEVICE, deviceProp.name);
    err = cudaSetDevice(GPU_DEVICE);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice(%d) failed: %s\n",
                GPU_DEVICE, cudaGetErrorString(err));
        exit(-1);
    }
}
|
workloads/common/cupti_add.h
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#ifndef CUPTI_ADD_H_
#define CUPTI_ADD_H_
// Fix: this header defines function bodies (rdtsc, rdtsp) and static data,
// but had no include guard — including it twice in one translation unit was
// a redefinition error.

#include <cupti.h>
#include <stdio.h>
#include <cxxabi.h>
#include <time.h>

// Device index every workload runs on.
#define GPU_DEVICE 7

// Host-side CUPTI activity-buffer sizing and alignment helper.
#define BUF_SIZE (32 * 1024)
#define ALIGN_SIZE (8)
#define ALIGN_BUFFER(buffer, align) \
    (((uintptr_t) (buffer) & ((align)-1)) ? ((buffer) + (align) - ((uintptr_t) (buffer) & ((align)-1))) : (buffer))

// Timestamp at trace initialization time. Used to normalized other
// timestamps
// NOTE(review): `static` in a header — every including TU gets its own copy;
// only the copy in cupti_add.cpp is ever written. Confirm no other TU reads it.
static uint64_t startTimestamp;

// Check a CUPTI call: print the error string on failure and exit (exit code 0
// for the "legacy profiler not supported" case so CI treats it as a skip).
#define CUPTI_CALL(call)                                                    \
    do {                                                                    \
        CUptiResult _status = call;                                         \
        if (_status != CUPTI_SUCCESS) {                                     \
            const char *errstr;                                             \
            cuptiGetResultString(_status, &errstr);                         \
            fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \
                    __FILE__, __LINE__, #call, errstr);                     \
            if(_status == CUPTI_ERROR_LEGACY_PROFILER_NOT_SUPPORTED)        \
                exit(0);                                                    \
            else                                                            \
                exit(-1);                                                   \
        }                                                                   \
    } while (0)


// x86 time-stamp counter (raw cycles; x86-only inline asm).
extern inline __attribute__((always_inline)) unsigned long rdtsc() {
    unsigned long a, d;

    __asm__ volatile("rdtsc" : "=a"(a), "=d"(d));

    return (a | (d << 32));
}

// CLOCK_REALTIME in nanoseconds; returns (unsigned long)-1 on failure.
extern inline __attribute__((always_inline)) unsigned long rdtsp() {
    struct timespec tms;
    if (clock_gettime(CLOCK_REALTIME, &tms)) {
        return -1;
    }
    unsigned long ns = tms.tv_sec * 1000000000;
    ns += tms.tv_nsec;
    return ns;
}

// Trace lifecycle and device selection (implemented in cupti_add.cpp).
void initTrace();
void finiTrace();
void GPU_argv_init();
void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size, size_t *maxNumRecords);
void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, size_t size, size_t validSize);
static void printActivity(CUpti_Activity *record);

#endif // CUPTI_ADD_H_
|
workloads/common/cupti_add.o
ADDED
|
Binary file (13.8 kB). View file
|
|
|
workloads/common/make.config
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
CUDA_DIR = /apps/cuda-11.4

# Target architecture (Ampere; was compute_61/sm_61).
# Fix: the arch comments previously sat on the assignment lines
# ("COMPUTE = compute_80 #61"), which in GNU make leaves a trailing space in
# the value ("compute_80 ") and can corrupt nvcc -gencode/-arch flags.
COMPUTE = compute_80
SM_ARCH = sm_80

CUDA_LIB_DIR := $(CUDA_DIR)/lib64
CUPTI_LIB_DIR := $(CUDA_DIR)/extras/CUPTI/lib64/
CUPTI_INCLUDE := $(CUDA_DIR)/extras/CUPTI/include/

# Location of the shared CUPTI/timestamp helpers; UVMAsyncBench_BASE is
# exported by env.sh at the repository root.
CUPTI_ADD_COMMON = $(UVMAsyncBench_BASE)/workloads/common/
|
workloads/micro/async/2DCONV/2DConvolution.cu
ADDED
|
@@ -0,0 +1,392 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* 2DConvolution.cu: This file is part of the PolyBench/GPU 1.0 test suite.
|
| 3 |
+
*
|
| 4 |
+
*
|
| 5 |
+
* Contact: Scott Grauer-Gray <sgrauerg@gmail.com>
|
| 6 |
+
* Louis-Noel Pouchet <pouchet@cse.ohio-state.edu>
|
| 7 |
+
* Web address: http://www.cse.ohio-state.edu/~pouchet/software/polybench/GPU
|
| 8 |
+
*/
|
| 9 |
+
|
| 10 |
+
#include <unistd.h>
|
| 11 |
+
#include <stdio.h>
|
| 12 |
+
#include <time.h>
|
| 13 |
+
#include <sys/time.h>
|
| 14 |
+
#include <stdlib.h>
|
| 15 |
+
#include <stdarg.h>
|
| 16 |
+
#include <string.h>
|
| 17 |
+
#include <cuda.h>
|
| 18 |
+
|
| 19 |
+
#include "../../../common/cupti_add.h"
|
| 20 |
+
#include "../../../common/cpu_timestamps.h"
|
| 21 |
+
|
| 22 |
+
#include <cooperative_groups.h>
|
| 23 |
+
#include <cooperative_groups/memcpy_async.h>
|
| 24 |
+
|
| 25 |
+
using namespace nvcuda::experimental;
|
| 26 |
+
|
| 27 |
+
#define PREFETCH_COUNT 2
|
| 28 |
+
|
| 29 |
+
#define SMALL_FLOAT_VAL 0.00000001f
|
| 30 |
+
|
| 31 |
+
/* Wall-clock time in seconds (microsecond resolution, via gettimeofday). */
double rtclock()
{
    struct timezone Tzp;
    struct timeval Tp;
    // Fix: gettimeofday returns int; the original stored it in a uint64_t
    // and printed it with %d (format/argument mismatch, UB on LP64).
    int stat;
    stat = gettimeofday(&Tp, &Tzp);
    if (stat != 0)
        printf("Error return from gettimeofday: %d", stat);
    return (Tp.tv_sec + Tp.tv_usec * 1.0e-6);
}
|
| 41 |
+
|
| 42 |
+
// Absolute value of a float (condensed form of the original if/else).
float absVal(float a)
{
    return (a < 0) ? (a * -1) : a;
}
|
| 53 |
+
|
| 54 |
+
// Relative difference between val1 and val2, in percent of val1.
// Two near-zero values (|v| < 0.01) are treated as matching to avoid
// dividing noise by noise; SMALL_FLOAT_VAL guards against a zero divisor.
float percentDiff(double val1, double val2)
{
    if (absVal(val1) < 0.01 && absVal(val2) < 0.01)
        return 0.0f;

    return 100.0f * (absVal(absVal(val1 - val2) / absVal(val1 + SMALL_FLOAT_VAL)));
}
|
| 66 |
+
|
| 67 |
+
// Error threshold for declaring CPU and GPU results "not matching" (percent).
#define PERCENT_DIFF_ERROR_THRESHOLD 0.05

/* Problem size */
#define SIZE 4096
#define NBLOCKS 32
#define BATCH_SIZE 4

// Runtime problem dimensions and block count — presumably set during
// initialization outside this excerpt; TODO confirm where they are assigned.
uint64_t NI;
uint64_t NJ;
uint64_t nblocks;

/* Thread block dimensions */
#define KERNEL 3
#define DIM_THREAD_BLOCK 8

/* Can switch DATA_TYPE between float and double */
typedef float DATA_TYPE;
|
| 86 |
+
|
| 87 |
+
// Reference CPU 2D convolution: applies the fixed 3x3 stencil below to the
// NI x NJ row-major grid A, writing B. The one-cell border of B is left
// untouched (the loops run over the interior only).
void conv2D(DATA_TYPE* A, DATA_TYPE* B)
{
    uint64_t i, j;

    // Stencil coefficients, column by column (same values as the GPU kernel).
    const DATA_TYPE c11 = +0.2, c21 = +0.5, c31 = -0.8;
    const DATA_TYPE c12 = -0.3, c22 = +0.6, c32 = -0.9;
    const DATA_TYPE c13 = +0.4, c23 = +0.7, c33 = +0.10;

    for (i = 1; i < NI - 1; ++i)
    {
        for (j = 1; j < NJ - 1; ++j)
        {
            B[i*NJ + j] =
                  c11 * A[(i - 1)*NJ + (j - 1)] + c12 * A[(i + 0)*NJ + (j - 1)] + c13 * A[(i + 1)*NJ + (j - 1)]
                + c21 * A[(i - 1)*NJ + (j + 0)] + c22 * A[(i + 0)*NJ + (j + 0)] + c23 * A[(i + 1)*NJ + (j + 0)]
                + c31 * A[(i - 1)*NJ + (j + 1)] + c32 * A[(i + 0)*NJ + (j + 1)] + c33 * A[(i + 1)*NJ + (j + 1)];
        }
    }
}
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
// Fill the NI x NJ (row-major) GPU input array with the deterministic
// pattern (i*j)/NI so CPU and GPU runs start from identical data.
void initGPU(DATA_TYPE* A_gpu)
{
    for (uint64_t i = 0; i < NI; ++i)
        for (uint64_t j = 0; j < NJ; ++j)
            A_gpu[i * NJ + j] = ((DATA_TYPE)i * j) / NI;
}
|
| 118 |
+
|
| 119 |
+
// Fill the NI x NJ (row-major) CPU reference array with the same
// deterministic pattern (i*j)/NI used by initGPU.
void initCPU(DATA_TYPE* A)
{
    for (uint64_t i = 0; i < NI; ++i)
        for (uint64_t j = 0; j < NJ; ++j)
            A[i * NJ + j] = ((DATA_TYPE)i * j) / NI;
}
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
// Compare the CPU reference output B with the GPU result over the grid
// interior (the convolution never writes the border) and report how many
// elements differ by more than PERCENT_DIFF_ERROR_THRESHOLD percent.
void compareResults(DATA_TYPE* B, DATA_TYPE* B_outputFromGpu)
{
    uint64_t i, j, fail;
    fail = 0;

    // Compare a and b
    for (i=1; i < (NI-1); i++)
    {
        for (j=1; j < (NJ-1); j++)
        {
            if (percentDiff(B[i*NJ + j], B_outputFromGpu[i*NJ + j]) > PERCENT_DIFF_ERROR_THRESHOLD)
            {
                // Fix: i, j (and fail below) are uint64_t; printing them with
                // %d was a format/argument mismatch (UB on LP64).
                printf("%llu, %llu, CPU is %f, GPU is %f.\n",
                       (unsigned long long)i, (unsigned long long)j,
                       B[i * NJ + j], B_outputFromGpu[i * NJ + j]);
                fail++;
            }
        }
    }

    // Print results
    printf("Non-Matching CPU-GPU Outputs Beyond Error Threshold of %4.2f Percent: %llu\n",
           PERCENT_DIFF_ERROR_THRESHOLD, (unsigned long long)fail);

}
|
| 153 |
+
|
| 154 |
+
__global__ void Convolution2D_kernel(DATA_TYPE *A, DATA_TYPE *B, uint64_t NI, uint64_t NJ, uint64_t block_size)
|
| 155 |
+
{
|
| 156 |
+
cooperative_groups::thread_block block = cooperative_groups::this_thread_block();
|
| 157 |
+
pipeline pipe;
|
| 158 |
+
uint64_t tile_dim_x = (NJ + DIM_THREAD_BLOCK - 1) / (DIM_THREAD_BLOCK * BATCH_SIZE);
|
| 159 |
+
|
| 160 |
+
__shared__ DATA_TYPE tmp_A[PREFETCH_COUNT][DIM_THREAD_BLOCK * BATCH_SIZE + KERNEL - 1][DIM_THREAD_BLOCK * BATCH_SIZE + KERNEL - 1];
|
| 161 |
+
__shared__ DATA_TYPE tmp_B[DIM_THREAD_BLOCK * BATCH_SIZE][DIM_THREAD_BLOCK * BATCH_SIZE];
|
| 162 |
+
|
| 163 |
+
uint64_t total_tiles = tile_dim_x * tile_dim_x;
|
| 164 |
+
|
| 165 |
+
uint64_t tiles_this_block_x = (block_size / (DIM_THREAD_BLOCK * BATCH_SIZE));
|
| 166 |
+
uint64_t tiles_this_block = tiles_this_block_x * tiles_this_block_x;
|
| 167 |
+
|
| 168 |
+
// DATA_TYPE c11, c12, c13, c21, c22, c23, c31, c32, c33;
|
| 169 |
+
|
| 170 |
+
// c11 = +0.2; c21 = +0.5; c31 = -0.8;
|
| 171 |
+
// c12 = -0.3; c22 = +0.6; c32 = -0.9;
|
| 172 |
+
// c13 = +0.4; c23 = +0.7; c33 = +0.10;
|
| 173 |
+
|
| 174 |
+
DATA_TYPE c[KERNEL][KERNEL];
|
| 175 |
+
|
| 176 |
+
c[0][0] = +0.2;
|
| 177 |
+
c[1][0] = +0.5;
|
| 178 |
+
c[2][0] = -0.8;
|
| 179 |
+
c[0][1] = -0.3;
|
| 180 |
+
c[1][1] = +0.6;
|
| 181 |
+
c[2][1] = -0.9;
|
| 182 |
+
c[0][2] = +0.4;
|
| 183 |
+
c[1][2] = +0.7;
|
| 184 |
+
c[2][2] = +0.10;
|
| 185 |
+
|
| 186 |
+
uint64_t base_tile = (blockIdx.y * gridDim.x + blockIdx.x) * tiles_this_block;
|
| 187 |
+
uint64_t fetch = base_tile;
|
| 188 |
+
uint64_t end_tile = fetch + tiles_this_block;
|
| 189 |
+
|
| 190 |
+
for (uint64_t compute = fetch; compute < end_tile; compute++)
|
| 191 |
+
{
|
| 192 |
+
for (; fetch < end_tile && fetch < compute + PREFETCH_COUNT; fetch++)
|
| 193 |
+
{
|
| 194 |
+
// block id
|
| 195 |
+
uint64_t offset = fetch - base_tile;
|
| 196 |
+
uint64_t block_id = fetch / tiles_this_block;
|
| 197 |
+
uint64_t bx = block_id % gridDim.x * tiles_this_block_x + offset % tiles_this_block_x;
|
| 198 |
+
uint64_t by = block_id / gridDim.x * tiles_this_block_x + offset / tiles_this_block_x;
|
| 199 |
+
|
| 200 |
+
uint64_t batch_size = DIM_THREAD_BLOCK * BATCH_SIZE;
|
| 201 |
+
|
| 202 |
+
// thread id
|
| 203 |
+
uint64_t tx = threadIdx.x;
|
| 204 |
+
uint64_t ty = threadIdx.y;
|
| 205 |
+
|
| 206 |
+
uint64_t index_A_y = DIM_THREAD_BLOCK * BATCH_SIZE * by + BATCH_SIZE * ty;
|
| 207 |
+
uint64_t index_A_x = DIM_THREAD_BLOCK * BATCH_SIZE * bx + BATCH_SIZE * tx;
|
| 208 |
+
|
| 209 |
+
uint64_t index_A_y_start = DIM_THREAD_BLOCK * BATCH_SIZE * by;
|
| 210 |
+
uint64_t index_A_x_start = DIM_THREAD_BLOCK * BATCH_SIZE * bx;
|
| 211 |
+
|
| 212 |
+
uint64_t index_A_y_bound = DIM_THREAD_BLOCK * BATCH_SIZE * by + BATCH_SIZE * DIM_THREAD_BLOCK;
|
| 213 |
+
uint64_t index_A_x_bound = DIM_THREAD_BLOCK * BATCH_SIZE * bx + BATCH_SIZE * DIM_THREAD_BLOCK;
|
| 214 |
+
|
| 215 |
+
// fetch A
|
| 216 |
+
for (uint64_t i = 0; i < BATCH_SIZE; i++)
|
| 217 |
+
{
|
| 218 |
+
for (uint64_t j = 0; j < BATCH_SIZE; j++)
|
| 219 |
+
{
|
| 220 |
+
if ((index_A_y + i) < NI && (index_A_x + j) < NJ)
|
| 221 |
+
{
|
| 222 |
+
memcpy_async(tmp_A[fetch % PREFETCH_COUNT][ty * BATCH_SIZE + i][tx * BATCH_SIZE + j], A[(index_A_y + i) * NJ + index_A_x + j], pipe);
|
| 223 |
+
tmp_B[ty * BATCH_SIZE + i][tx * BATCH_SIZE + j] = 0;
|
| 224 |
+
}
|
| 225 |
+
}
|
| 226 |
+
}
|
| 227 |
+
|
| 228 |
+
// fetch A -- padding
|
| 229 |
+
for (uint64_t i = 0; i < KERNEL - 1; i++)
|
| 230 |
+
{
|
| 231 |
+
for (uint64_t j = 0; j < BATCH_SIZE * DIM_THREAD_BLOCK + KERNEL - 1; j++)
|
| 232 |
+
{
|
| 233 |
+
if ((index_A_y_bound + i) < NI && (index_A_x_start + j) < NJ)
|
| 234 |
+
{
|
| 235 |
+
memcpy_async(tmp_A[fetch % PREFETCH_COUNT][DIM_THREAD_BLOCK * BATCH_SIZE + i][j], A[(index_A_y_bound + i) * NJ + index_A_x_start + j], pipe);
|
| 236 |
+
}
|
| 237 |
+
}
|
| 238 |
+
}
|
| 239 |
+
|
| 240 |
+
// fetch A -- padding
|
| 241 |
+
for (uint64_t i = 0; i < BATCH_SIZE * DIM_THREAD_BLOCK + KERNEL - 1; i++)
|
| 242 |
+
{
|
| 243 |
+
for (uint64_t j = 0; j < KERNEL - 1; j++)
|
| 244 |
+
{
|
| 245 |
+
if ((index_A_y_start + i) < NI && (index_A_x_bound + j) < NJ)
|
| 246 |
+
{
|
| 247 |
+
memcpy_async(tmp_A[fetch % PREFETCH_COUNT][i][DIM_THREAD_BLOCK * BATCH_SIZE + j], A[(index_A_y_start + i) * NJ + index_A_x_bound + j], pipe);
|
| 248 |
+
}
|
| 249 |
+
}
|
| 250 |
+
}
|
| 251 |
+
pipe.commit();
|
| 252 |
+
}
|
| 253 |
+
if (fetch == end_tile)
|
| 254 |
+
{
|
| 255 |
+
for (uint64_t i = 0; i < PREFETCH_COUNT - 1; ++i)
|
| 256 |
+
{
|
| 257 |
+
pipe.commit();
|
| 258 |
+
}
|
| 259 |
+
++fetch;
|
| 260 |
+
}
|
| 261 |
+
pipe.wait_prior<PREFETCH_COUNT - 1>();
|
| 262 |
+
block.sync();
|
| 263 |
+
|
| 264 |
+
// block id
|
| 265 |
+
uint64_t offset = compute - base_tile;
|
| 266 |
+
uint64_t block_id = compute / tiles_this_block;
|
| 267 |
+
uint64_t bx = block_id % gridDim.x * tiles_this_block_x + offset % tiles_this_block_x;
|
| 268 |
+
uint64_t by = block_id / gridDim.x * tiles_this_block_x + offset / tiles_this_block_x;
|
| 269 |
+
|
| 270 |
+
// thread id
|
| 271 |
+
uint64_t tx = threadIdx.x;
|
| 272 |
+
uint64_t ty = threadIdx.y;
|
| 273 |
+
|
| 274 |
+
uint64_t index_B_y = DIM_THREAD_BLOCK * BATCH_SIZE * by + BATCH_SIZE * ty + 1;
|
| 275 |
+
uint64_t index_B_x = DIM_THREAD_BLOCK * BATCH_SIZE * bx + BATCH_SIZE * tx + 1;
|
| 276 |
+
|
| 277 |
+
// Computation
|
| 278 |
+
for (uint64_t i = 0; i < BATCH_SIZE; i++)
|
| 279 |
+
{
|
| 280 |
+
for (uint64_t j = 0; j < BATCH_SIZE; j++)
|
| 281 |
+
{
|
| 282 |
+
tmp_B[ty * BATCH_SIZE + i][tx * BATCH_SIZE + j] = 0;
|
| 283 |
+
}
|
| 284 |
+
}
|
| 285 |
+
block.sync();
|
| 286 |
+
|
| 287 |
+
for (uint64_t i = 0; i < BATCH_SIZE; i++)
|
| 288 |
+
{
|
| 289 |
+
for (uint64_t j = 0; j < BATCH_SIZE; j++)
|
| 290 |
+
{
|
| 291 |
+
for (uint64_t m = 0; m < KERNEL; m++)
|
| 292 |
+
{
|
| 293 |
+
for (uint64_t n = 0; n < KERNEL; n++)
|
| 294 |
+
{
|
| 295 |
+
tmp_B[ty * BATCH_SIZE + i][tx * BATCH_SIZE + j] += tmp_A[compute % PREFETCH_COUNT][ty * BATCH_SIZE + i + m][tx * BATCH_SIZE + j + n] * c[n][m];
|
| 296 |
+
}
|
| 297 |
+
}
|
| 298 |
+
}
|
| 299 |
+
}
|
| 300 |
+
block.sync();
|
| 301 |
+
|
| 302 |
+
// Store B
|
| 303 |
+
for (uint64_t i = 0; i < BATCH_SIZE; i++)
|
| 304 |
+
{
|
| 305 |
+
for (uint64_t j = 0; j < BATCH_SIZE; j++)
|
| 306 |
+
{
|
| 307 |
+
if ((index_B_y + i) < NI && (index_B_x + j) < NJ)
|
| 308 |
+
{
|
| 309 |
+
B[(index_B_y + i) * NJ + index_B_x + j] = tmp_B[ty * BATCH_SIZE + i][tx * BATCH_SIZE + j];
|
| 310 |
+
}
|
| 311 |
+
}
|
| 312 |
+
}
|
| 313 |
+
block.sync();
|
| 314 |
+
}
|
| 315 |
+
}
|
| 316 |
+
|
| 317 |
+
// Host wrapper for the 2-D convolution: copies A to the device, launches
// Convolution2D_kernel on an nblocks x nblocks grid of
// DIM_THREAD_BLOCK x DIM_THREAD_BLOCK thread blocks, and copies the result
// back into B. A_gpu/B_gpu are device buffers of NI*NJ elements allocated by
// the caller.
void convolution2DCuda(DATA_TYPE *A, DATA_TYPE *B, DATA_TYPE *A_gpu, DATA_TYPE *B_gpu)
{
    dim3 block(DIM_THREAD_BLOCK, DIM_THREAD_BLOCK);
    dim3 grid(nblocks, nblocks);

    // Columns assigned to each block (ceiling division so the tail is covered).
    uint64_t block_size = (NJ + (nblocks - 1)) / nblocks;

    cudaError_t err;

    err = cudaMemcpy(A_gpu, A, sizeof(DATA_TYPE) * NI * NJ, cudaMemcpyHostToDevice);
    if (err != cudaSuccess)
        fprintf(stderr, "convolution2DCuda: H2D copy failed: %s\n", cudaGetErrorString(err));

    Convolution2D_kernel<<<grid, block>>>(A_gpu, B_gpu, NI, NJ, block_size);
    err = cudaGetLastError();               // catches bad launch configuration
    if (err != cudaSuccess)
        fprintf(stderr, "convolution2DCuda: kernel launch failed: %s\n", cudaGetErrorString(err));

    err = cudaDeviceSynchronize();          // surfaces asynchronous execution errors
    if (err != cudaSuccess)
        fprintf(stderr, "convolution2DCuda: kernel execution failed: %s\n", cudaGetErrorString(err));

    err = cudaMemcpy(B, B_gpu, sizeof(DATA_TYPE) * NI * NJ, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess)
        fprintf(stderr, "convolution2DCuda: D2H copy failed: %s\n", cudaGetErrorString(err));
}
|
| 339 |
+
|
| 340 |
+
// Entry point. Usage: ./2DConvolution [NI NJ nblocks]; falls back to the
// compiled-in SIZE/NBLOCKS defaults when fewer than three arguments are given.
int main(int argc, char *argv[])
{
    // Raw timestamps taken as early as possible so CUPTI traces can be aligned.
    uint64_t start_tsc = rdtsc();
    uint64_t start_tsp = rdtsp();
    printf("start_tsc %lu start_tsp %lu\n", start_tsc, start_tsp);

    if (argc >= 4) {
        NI = atoll(argv[1]);
        NJ = atoll(argv[2]);
        nblocks = atoi(argv[3]);
    } else {
        NI = SIZE;
        NJ = SIZE;
        nblocks = NBLOCKS;
    }

    DATA_TYPE *A;
    DATA_TYPE *B;
    DATA_TYPE *B_ref;
    DATA_TYPE *A_gpu;
    DATA_TYPE *B_gpu;

    A = (DATA_TYPE *)malloc(NI * NJ * sizeof(DATA_TYPE));
    B = (DATA_TYPE *)malloc(NI * NJ * sizeof(DATA_TYPE));
    B_ref = (DATA_TYPE *)malloc(NI * NJ * sizeof(DATA_TYPE));
    if (A == NULL || B == NULL || B_ref == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }

    initCPU(A);
    GPU_argv_init();

    // Tracing brackets the device work only.
    initTrace();
    startCPU();

    cudaMalloc(&A_gpu, sizeof(DATA_TYPE) * NI * NJ);
    cudaMalloc(&B_gpu, sizeof(DATA_TYPE) * NI * NJ);

    convolution2DCuda(A, B, A_gpu, B_gpu);

    cudaFree(A_gpu);
    cudaFree(B_gpu);
    endCPU();
    finiTrace();

    // Optional CPU reference check (disabled for benchmarking):
    // conv2D(A, B_ref);
    // compareResults(B, B_ref);

    free(A);
    free(B);
    free(B_ref);   // previously leaked

    return 0;
}
|
workloads/micro/async/2DCONV/Makefile
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Build the async 2-D convolution microbenchmark together with the shared
# CUPTI tracing helpers. Tool paths come from the repository-wide make.config.
include ../../../common/make.config

NVCCCFLAGS = -I$(CUPTI_INCLUDE) -L$(CUPTI_LIB_DIR) -std=c++11 -lcuda -lcupti -arch=sm_80 -O3
NVCC = $(CUDA_DIR)/bin/nvcc

EXECUTABLE := 2DConvolution
CUFILES := 2DConvolution.cu $(CUPTI_ADD_COMMON)/cupti_add.cpp $(CUPTI_ADD_COMMON)/cpu_timestamps.cpp

# 'all' and 'clean' never produce files with those names; mark them phony so
# make always runs the recipes.
.PHONY: all clean

all:
	$(NVCC) ${NVCCCFLAGS} ${CUFILES} ${DEF} -o ${EXECUTABLE}
clean:
	rm -f *.o 2DConvolution
|
workloads/micro/async/2DCONV/run.sh
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Default run of the 2-D convolution benchmark. Arguments: NI NJ nblocks.
# ./2DConvolution 16384 16384 32
./2DConvolution 32768 32768 32
|
workloads/micro/async/2DCONV/run_large.sh
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# "large" problem size. Arguments: NI NJ nblocks.
./2DConvolution 8192 8192 32
|
workloads/micro/async/2DCONV/run_medium.sh
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# "medium" problem size. Arguments: NI NJ nblocks.
./2DConvolution 4096 4096 32
|
workloads/micro/async/2DCONV/run_mega.sh
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# "mega" problem size. Arguments: NI NJ nblocks.
./2DConvolution 65536 65536 32
|
workloads/micro/async/2DCONV/run_small.sh
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# "small" problem size. Arguments: NI NJ nblocks.
./2DConvolution 1024 1024 8
|
workloads/micro/async/2DCONV/run_super.sh
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# "super" problem size. Arguments: NI NJ nblocks.
./2DConvolution 32768 32768 32
|
workloads/micro/async/2DCONV/run_tiny.sh
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# "tiny" problem size. Arguments: NI NJ nblocks.
./2DConvolution 512 512 4
|
workloads/micro/async/3DCONV/3DConvolution.cu
ADDED
|
@@ -0,0 +1,449 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* 3DConvolution.cu: This file is part of the PolyBench/GPU 1.0 test suite.
|
| 3 |
+
*
|
| 4 |
+
*
|
| 5 |
+
* Contact: Scott Grauer-Gray <sgrauerg@gmail.com>
|
| 6 |
+
* Louis-Noel Pouchet <pouchet@cse.ohio-state.edu>
|
| 7 |
+
* Web address: http://www.cse.ohio-state.edu/~pouchet/software/polybench/GPU
|
| 8 |
+
*/
|
| 9 |
+
|
| 10 |
+
#include <unistd.h>
|
| 11 |
+
#include <stdio.h>
|
| 12 |
+
#include <time.h>
|
| 13 |
+
#include <sys/time.h>
|
| 14 |
+
#include <stdlib.h>
|
| 15 |
+
#include <stdarg.h>
|
| 16 |
+
#include <string.h>
|
| 17 |
+
#include <cuda.h>
|
| 18 |
+
|
| 19 |
+
#include "../../../common/cupti_add.h"
|
| 20 |
+
#include "../../../common/cpu_timestamps.h"
|
| 21 |
+
|
| 22 |
+
#include <cooperative_groups.h>
|
| 23 |
+
#include <cooperative_groups/memcpy_async.h>
|
| 24 |
+
|
| 25 |
+
using namespace nvcuda::experimental;
|
| 26 |
+
|
| 27 |
+
#define PREFETCH_COUNT 2
|
| 28 |
+
|
| 29 |
+
#define SMALL_FLOAT_VAL 0.00000001f
|
| 30 |
+
|
| 31 |
+
/*
 * Wall-clock time in seconds (microsecond resolution) via gettimeofday().
 * Used for coarse host-side timing of the benchmark phases.
 */
double rtclock()
{
    struct timezone Tzp;
    struct timeval Tp;
    int stat;   /* gettimeofday() returns int; was uint64_t, which made the
                   "%d" below a format/type mismatch (undefined behavior). */
    stat = gettimeofday(&Tp, &Tzp);
    if (stat != 0)
        printf("Error return from gettimeofday: %d", stat);
    return (Tp.tv_sec + Tp.tv_usec * 1.0e-6);
}
|
| 41 |
+
|
| 42 |
+
/* Absolute value of a float; local equivalent of fabsf(). */
float absVal(float a)
{
    return (a < 0) ? (a * -1) : a;
}
|
| 53 |
+
|
| 54 |
+
/*
 * Relative difference of val2 from val1, as a percentage of val1.
 * Two values that are both essentially zero (magnitude < 0.01) count as
 * identical; SMALL_FLOAT_VAL in the denominator guards against division by
 * zero when val1 is exactly 0.
 */
float percentDiff(double val1, double val2)
{
    if ((absVal(val1) < 0.01) && (absVal(val2) < 0.01))
        return 0.0f;
    return 100.0f * (absVal(absVal(val1 - val2) / absVal(val1 + SMALL_FLOAT_VAL)));
}
|
| 66 |
+
|
| 67 |
+
// define the error threshold for the results "not matching"
|
| 68 |
+
#define PERCENT_DIFF_ERROR_THRESHOLD 0.05
|
| 69 |
+
|
| 70 |
+
/* Problem size */
|
| 71 |
+
#define SIZE 4096
|
| 72 |
+
#define NBLOCKS 2
|
| 73 |
+
#define BATCH_SIZE 3
|
| 74 |
+
|
| 75 |
+
uint64_t NI;
|
| 76 |
+
uint64_t NJ;
|
| 77 |
+
uint64_t NK;
|
| 78 |
+
uint64_t nblocks;
|
| 79 |
+
|
| 80 |
+
/* Thread block dimensions */
|
| 81 |
+
#define DIM_THREAD_BLOCK 4
|
| 82 |
+
|
| 83 |
+
#define KERNEL 3
|
| 84 |
+
|
| 85 |
+
/* Can switch DATA_TYPE between float and double */
|
| 86 |
+
typedef float DATA_TYPE;
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
/*
 * CPU reference implementation of the 3-D stencil, used (when enabled in
 * main) to validate the GPU kernel. Operates on flattened NI x NJ x NK
 * volumes; only interior points [1, N-2] are written.
 *
 * NOTE(review): the index pattern is carried over verbatim from
 * PolyBench/GPU 3DConvolution, including the repeated
 * (j-1)*NK+(k-1) terms for c11/c21/c31 and c13/c23/c33 — it intentionally
 * matches convolution3D_kernel rather than a textbook 3x3x3 stencil.
 */
void conv3D(DATA_TYPE* A, DATA_TYPE* B)
{
    uint64_t i, j, k;
    DATA_TYPE c11, c12, c13, c21, c22, c23, c31, c32, c33;

    /* Fixed stencil coefficients (same values as the GPU kernel). */
    c11 = +2; c21 = +5; c31 = -8;
    c12 = -3; c22 = +6; c32 = -9;
    c13 = +4; c23 = +7; c33 = +10;

    for (i = 1; i < NI - 1; ++i) // 0
    {
        for (j = 1; j < NJ - 1; ++j) // 1
        {
            for (k = 1; k < NK -1; ++k) // 2
            {
                B[i*(NK * NJ) + j*NK + k] = c11 * A[(i - 1)*(NK * NJ) + (j - 1)*NK + (k - 1)] + c13 * A[(i + 1)*(NK * NJ) + (j - 1)*NK + (k - 1)]
                    + c21 * A[(i - 1)*(NK * NJ) + (j - 1)*NK + (k - 1)] + c23 * A[(i + 1)*(NK * NJ) + (j - 1)*NK + (k - 1)]
                    + c31 * A[(i - 1)*(NK * NJ) + (j - 1)*NK + (k - 1)] + c33 * A[(i + 1)*(NK * NJ) + (j - 1)*NK + (k - 1)]
                    + c12 * A[(i + 0)*(NK * NJ) + (j - 1)*NK + (k + 0)] + c22 * A[(i + 0)*(NK * NJ) + (j + 0)*NK + (k + 0)]
                    + c32 * A[(i + 0)*(NK * NJ) + (j + 1)*NK + (k + 0)] + c11 * A[(i - 1)*(NK * NJ) + (j - 1)*NK + (k + 1)]
                    + c13 * A[(i + 1)*(NK * NJ) + (j - 1)*NK + (k + 1)] + c21 * A[(i - 1)*(NK * NJ) + (j + 0)*NK + (k + 1)]
                    + c23 * A[(i + 1)*(NK * NJ) + (j + 0)*NK + (k + 1)] + c31 * A[(i - 1)*(NK * NJ) + (j + 1)*NK + (k + 1)]
                    + c33 * A[(i + 1)*(NK * NJ) + (j + 1)*NK + (k + 1)];
            }
        }
    }
}
|
| 117 |
+
|
| 118 |
+
/*
 * Fill the buffer with the benchmark's deterministic input pattern:
 * value = z%12 + 2*(y%7) + 3*(x%13) at flattened position (z, y, x).
 * Identical pattern to initCPU; kept separate so a device-resident buffer
 * can be seeded directly (currently unused, see main).
 */
void initGPU(DATA_TYPE *A_gpu)
{
    for (uint64_t z = 0; z < NI; ++z)
    {
        for (uint64_t y = 0; y < NJ; ++y)
        {
            for (uint64_t x = 0; x < NK; ++x)
            {
                A_gpu[z * (NK * NJ) + y * NK + x] =
                    z % 12 + 2 * (y % 7) + 3 * (x % 13);
            }
        }
    }
}
|
| 133 |
+
|
| 134 |
+
/*
 * Seed the host input volume with the deterministic pattern
 * value = z%12 + 2*(y%7) + 3*(x%13) at flattened position (z, y, x),
 * so runs are reproducible without an RNG.
 */
void initCPU(DATA_TYPE *A)
{
    for (uint64_t z = 0; z < NI; ++z)
    {
        for (uint64_t y = 0; y < NJ; ++y)
        {
            for (uint64_t x = 0; x < NK; ++x)
            {
                A[z * (NK * NJ) + y * NK + x] =
                    z % 12 + 2 * (y % 7) + 3 * (x % 13);
            }
        }
    }
}
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
/*
 * Compare the CPU reference output with the GPU output over the interior
 * points (the stencil never writes the boundary) and report how many points
 * differ by more than PERCENT_DIFF_ERROR_THRESHOLD percent.
 */
void compareResults(DATA_TYPE* B, DATA_TYPE* B_outputFromGpu)
{
    uint64_t i, j, k, fail;
    fail = 0;

    // Compare result from cpu and gpu...
    for (i = 1; i < NI - 1; ++i) // 0
    {
        for (j = 1; j < NJ - 1; ++j) // 1
        {
            for (k = 1; k < NK - 1; ++k) // 2
            {
                if (percentDiff(B[i*(NK * NJ) + j*NK + k], B_outputFromGpu[i*(NK * NJ) + j*NK + k]) > PERCENT_DIFF_ERROR_THRESHOLD)
                {
                    /* Indices are uint64_t; print with %llu after casting.
                       The previous "%d" conversions were undefined behavior. */
                    printf("%llu, %llu, %llu, CPU is %f, GPU is %f.\n",
                           (unsigned long long)i, (unsigned long long)j, (unsigned long long)k,
                           B[i * (NK * NJ) + j * NK + k], B_outputFromGpu[i * (NK * NJ) + j * NK + k]);
                    fail++;
                }
            }
        }
    }

    // Print results
    printf("Non-Matching CPU-GPU Outputs Beyond Error Threshold of %4.2f Percent: %llu\n",
           PERCENT_DIFF_ERROR_THRESHOLD, (unsigned long long)fail);
}
|
| 175 |
+
|
| 176 |
+
// Tiled 3-D 3x3x3 stencil with software-pipelined shared-memory staging.
// Each thread block walks tiles_this_block tiles of the volume; global->shared
// copies for upcoming tiles are issued with memcpy_async and double-buffered
// in tmp_A (PREFETCH_COUNT slots) while the current tile is computed.
// Launch layout (see convolution3DCuda): nblocks^3 blocks of
// DIM_THREAD_BLOCK^3 threads; each thread handles a BATCH_SIZE^3 sub-batch.
// block_size is the per-block extent in elements along one axis.
__global__ void convolution3D_kernel(DATA_TYPE *A, DATA_TYPE *B, uint64_t NI, uint64_t NJ, uint64_t NK, uint64_t block_size)
{
    cooperative_groups::thread_block block = cooperative_groups::this_thread_block();
    pipeline pipe;
    DATA_TYPE c11, c12, c13, c21, c22, c23, c31, c32, c33;

    // Fixed stencil coefficients (match conv3D on the CPU side).
    c11 = +2;
    c21 = +5;
    c31 = -8;
    c12 = -3;
    c22 = +6;
    c32 = -9;
    c13 = +4;
    c23 = +7;
    c33 = +10;

    // NOTE(review): tile_dim_x is computed but never used below.
    uint64_t tile_dim_x = (NJ + DIM_THREAD_BLOCK - 1) / (DIM_THREAD_BLOCK * BATCH_SIZE);

    // tmp_A: double-buffered input tile including a KERNEL-1 halo on each axis.
    // tmp_B: per-tile output accumulator (no halo).
    __shared__ DATA_TYPE tmp_A[PREFETCH_COUNT][DIM_THREAD_BLOCK * BATCH_SIZE + KERNEL - 1][DIM_THREAD_BLOCK * BATCH_SIZE + KERNEL - 1][DIM_THREAD_BLOCK * BATCH_SIZE + KERNEL - 1];
    __shared__ DATA_TYPE tmp_B[DIM_THREAD_BLOCK * BATCH_SIZE][DIM_THREAD_BLOCK * BATCH_SIZE][DIM_THREAD_BLOCK * BATCH_SIZE];

    // uint64_t total_tiles = tile_dim_x * tile_dim_x * tile_dim_x;

    // Tiles per block along one axis, and in total (cube).
    uint64_t tiles_this_block_x = (block_size / (DIM_THREAD_BLOCK * BATCH_SIZE));
    uint64_t tiles_this_block = tiles_this_block_x * tiles_this_block_x * tiles_this_block_x;

    // First/last tile index owned by this block (linearized over the grid).
    uint64_t base_tile = (blockIdx.z * gridDim.y * gridDim.x + blockIdx.y * gridDim.x + blockIdx.x) * tiles_this_block;
    uint64_t fetch = base_tile;
    uint64_t end_tile = fetch + tiles_this_block;

    // printf("block_size is %d, tile_dim_x is %d, tiles_this_block_x is %d.\n", block_size, tile_dim_x, tiles_this_block_x);

    // Pipelined loop: 'fetch' runs up to PREFETCH_COUNT tiles ahead of
    // 'compute'; pipe.commit()/wait_prior keep the two stages in lockstep.
    for (uint64_t compute = fetch; compute < end_tile; compute++)
    {
        for (; fetch < end_tile && fetch < compute + PREFETCH_COUNT; fetch++)
        {
            // Decompose the linear tile index being fetched into 3-D tile
            // coordinates (bz, by, bx) within the whole volume.
            uint64_t offset = fetch - base_tile;
            uint64_t block_id = fetch / tiles_this_block;

            uint64_t bz = block_id / (gridDim.y * gridDim.x) * tiles_this_block_x + offset / (tiles_this_block_x * tiles_this_block_x);
            uint64_t by = block_id % (gridDim.y * gridDim.x) / gridDim.x * tiles_this_block_x + offset % (tiles_this_block_x * tiles_this_block_x) / tiles_this_block_x;
            uint64_t bx = block_id % (gridDim.y * gridDim.x) % gridDim.x * tiles_this_block_x + offset % (tiles_this_block_x * tiles_this_block_x) % tiles_this_block_x;

            // thread id
            uint64_t tx = threadIdx.x;
            uint64_t ty = threadIdx.y;
            uint64_t tz = threadIdx.z;

            // Global base coordinates of this thread's BATCH_SIZE^3 sub-batch.
            uint64_t index_A_z = DIM_THREAD_BLOCK * BATCH_SIZE * bz + BATCH_SIZE * tz;
            uint64_t index_A_y = DIM_THREAD_BLOCK * BATCH_SIZE * by + BATCH_SIZE * ty;
            uint64_t index_A_x = DIM_THREAD_BLOCK * BATCH_SIZE * bx + BATCH_SIZE * tx;

            // Tile origin and first-past-the-tile coordinates, used for halos.
            uint64_t index_A_z_start = DIM_THREAD_BLOCK * BATCH_SIZE * bz;
            uint64_t index_A_y_start = DIM_THREAD_BLOCK * BATCH_SIZE * by;
            uint64_t index_A_x_start = DIM_THREAD_BLOCK * BATCH_SIZE * bx;

            uint64_t index_A_z_bound = DIM_THREAD_BLOCK * BATCH_SIZE * bz + BATCH_SIZE * DIM_THREAD_BLOCK;
            uint64_t index_A_y_bound = DIM_THREAD_BLOCK * BATCH_SIZE * by + BATCH_SIZE * DIM_THREAD_BLOCK;
            uint64_t index_A_x_bound = DIM_THREAD_BLOCK * BATCH_SIZE * bx + BATCH_SIZE * DIM_THREAD_BLOCK;

            // fetch A: each thread asynchronously stages its own sub-batch
            // into the buffer slot for this tile (fetch % PREFETCH_COUNT).
            for (uint64_t i = 0; i < BATCH_SIZE; i++)
            {
                for (uint64_t j = 0; j < BATCH_SIZE; j++)
                {
                    for (uint64_t k = 0; k < BATCH_SIZE; k++)
                    {
                        if ((index_A_z + i) < NI && (index_A_y + j) < NJ && (index_A_x + k) < NK)
                        {
                            memcpy_async(tmp_A[fetch % PREFETCH_COUNT][tz * BATCH_SIZE + i][ty * BATCH_SIZE + j][tx * BATCH_SIZE + k], A[(index_A_z + i) * NJ * NK + (index_A_y + j) * NK + index_A_x + k], pipe);
                        }
                    }
                }
            }

            // fetch A -- padding: z-axis halo (KERNEL-1 planes past the tile).
            // NOTE(review): every thread in the block issues these same halo
            // copies; presumably redundant but harmless — confirm intended.
            for (uint64_t i = 0; i < KERNEL - 1; i++)
            {
                for (uint64_t j = 0; j < BATCH_SIZE * DIM_THREAD_BLOCK + KERNEL - 1; j++)
                {
                    for (uint64_t k = 0; k < BATCH_SIZE * DIM_THREAD_BLOCK + KERNEL - 1; k++)
                    {
                        if ((index_A_z_bound + i) < NI && (index_A_y_start + j) < NJ && (index_A_x_start + k) < NK)
                        {
                            memcpy_async(tmp_A[fetch % PREFETCH_COUNT][DIM_THREAD_BLOCK * BATCH_SIZE + i][j][k], A[(index_A_z_bound + i) * NJ * NK + (index_A_y_start + j) * NK + index_A_x_start + k], pipe);
                        }
                    }
                }
            }

            // fetch A -- padding: y-axis halo.
            for (uint64_t i = 0; i < BATCH_SIZE * DIM_THREAD_BLOCK + KERNEL - 1; i++)
            {
                for (uint64_t j = 0; j < KERNEL - 1; j++)
                {
                    for (uint64_t k = 0; k < BATCH_SIZE * DIM_THREAD_BLOCK + KERNEL - 1; k++)
                    {
                        if ((index_A_z_start + i) < NI && (index_A_y_bound + j) < NJ && (index_A_x_start + k) < NK)
                        {
                            memcpy_async(tmp_A[fetch % PREFETCH_COUNT][i][DIM_THREAD_BLOCK * BATCH_SIZE + j][k], A[(index_A_z_start + i) * NJ * NK + (index_A_y_bound + j) * NK + index_A_x_start + k], pipe);
                        }
                    }
                }
            }

            // fetch A -- padding: x-axis halo.
            for (uint64_t i = 0; i < BATCH_SIZE * DIM_THREAD_BLOCK + KERNEL - 1; i++)
            {
                for (uint64_t j = 0; j < BATCH_SIZE * DIM_THREAD_BLOCK + KERNEL - 1; j++)
                {
                    for (uint64_t k = 0; k < KERNEL - 1; k++)
                    {
                        if ((index_A_z_start + i) < NI && (index_A_y_start + j) < NJ && (index_A_x_bound + k) < NK)
                        {
                            memcpy_async(tmp_A[fetch % PREFETCH_COUNT][i][j][DIM_THREAD_BLOCK * BATCH_SIZE + k], A[(index_A_z_start + i) * NJ * NK + (index_A_y_start + j) * NK + index_A_x_bound + k], pipe);
                        }
                    }
                }
            }
            // One pipeline stage per fetched tile.
            pipe.commit();
        }
        if (fetch == end_tile)
        {
            // Drain: commit empty stages so wait_prior<N-1> below still has
            // the expected number of outstanding stages on the last tiles.
            for (uint64_t i = 0; i < PREFETCH_COUNT - 1; ++i)
            {
                pipe.commit();
            }
            ++fetch;
        }
        // Block until the tile for 'compute' has landed in shared memory.
        pipe.wait_prior<PREFETCH_COUNT - 1>();
        block.sync();

        // Re-derive this compute tile's 3-D coordinates (same math as fetch).
        uint64_t offset = compute - base_tile;
        uint64_t block_id = compute / tiles_this_block;

        uint64_t bz = block_id / (gridDim.y * gridDim.x) * tiles_this_block_x + offset / (tiles_this_block_x * tiles_this_block_x);
        uint64_t by = block_id % (gridDim.y * gridDim.x) / gridDim.x * tiles_this_block_x + offset % (tiles_this_block_x * tiles_this_block_x) / tiles_this_block_x;
        uint64_t bx = block_id % (gridDim.y * gridDim.x) % gridDim.x * tiles_this_block_x + offset % (tiles_this_block_x * tiles_this_block_x) % tiles_this_block_x;

        // thread id
        uint64_t tx = threadIdx.x;
        uint64_t ty = threadIdx.y;
        uint64_t tz = threadIdx.z;

        // Output coordinates; the +1 skips the volume boundary, matching the
        // CPU reference which only writes interior points.
        uint64_t index_B_z = DIM_THREAD_BLOCK * BATCH_SIZE * bz + BATCH_SIZE * tz + 1;
        uint64_t index_B_y = DIM_THREAD_BLOCK * BATCH_SIZE * by + BATCH_SIZE * ty + 1;
        uint64_t index_B_x = DIM_THREAD_BLOCK * BATCH_SIZE * bx + BATCH_SIZE * tx + 1;


        // Computation: zero this thread's accumulator cells first.
        for (uint64_t i = 0; i < BATCH_SIZE; i++)
        {
            for (uint64_t j = 0; j < BATCH_SIZE; j++)
            {
                for (uint64_t k = 0; k < BATCH_SIZE; k++)
                {
                    tmp_B[tz * BATCH_SIZE + i][ty * BATCH_SIZE + j][tx * BATCH_SIZE + k] = 0;
                }
            }
        }
        block.sync();

        // Apply the 3x3x3 stencil from the staged tile (slot
        // compute % PREFETCH_COUNT). Index pattern mirrors conv3D verbatim.
        for (uint64_t i = 0; i < BATCH_SIZE; i++)
        {
            for (uint64_t j = 0; j < BATCH_SIZE; j++)
            {
                for (uint64_t k = 0; k < BATCH_SIZE; k++)
                {
                    tmp_B[tz * BATCH_SIZE + i][ty * BATCH_SIZE + j][tx * BATCH_SIZE + k] =
                        c11 * tmp_A[compute % PREFETCH_COUNT][tz * BATCH_SIZE + i][ty * BATCH_SIZE + j][tx * BATCH_SIZE + k] + c13 * tmp_A[compute % PREFETCH_COUNT][tz * BATCH_SIZE + i + 2][ty * BATCH_SIZE + j][tx * BATCH_SIZE + k] + c21 * tmp_A[compute % PREFETCH_COUNT][tz * BATCH_SIZE + i][ty * BATCH_SIZE + j][tx * BATCH_SIZE + k] + c23 * tmp_A[compute % PREFETCH_COUNT][tz * BATCH_SIZE + i + 2][ty * BATCH_SIZE + j][tx * BATCH_SIZE + k] + c31 * tmp_A[compute % PREFETCH_COUNT][tz * BATCH_SIZE + i][ty * BATCH_SIZE + j][tx * BATCH_SIZE + k] + c33 * tmp_A[compute % PREFETCH_COUNT][tz * BATCH_SIZE + i + 2][ty * BATCH_SIZE + j][tx * BATCH_SIZE + k] + c12 * tmp_A[compute % PREFETCH_COUNT][tz * BATCH_SIZE + i + 1][ty * BATCH_SIZE + j][tx * BATCH_SIZE + k + 1] + c22 * tmp_A[compute % PREFETCH_COUNT][tz * BATCH_SIZE + i + 1][ty * BATCH_SIZE + j + 1][tx * BATCH_SIZE + k + 1] + c32 * tmp_A[compute % PREFETCH_COUNT][tz * BATCH_SIZE + i + 1][ty * BATCH_SIZE + j + 2][tx * BATCH_SIZE + k + 1] + c11 * tmp_A[compute % PREFETCH_COUNT][tz * BATCH_SIZE + i][ty * BATCH_SIZE + j][tx * BATCH_SIZE + k + 2] + c13 * tmp_A[compute % PREFETCH_COUNT][tz * BATCH_SIZE + i + 2][ty * BATCH_SIZE + j][tx * BATCH_SIZE + k + 2] + c21 * tmp_A[compute % PREFETCH_COUNT][tz * BATCH_SIZE + i][ty * BATCH_SIZE + j + 1][tx * BATCH_SIZE + k + 2] + c23 * tmp_A[compute % PREFETCH_COUNT][tz * BATCH_SIZE + i + 2][ty * BATCH_SIZE + j + 1][tx * BATCH_SIZE + k + 2] + c31 * tmp_A[compute % PREFETCH_COUNT][tz * BATCH_SIZE + i][ty * BATCH_SIZE + j + 2][tx * BATCH_SIZE + k + 2] + c33 * tmp_A[compute % PREFETCH_COUNT][tz * BATCH_SIZE + i + 2][ty * BATCH_SIZE + j + 2][tx * BATCH_SIZE + k + 2];
                }
            }
        }
        block.sync();

        // Store B: write the accumulated tile back to global memory, guarding
        // against the volume edge.
        for (uint64_t i = 0; i < BATCH_SIZE; i++)
        {
            for (uint64_t j = 0; j < BATCH_SIZE; j++)
            {
                for (uint64_t k = 0; k < BATCH_SIZE; k++)
                {
                    if ((index_B_z + i + 1) < NI && (index_B_y + j + 1) < NJ && (index_B_x + k + 1) < NK)
                    {
                        B[(index_B_z + i) * NJ * NK + (index_B_y + j) * NK + index_B_x + k] = tmp_B[tz * BATCH_SIZE + i][ty * BATCH_SIZE + j][tx * BATCH_SIZE + k];
                    }
                }
            }
        }
        block.sync();
    }
}
|
| 370 |
+
|
| 371 |
+
|
| 372 |
+
|
| 373 |
+
// Host wrapper for the 3-D convolution: copies A to the device, launches
// convolution3D_kernel on an nblocks^3 grid of DIM_THREAD_BLOCK^3 thread
// blocks, and copies the result back into B. A_gpu/B_gpu are device buffers
// of NI*NJ*NK elements allocated by the caller.
void convolution3DCuda(DATA_TYPE* A, DATA_TYPE* B, DATA_TYPE* A_gpu, DATA_TYPE* B_gpu)
{
    dim3 block(DIM_THREAD_BLOCK, DIM_THREAD_BLOCK, DIM_THREAD_BLOCK);
    dim3 grid(nblocks, nblocks, nblocks);

    // Per-block extent along one axis (ceiling division covers the tail).
    uint64_t block_size = (NI + (nblocks - 1)) / nblocks;

    cudaError_t err;

    err = cudaMemcpy(A_gpu, A, sizeof(DATA_TYPE) * NI * NJ * NK, cudaMemcpyHostToDevice);
    if (err != cudaSuccess)
        fprintf(stderr, "convolution3DCuda: H2D copy failed: %s\n", cudaGetErrorString(err));

    convolution3D_kernel<<<grid, block>>>(A_gpu, B_gpu, NI, NJ, NK, block_size);
    err = cudaGetLastError();               // catches bad launch configuration
    if (err != cudaSuccess)
        fprintf(stderr, "convolution3DCuda: kernel launch failed: %s\n", cudaGetErrorString(err));

    err = cudaDeviceSynchronize();          // surfaces asynchronous execution errors
    if (err != cudaSuccess)
        fprintf(stderr, "convolution3DCuda: kernel execution failed: %s\n", cudaGetErrorString(err));

    err = cudaMemcpy(B, B_gpu, sizeof(DATA_TYPE) * NI * NJ * NK, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess)
        fprintf(stderr, "convolution3DCuda: D2H copy failed: %s\n", cudaGetErrorString(err));
}
|
| 393 |
+
|
| 394 |
+
// Entry point. Usage: ./3DConvolution [NI NJ NK nblocks]; falls back to the
// compiled-in SIZE/NBLOCKS defaults when fewer than four arguments are given.
int main(int argc, char *argv[])
{
    // Raw timestamps taken as early as possible so CUPTI traces can be aligned.
    uint64_t start_tsc = rdtsc();
    uint64_t start_tsp = rdtsp();
    printf("start_tsc %lu start_tsp %lu\n", start_tsc, start_tsp);

    if (argc >= 5) {
        NI = atoll(argv[1]);
        NJ = atoll(argv[2]);
        NK = atoll(argv[3]);
        nblocks = atoi(argv[4]);
    } else {
        NI = SIZE;
        NJ = SIZE;
        NK = SIZE;
        nblocks = NBLOCKS;
    }

    DATA_TYPE *A;
    DATA_TYPE *B;
    DATA_TYPE *B_ref;
    DATA_TYPE *A_gpu;
    DATA_TYPE *B_gpu;

    A = (DATA_TYPE *)malloc(NI * NJ * NK * sizeof(DATA_TYPE));
    B = (DATA_TYPE *)malloc(NI * NJ * NK * sizeof(DATA_TYPE));
    B_ref = (DATA_TYPE *)malloc(NI * NJ * NK * sizeof(DATA_TYPE));
    if (A == NULL || B == NULL || B_ref == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }

    initCPU(A);
    GPU_argv_init();

    // Tracing brackets the device work only.
    initTrace();
    startCPU();

    cudaMalloc(&A_gpu, sizeof(DATA_TYPE) * NI * NJ * NK);
    cudaMalloc(&B_gpu, sizeof(DATA_TYPE) * NI * NJ * NK);

    // initGPU(A_gpu);

    convolution3DCuda(A, B, A_gpu, B_gpu);

    cudaFree(A_gpu);
    cudaFree(B_gpu);
    endCPU();
    finiTrace();

    // Optional CPU reference check (disabled for benchmarking):
    // conv3D(A, B_ref);
    // compareResults(B, B_ref);

    free(A);
    free(B);
    free(B_ref);   // previously leaked

    return 0;
}
|
workloads/micro/async/3DCONV/Makefile
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Build the async 3-D convolution microbenchmark together with the shared
# CUPTI tracing helpers. Tool paths come from the repository-wide make.config.
include ../../../common/make.config

NVCCCFLAGS = -I$(CUPTI_INCLUDE) -L$(CUPTI_LIB_DIR) -std=c++11 -lcuda -lcupti -arch=sm_80 -O3
NVCC = $(CUDA_DIR)/bin/nvcc

EXECUTABLE := 3DConvolution
CUFILES := 3DConvolution.cu $(CUPTI_ADD_COMMON)/cupti_add.cpp $(CUPTI_ADD_COMMON)/cpu_timestamps.cpp

# 'all' and 'clean' never produce files with those names; mark them phony so
# make always runs the recipes.
.PHONY: all clean

all:
	$(NVCC) ${NVCCCFLAGS} ${CUFILES} ${DEF} -o ${EXECUTABLE}
clean:
	rm -f *.o 3DConvolution
|
workloads/micro/async/3DCONV/run.sh
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Default run of the 3-D convolution benchmark. Arguments: NI NJ NK nblocks.
#./3DConvolution 768 768 768 8
./3DConvolution 1536 1536 1536 8
|
workloads/micro/async/3DCONV/run_large.sh
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# "large" problem size. Arguments: NI NJ NK nblocks.
./3DConvolution 384 384 384 8
|
workloads/micro/async/3DCONV/run_medium.sh
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# "medium" problem size. Arguments: NI NJ NK nblocks.
./3DConvolution 192 192 192 4
|
workloads/micro/async/3DCONV/run_mega.sh
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# "mega" problem size. Arguments: NI NJ NK nblocks.
./3DConvolution 1536 1536 1536 8
|
workloads/micro/async/3DCONV/run_small.sh
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# "small" problem size. Arguments: NI NJ NK nblocks.
./3DConvolution 96 96 96 4
|
workloads/micro/async/3DCONV/run_super.sh
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# "super" problem size. Arguments: NI NJ NK nblocks.
./3DConvolution 768 768 768 8
|
workloads/micro/async/3DCONV/run_tiny.sh
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# "tiny" problem size. Arguments: NI NJ NK nblocks.
./3DConvolution 48 48 48 2
|
workloads/micro/async/gemm/Makefile
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Build the async GEMM microbenchmark together with the shared CUPTI tracing
# helpers. Tool paths come from the repository-wide make.config.
include ../../../common/make.config

NVCCCFLAGS = -I$(CUPTI_INCLUDE) -L$(CUPTI_LIB_DIR) -std=c++11 -lcuda -lcupti -arch=sm_80 -O3
NVCC = $(CUDA_DIR)/bin/nvcc

EXECUTABLE := gemm
CUFILES := gemm.cu $(CUPTI_ADD_COMMON)/cupti_add.cpp $(CUPTI_ADD_COMMON)/cpu_timestamps.cpp

# 'all' and 'clean' never produce files with those names; mark them phony so
# make always runs the recipes.
.PHONY: all clean

all:
	$(NVCC) ${NVCCCFLAGS} ${CUFILES} ${DEF} -o ${EXECUTABLE}
clean:
	rm -f *.o gemm
|
workloads/micro/async/gemm/gemm.cu
ADDED
|
@@ -0,0 +1,277 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* gemm.cu: This file is part of the PolyBench/GPU 1.0 test suite.
|
| 3 |
+
*
|
| 4 |
+
*
|
| 5 |
+
* Contact: Scott Grauer-Gray <sgrauerg@gmail.com>
|
| 6 |
+
* Louis-Noel Pouchet <pouchet@cse.ohio-state.edu>
|
| 7 |
+
* Web address: http://www.cse.ohio-state.edu/~pouchet/software/polybench/GPU
|
| 8 |
+
*/
|
| 9 |
+
|
| 10 |
+
#include <unistd.h>
|
| 11 |
+
#include <stdio.h>
|
| 12 |
+
#include <time.h>
|
| 13 |
+
#include <sys/time.h>
|
| 14 |
+
#include <stdlib.h>
|
| 15 |
+
#include <stdarg.h>
|
| 16 |
+
#include <string.h>
|
| 17 |
+
#include <cuda.h>
|
| 18 |
+
#include "../../../common/cupti_add.h"
|
| 19 |
+
#include "../../../common/cpu_timestamps.h"
|
| 20 |
+
|
| 21 |
+
#include <cooperative_groups.h>
|
| 22 |
+
#include <cooperative_groups/memcpy_async.h>
|
| 23 |
+
|
| 24 |
+
using namespace nvcuda::experimental;
|
| 25 |
+
|
| 26 |
+
#define PREFETCH_COUNT 2
|
| 27 |
+
|
| 28 |
+
#define SMALL_FLOAT_VAL 0.00000001f
|
| 29 |
+
|
| 30 |
+
/*
 * Wall-clock time in seconds with microsecond resolution.
 * Fix: gettimeofday() returns int; the original stored it in a uint64_t and
 * printed it with "%d" (a format/type mismatch, and -1 would wrap to a huge
 * unsigned value, also defeating the != 0 intent on some ABIs).
 */
double rtclock()
{
    struct timezone Tzp;
    struct timeval Tp;
    int stat;
    stat = gettimeofday(&Tp, &Tzp);
    if (stat != 0)
        printf("Error return from gettimeofday: %d", stat);
    return (Tp.tv_sec + Tp.tv_usec * 1.0e-6);
}
|
| 40 |
+
|
| 41 |
+
/* Absolute value of a single-precision float. */
float absVal(float a)
{
    return (a < 0) ? (a * -1) : a;
}
|
| 52 |
+
|
| 53 |
+
/*
 * Relative difference between val1 and val2, in percent.
 * Two values that are both below 0.01 in magnitude are treated as equal;
 * SMALL_FLOAT_VAL keeps the denominator nonzero when val1 == 0.
 */
float percentDiff(double val1, double val2)
{
    if ((absVal(val1) < 0.01) && (absVal(val2) < 0.01))
        return 0.0f;

    return 100.0f * (absVal(absVal(val1 - val2) / absVal(val1 + SMALL_FLOAT_VAL)));
}
|
| 65 |
+
|
| 66 |
+
//define the error threshold for the results "not matching"
|
| 67 |
+
#define PERCENT_DIFF_ERROR_THRESHOLD 0.05
|
| 68 |
+
|
| 69 |
+
/* Problem size */
|
| 70 |
+
#define SIZE 4096
|
| 71 |
+
uint64_t NI;
|
| 72 |
+
uint64_t NJ;
|
| 73 |
+
uint64_t NK;
|
| 74 |
+
|
| 75 |
+
/* Thread block dimensions */
|
| 76 |
+
#define DIM_THREAD_BLOCK_X 32
|
| 77 |
+
#define DIM_THREAD_BLOCK_Y 32
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
/* Declared constant values for ALPHA and BETA (same as values in PolyBench 2.0) */
|
| 81 |
+
#define ALPHA 1.1f
|
| 82 |
+
#define BETA 1.1f
|
| 83 |
+
|
| 84 |
+
/* Can switch DATA_TYPE between float and double */
|
| 85 |
+
typedef float DATA_TYPE;
|
| 86 |
+
// typedef uint64_t DATA_TYPE;
|
| 87 |
+
|
| 88 |
+
/*
 * Reference CPU GEMM: C = ALPHA * A * B + BETA * C.
 * Sizes come from the file-level globals: A is NI x NK, B is NK x NJ,
 * C is NI x NJ, all row-major.
 */
void gemm(DATA_TYPE *A, DATA_TYPE *B, DATA_TYPE *C)
{
    for (uint64_t i = 0; i < NI; i++)
    {
        for (uint64_t j = 0; j < NJ; j++)
        {
            /* Same arithmetic order as the in-place version: scale, then
               accumulate over k, so float rounding is identical. */
            DATA_TYPE acc = C[i*NJ + j] * BETA;
            for (uint64_t k = 0; k < NK; ++k)
            {
                acc += ALPHA * A[i*NK + k] * B[k*NJ + j];
            }
            C[i*NJ + j] = acc;
        }
    }
}
|
| 101 |
+
|
| 102 |
+
/*
 * Deterministic initialization: fills A (NI x NK) and B (NK x NJ) with
 * index-derived patterns, and gives C and C_ref identical contents so the
 * GPU result can later be compared against the CPU reference.
 */
void init(DATA_TYPE *A, DATA_TYPE *B, DATA_TYPE *C, DATA_TYPE *C_ref)
{
    for (uint64_t i = 0; i < NI; i++)
        for (uint64_t j = 0; j < NK; j++)
            A[i*NK + j] = ((DATA_TYPE) i*j) / NI;

    for (uint64_t i = 0; i < NK; i++)
        for (uint64_t j = 0; j < NJ; j++)
            B[i*NJ + j] = ((DATA_TYPE) i*j + 1) / NJ;

    for (uint64_t i = 0; i < NI; i++)
    {
        for (uint64_t j = 0; j < NJ; j++)
        {
            DATA_TYPE v = ((DATA_TYPE)i * j + 2) / NJ;
            C[i * NJ + j] = v;
            C_ref[i * NJ + j] = v;
        }
    }
}
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
/*
 * Count elementwise mismatches between C (GPU result) and C_outputFromGpu
 * (CPU reference) that exceed PERCENT_DIFF_ERROR_THRESHOLD, printing each
 * offender and a final summary.
 * Fix: the indices and the fail counter are uint64_t, but the original
 * printed them with "%d" — undefined behavior per the C standard. Use "%lu".
 */
void compareResults(DATA_TYPE* C, DATA_TYPE* C_outputFromGpu)
{
    uint64_t i, j, fail;
    fail = 0;

    // Compare C1 and C2
    for (i = 0; i < NI; i++)
    {
        for (j = 0; j < NJ; j++)
        {
            if (percentDiff(C[i*NJ + j], C_outputFromGpu[i*NJ + j]) > PERCENT_DIFF_ERROR_THRESHOLD)
            {
                printf("%lu, %lu, GPU is %f, CPU is %f.\n", i, j, C[i*NJ + j], C_outputFromGpu[i*NJ + j]);
                fail++;
            }
        }
    }

    // Print results
    printf("Non-Matching CPU-GPU Outputs Beyond Error Threshold of %4.2f Percent: %lu\n", PERCENT_DIFF_ERROR_THRESHOLD, fail);
}
|
| 146 |
+
|
| 147 |
+
// Tiled GEMM kernel: C = ALPHA * A * B + BETA * C, one output element per
// thread at (row, col). Tiles of A and B are staged into shared memory with
// the experimental nvcuda pipeline (memcpy_async), double-buffered across
// PREFETCH_COUNT stages so the next tile's loads overlap this tile's math.
// NOTE(review): there are no bounds checks — this assumes NI and NJ are
// multiples of the block dimensions and NK is a multiple of blockDim.x;
// confirm against the launch configuration in gemmCuda.
__global__ void gemm_kernel(DATA_TYPE *a, DATA_TYPE *b, DATA_TYPE *c, uint64_t NI, uint64_t NK, uint64_t NJ)
{
    cooperative_groups::thread_block block = cooperative_groups::this_thread_block();
    pipeline pipe;  // per-thread pipeline object for the async global->shared copies

    uint64_t row = blockIdx.y * blockDim.y + threadIdx.y;
    uint64_t col = blockIdx.x * blockDim.x + threadIdx.x;

    // Double-buffered shared staging: PREFETCH_COUNT tiles of 32x32 elements each.
    __shared__ DATA_TYPE s_a[DIM_THREAD_BLOCK_X * DIM_THREAD_BLOCK_Y * PREFETCH_COUNT];
    __shared__ DATA_TYPE s_b[DIM_THREAD_BLOCK_X * DIM_THREAD_BLOCK_Y * PREFETCH_COUNT];

    // Start the accumulator from the scaled existing C value.
    DATA_TYPE tmp = BETA * c[row * NJ + col];

    uint64_t base_tiles = 0;
    uint64_t end_tile = base_tiles + NK / blockDim.x;  // number of K-dimension tiles

    uint64_t fetch = base_tiles;                       // index of the next tile to prefetch
    uint64_t tile_size = DIM_THREAD_BLOCK_X;
    uint64_t mem_size = DIM_THREAD_BLOCK_X * DIM_THREAD_BLOCK_Y;  // elements per stage

    for (uint64_t compute = fetch; compute < end_tile; compute++)
    {
        // Keep up to PREFETCH_COUNT tiles in flight ahead of the compute stage.
        for (; fetch < end_tile && fetch < compute + PREFETCH_COUNT; fetch++)
        {
            // Each thread copies one element of the A tile and one of the B tile
            // into the (fetch % PREFETCH_COUNT) stage of the shared buffers.
            memcpy_async(s_a[(fetch % PREFETCH_COUNT) * mem_size + (threadIdx.y * blockDim.x + threadIdx.x)], a[row * NK + fetch * tile_size + threadIdx.x], pipe);
            memcpy_async(s_b[(fetch % PREFETCH_COUNT) * mem_size + (threadIdx.y * blockDim.x + threadIdx.x)], b[(fetch * tile_size + threadIdx.y) * NJ + col], pipe);

            pipe.commit();  // close this stage's batch of copies
        }
        // After the last real tile is issued, pad with empty commits once so
        // wait_prior<PREFETCH_COUNT-1> still drains the remaining stages.
        if (fetch == end_tile) {
            for (uint64_t i = 0; i < PREFETCH_COUNT-1; ++i) { pipe.commit(); }
            ++fetch;
        }
        pipe.wait_prior<PREFETCH_COUNT - 1>();  // block until the 'compute' tile has landed
        block.sync();                            // all threads see the completed tile

        // Inner product over this tile: row of s_a against column of s_b.
        for (uint64_t k = 0; k < blockDim.x; k++)
        {
            tmp += ALPHA * s_a[(compute % PREFETCH_COUNT) * mem_size + (threadIdx.y * blockDim.x + k)] * s_b[(compute % PREFETCH_COUNT) * mem_size + (k * blockDim.x + threadIdx.x)];
        }
        block.sync();  // keep the stage intact until everyone has read it
    }

    c[row * NJ + col] = tmp;
}
|
| 192 |
+
|
| 193 |
+
/*
 * Copy A, B, C to the device, launch gemm_kernel, and copy C back.
 * Fix: the kernel maps threadIdx/blockIdx .x to columns (col < NJ) and .y to
 * rows (row < NI), so grid.x must cover NJ and grid.y must cover NI. The
 * original computed grid.x from NI and grid.y from NJ — transposed, and wrong
 * whenever NI != NJ. Also removed the unused t_start/t_end locals.
 */
void gemmCuda(DATA_TYPE *A, DATA_TYPE *B, DATA_TYPE *C, DATA_TYPE *A_gpu, DATA_TYPE *B_gpu, DATA_TYPE *C_gpu)
{
    dim3 block(DIM_THREAD_BLOCK_X, DIM_THREAD_BLOCK_Y);
    // grid.x spans the NJ columns, grid.y spans the NI rows (ceil-div).
    dim3 grid((size_t)(ceil( ((float)NJ) / ((float)block.x) )),
              (size_t)(ceil( ((float)NI) / ((float)block.y) )));

    cudaMemcpy(A_gpu, A, sizeof(DATA_TYPE) * NI * NK, cudaMemcpyHostToDevice);
    cudaMemcpy(B_gpu, B, sizeof(DATA_TYPE) * NK * NJ, cudaMemcpyHostToDevice);
    cudaMemcpy(C_gpu, C, sizeof(DATA_TYPE) * NI * NJ, cudaMemcpyHostToDevice);
    gemm_kernel<<< grid, block >>>(A_gpu, B_gpu, C_gpu, NI, NK, NJ);
    cudaDeviceSynchronize();
    cudaMemcpy(C, C_gpu, sizeof(DATA_TYPE) * NI * NJ, cudaMemcpyDeviceToHost);
}
|
| 211 |
+
|
| 212 |
+
/*
 * Entry point: reads optional NI NK NJ from argv, runs the GPU GEMM once,
 * and brackets the measured region with CUPTI tracing (initTrace/finiTrace)
 * and CPU timestamps (startCPU/endCPU). The CPU reference run and result
 * comparison are present but commented out.
 */
int main(int argc, char *argv[])
{
    // Raw timestamp counters (from cpu_timestamps.h), printed so traces can
    // be aligned with wall-clock later.
    uint64_t start_tsc = rdtsc();
    uint64_t start_tsp = rdtsp();
    printf("start_tsc %lu start_tsp %lu\n", start_tsc, start_tsp);
    // Problem sizes: A is NI x NK, B is NK x NJ, C is NI x NJ; default SIZE.
    if (argc >= 4) {
        NI = atoll(argv[1]);
        NK = atoll(argv[2]);
        NJ = atoll(argv[3]);
    } else {
        NI = SIZE;
        NK = SIZE;
        NJ = SIZE;
    }

    double t_start, t_end;  // only used by the commented-out CPU timing below

    DATA_TYPE* A;
    DATA_TYPE* B;
    DATA_TYPE* C;
    DATA_TYPE *C_ref;

    DATA_TYPE *A_gpu;
    DATA_TYPE *B_gpu;
    DATA_TYPE *C_gpu;

    // Host buffers. NOTE(review): malloc results are unchecked — at the
    // larger run sizes (e.g. 32768^2 floats = 4 GiB per matrix) these can
    // return NULL; confirm intended behavior.
    A = (DATA_TYPE*)malloc(NI*NK*sizeof(DATA_TYPE));
    B = (DATA_TYPE*)malloc(NK*NJ*sizeof(DATA_TYPE));
    C = (DATA_TYPE*)malloc(NI*NJ*sizeof(DATA_TYPE));
    C_ref = (DATA_TYPE *)malloc(NI * NJ * sizeof(DATA_TYPE));

    //cudaMallocManaged(&A_gpu, sizeof(DATA_TYPE) * NI * NK);
    //cudaMallocManaged(&B_gpu, sizeof(DATA_TYPE) * NK * NJ);
    //cudaMallocManaged(&C_gpu, sizeof(DATA_TYPE) * NI * NJ);
    init(A, B, C, C_ref);

    GPU_argv_init();

    // Measured region: device allocation, transfers, kernel, and frees all
    // fall between initTrace/startCPU and endCPU/finiTrace.
    initTrace();
    startCPU();

    cudaMalloc(&A_gpu, sizeof(DATA_TYPE) * NI * NK);
    cudaMalloc(&B_gpu, sizeof(DATA_TYPE) * NK * NJ);
    cudaMalloc(&C_gpu, sizeof(DATA_TYPE) * NI * NJ);

    gemmCuda(A, B, C, A_gpu, B_gpu, C_gpu);

    cudaFree(A_gpu);
    cudaFree(B_gpu);
    cudaFree(C_gpu);
    endCPU();
    finiTrace();

    // t_start = rtclock();
    // gemm(A, B, C_ref);
    // t_end = rtclock();
    // fprintf(stdout, "CPU Runtime: %0.6lfs\n", t_end - t_start);

    // compareResults(C, C_ref);
    free(A);
    free(B);
    free(C);
    free(C_ref);
    return 0;
}
|
| 277 |
+
|
workloads/micro/async/gemm/run.sh
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#./gemm 1024 1024 1024
|
| 2 |
+
./gemm 32768 32768 32768
|
| 3 |
+
#./gemm 512 512 512
|
workloads/micro/async/gemm/run_large.sh
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
./gemm 8192 8192 8192
|
workloads/micro/async/gemm/run_medium.sh
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
./gemm 4096 4096 4096
|
workloads/micro/async/gemm/run_mega.sh
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
./gemm 32768 32768 32768
|
workloads/micro/async/gemm/run_small.sh
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
./gemm 1024 1024 1024
|
workloads/micro/async/gemm/run_super.sh
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
./gemm 16384 16384 16384
|
workloads/micro/async/gemm/run_tiny.sh
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
./gemm 512 512 512
|
workloads/micro/async/gemm_perf/Makefile
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
include ../../../common/make.config
|
| 2 |
+
|
| 3 |
+
NVCCCFLAGS = -I$(CUPTI_INCLUDE) -L$(CUPTI_LIB_DIR) -std=c++11 -lcuda -lcupti -arch=sm_80 -O3
|
| 4 |
+
NVCC = $(CUDA_DIR)/bin/nvcc
|
| 5 |
+
|
| 6 |
+
EXECUTABLE := gemm
|
| 7 |
+
CUFILES := gemm.cu $(CUPTI_ADD_COMMON)/cupti_add.cpp $(CUPTI_ADD_COMMON)/cpu_timestamps.cpp
|
| 8 |
+
|
| 9 |
+
all:
|
| 10 |
+
$(NVCC) ${NVCCCFLAGS} ${CUFILES} ${DEF} -o ${EXECUTABLE}
|
| 11 |
+
clean:
|
| 12 |
+
rm -f *.o gemm
|
workloads/micro/async/gemm_perf/gemm
ADDED
|
Binary file (801 kB). View file
|
|
|
workloads/micro/async/gemm_perf/gemm.cu
ADDED
|
@@ -0,0 +1,277 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* gemm.cu: This file is part of the PolyBench/GPU 1.0 test suite.
|
| 3 |
+
*
|
| 4 |
+
*
|
| 5 |
+
* Contact: Scott Grauer-Gray <sgrauerg@gmail.com>
|
| 6 |
+
* Louis-Noel Pouchet <pouchet@cse.ohio-state.edu>
|
| 7 |
+
* Web address: http://www.cse.ohio-state.edu/~pouchet/software/polybench/GPU
|
| 8 |
+
*/
|
| 9 |
+
|
| 10 |
+
#include <unistd.h>
|
| 11 |
+
#include <stdio.h>
|
| 12 |
+
#include <time.h>
|
| 13 |
+
#include <sys/time.h>
|
| 14 |
+
#include <stdlib.h>
|
| 15 |
+
#include <stdarg.h>
|
| 16 |
+
#include <string.h>
|
| 17 |
+
#include <cuda.h>
|
| 18 |
+
#include "../../../common/cupti_add.h"
|
| 19 |
+
#include "../../../common/cpu_timestamps.h"
|
| 20 |
+
|
| 21 |
+
#include <cooperative_groups.h>
|
| 22 |
+
#include <cooperative_groups/memcpy_async.h>
|
| 23 |
+
|
| 24 |
+
using namespace nvcuda::experimental;
|
| 25 |
+
|
| 26 |
+
#define PREFETCH_COUNT 2
|
| 27 |
+
|
| 28 |
+
#define SMALL_FLOAT_VAL 0.00000001f
|
| 29 |
+
|
| 30 |
+
/*
 * Wall-clock time in seconds with microsecond resolution.
 * Fix: gettimeofday() returns int; the original stored it in a uint64_t and
 * printed it with "%d" (format/type mismatch; -1 wraps to a huge value).
 */
double rtclock()
{
    struct timezone Tzp;
    struct timeval Tp;
    int stat;
    stat = gettimeofday(&Tp, &Tzp);
    if (stat != 0)
        printf("Error return from gettimeofday: %d", stat);
    return (Tp.tv_sec + Tp.tv_usec * 1.0e-6);
}

/* Absolute value of a single-precision float. */
float absVal(float a)
{
    if (a < 0)
    {
        return (a * -1);
    }
    else
    {
        return a;
    }
}

/*
 * Relative difference between val1 and val2, in percent.
 * Two values both below 0.01 in magnitude are treated as equal;
 * SMALL_FLOAT_VAL keeps the denominator nonzero when val1 == 0.
 */
float percentDiff(double val1, double val2)
{
    if ((absVal(val1) < 0.01) && (absVal(val2) < 0.01))
    {
        return 0.0f;
    }
    else
    {
        return 100.0f * (absVal(absVal(val1 - val2) / absVal(val1 + SMALL_FLOAT_VAL)));
    }
}
|
| 65 |
+
|
| 66 |
+
//define the error threshold for the results "not matching"
|
| 67 |
+
#define PERCENT_DIFF_ERROR_THRESHOLD 0.05
|
| 68 |
+
|
| 69 |
+
/* Problem size */
|
| 70 |
+
#define SIZE 4096
|
| 71 |
+
uint64_t NI;
|
| 72 |
+
uint64_t NJ;
|
| 73 |
+
uint64_t NK;
|
| 74 |
+
|
| 75 |
+
/* Thread block dimensions */
|
| 76 |
+
#define DIM_THREAD_BLOCK_X 32
|
| 77 |
+
#define DIM_THREAD_BLOCK_Y 32
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
/* Declared constant values for ALPHA and BETA (same as values in PolyBench 2.0) */
|
| 81 |
+
#define ALPHA 1.1f
|
| 82 |
+
#define BETA 1.1f
|
| 83 |
+
|
| 84 |
+
/* Can switch DATA_TYPE between float and double */
|
| 85 |
+
typedef float DATA_TYPE;
|
| 86 |
+
// typedef uint64_t DATA_TYPE;
|
| 87 |
+
|
| 88 |
+
/*
 * Reference CPU GEMM: C = ALPHA * A * B + BETA * C.
 * Sizes come from the file-level globals: A is NI x NK, B is NK x NJ,
 * C is NI x NJ, all row-major.
 */
void gemm(DATA_TYPE *A, DATA_TYPE *B, DATA_TYPE *C)
{
    for (uint64_t i = 0; i < NI; i++)
    {
        for (uint64_t j = 0; j < NJ; j++)
        {
            /* Same arithmetic order as the in-place version: scale, then
               accumulate over k, so float rounding is identical. */
            DATA_TYPE acc = C[i*NJ + j] * BETA;
            for (uint64_t k = 0; k < NK; ++k)
            {
                acc += ALPHA * A[i*NK + k] * B[k*NJ + j];
            }
            C[i*NJ + j] = acc;
        }
    }
}

/*
 * Deterministic initialization: fills A and B with index-derived patterns
 * and gives C and C_ref identical contents for the later comparison.
 */
void init(DATA_TYPE *A, DATA_TYPE *B, DATA_TYPE *C, DATA_TYPE *C_ref)
{
    for (uint64_t i = 0; i < NI; i++)
        for (uint64_t j = 0; j < NK; j++)
            A[i*NK + j] = ((DATA_TYPE) i*j) / NI;

    for (uint64_t i = 0; i < NK; i++)
        for (uint64_t j = 0; j < NJ; j++)
            B[i*NJ + j] = ((DATA_TYPE) i*j + 1) / NJ;

    for (uint64_t i = 0; i < NI; i++)
    {
        for (uint64_t j = 0; j < NJ; j++)
        {
            DATA_TYPE v = ((DATA_TYPE)i * j + 2) / NJ;
            C[i * NJ + j] = v;
            C_ref[i * NJ + j] = v;
        }
    }
}
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
/*
 * Count elementwise mismatches between C and C_outputFromGpu that exceed
 * PERCENT_DIFF_ERROR_THRESHOLD, printing each offender and a final summary.
 * Fix: indices and the fail counter are uint64_t, but the original printed
 * them with "%d" — undefined behavior per the C standard. Use "%lu".
 */
void compareResults(DATA_TYPE* C, DATA_TYPE* C_outputFromGpu)
{
    uint64_t i, j, fail;
    fail = 0;

    // Compare C1 and C2
    for (i = 0; i < NI; i++)
    {
        for (j = 0; j < NJ; j++)
        {
            if (percentDiff(C[i*NJ + j], C_outputFromGpu[i*NJ + j]) > PERCENT_DIFF_ERROR_THRESHOLD)
            {
                printf("%lu, %lu, GPU is %f, CPU is %f.\n", i, j, C[i*NJ + j], C_outputFromGpu[i*NJ + j]);
                fail++;
            }
        }
    }

    // Print results
    printf("Non-Matching CPU-GPU Outputs Beyond Error Threshold of %4.2f Percent: %lu\n", PERCENT_DIFF_ERROR_THRESHOLD, fail);
}
|
| 146 |
+
|
| 147 |
+
// Tiled GEMM kernel: C = ALPHA * A * B + BETA * C, one output element per
// thread at (row, col). Tiles of A and B are staged into shared memory with
// the experimental nvcuda pipeline (memcpy_async), double-buffered across
// PREFETCH_COUNT stages so the next tile's loads overlap this tile's math.
// NOTE(review): there are no bounds checks — this assumes NI and NJ are
// multiples of the block dimensions and NK is a multiple of blockDim.x;
// confirm against the launch configuration in gemmCuda.
__global__ void gemm_kernel(DATA_TYPE *a, DATA_TYPE *b, DATA_TYPE *c, uint64_t NI, uint64_t NK, uint64_t NJ)
{
    cooperative_groups::thread_block block = cooperative_groups::this_thread_block();
    pipeline pipe;  // per-thread pipeline object for the async global->shared copies

    uint64_t row = blockIdx.y * blockDim.y + threadIdx.y;
    uint64_t col = blockIdx.x * blockDim.x + threadIdx.x;

    // Double-buffered shared staging: PREFETCH_COUNT tiles of 32x32 elements each.
    __shared__ DATA_TYPE s_a[DIM_THREAD_BLOCK_X * DIM_THREAD_BLOCK_Y * PREFETCH_COUNT];
    __shared__ DATA_TYPE s_b[DIM_THREAD_BLOCK_X * DIM_THREAD_BLOCK_Y * PREFETCH_COUNT];

    // Start the accumulator from the scaled existing C value.
    DATA_TYPE tmp = BETA * c[row * NJ + col];

    uint64_t base_tiles = 0;
    uint64_t end_tile = base_tiles + NK / blockDim.x;  // number of K-dimension tiles

    uint64_t fetch = base_tiles;                       // index of the next tile to prefetch
    uint64_t tile_size = DIM_THREAD_BLOCK_X;
    uint64_t mem_size = DIM_THREAD_BLOCK_X * DIM_THREAD_BLOCK_Y;  // elements per stage

    for (uint64_t compute = fetch; compute < end_tile; compute++)
    {
        // Keep up to PREFETCH_COUNT tiles in flight ahead of the compute stage.
        for (; fetch < end_tile && fetch < compute + PREFETCH_COUNT; fetch++)
        {
            // Each thread copies one element of the A tile and one of the B tile
            // into the (fetch % PREFETCH_COUNT) stage of the shared buffers.
            memcpy_async(s_a[(fetch % PREFETCH_COUNT) * mem_size + (threadIdx.y * blockDim.x + threadIdx.x)], a[row * NK + fetch * tile_size + threadIdx.x], pipe);
            memcpy_async(s_b[(fetch % PREFETCH_COUNT) * mem_size + (threadIdx.y * blockDim.x + threadIdx.x)], b[(fetch * tile_size + threadIdx.y) * NJ + col], pipe);

            pipe.commit();  // close this stage's batch of copies
        }
        // After the last real tile is issued, pad with empty commits once so
        // wait_prior<PREFETCH_COUNT-1> still drains the remaining stages.
        if (fetch == end_tile) {
            for (uint64_t i = 0; i < PREFETCH_COUNT-1; ++i) { pipe.commit(); }
            ++fetch;
        }
        pipe.wait_prior<PREFETCH_COUNT - 1>();  // block until the 'compute' tile has landed
        block.sync();                            // all threads see the completed tile

        // Inner product over this tile: row of s_a against column of s_b.
        for (uint64_t k = 0; k < blockDim.x; k++)
        {
            tmp += ALPHA * s_a[(compute % PREFETCH_COUNT) * mem_size + (threadIdx.y * blockDim.x + k)] * s_b[(compute % PREFETCH_COUNT) * mem_size + (k * blockDim.x + threadIdx.x)];
        }
        block.sync();  // keep the stage intact until everyone has read it
    }

    c[row * NJ + col] = tmp;
}
|
| 192 |
+
|
| 193 |
+
/*
 * Copy A, B, C to the device, launch gemm_kernel, and copy C back.
 * Fix: the kernel maps .x to columns (col < NJ) and .y to rows (row < NI),
 * so grid.x must cover NJ and grid.y must cover NI. The original computed
 * grid.x from NI and grid.y from NJ — transposed, and wrong whenever
 * NI != NJ. Also removed the unused t_start/t_end locals.
 */
void gemmCuda(DATA_TYPE *A, DATA_TYPE *B, DATA_TYPE *C, DATA_TYPE *A_gpu, DATA_TYPE *B_gpu, DATA_TYPE *C_gpu)
{
    dim3 block(DIM_THREAD_BLOCK_X, DIM_THREAD_BLOCK_Y);
    // grid.x spans the NJ columns, grid.y spans the NI rows (ceil-div).
    dim3 grid((size_t)(ceil( ((float)NJ) / ((float)block.x) )),
              (size_t)(ceil( ((float)NI) / ((float)block.y) )));

    cudaMemcpy(A_gpu, A, sizeof(DATA_TYPE) * NI * NK, cudaMemcpyHostToDevice);
    cudaMemcpy(B_gpu, B, sizeof(DATA_TYPE) * NK * NJ, cudaMemcpyHostToDevice);
    cudaMemcpy(C_gpu, C, sizeof(DATA_TYPE) * NI * NJ, cudaMemcpyHostToDevice);
    gemm_kernel<<< grid, block >>>(A_gpu, B_gpu, C_gpu, NI, NK, NJ);
    cudaDeviceSynchronize();
    cudaMemcpy(C, C_gpu, sizeof(DATA_TYPE) * NI * NJ, cudaMemcpyDeviceToHost);
}
|
| 211 |
+
|
| 212 |
+
/*
 * Entry point (perf variant): reads optional NI NK NJ from argv and runs the
 * GPU GEMM once between startCPU/endCPU timestamps. Unlike the traced build,
 * CUPTI tracing (initTrace/finiTrace) is commented out here. The CPU
 * reference run and result comparison are also commented out.
 */
int main(int argc, char *argv[])
{
    // Raw timestamp counters (from cpu_timestamps.h), printed so runs can
    // be aligned with wall-clock later.
    uint64_t start_tsc = rdtsc();
    uint64_t start_tsp = rdtsp();
    printf("start_tsc %lu start_tsp %lu\n", start_tsc, start_tsp);
    // Problem sizes: A is NI x NK, B is NK x NJ, C is NI x NJ; default SIZE.
    if (argc >= 4) {
        NI = atoll(argv[1]);
        NK = atoll(argv[2]);
        NJ = atoll(argv[3]);
    } else {
        NI = SIZE;
        NK = SIZE;
        NJ = SIZE;
    }

    double t_start, t_end;  // only used by the commented-out CPU timing below

    DATA_TYPE* A;
    DATA_TYPE* B;
    DATA_TYPE* C;
    DATA_TYPE *C_ref;

    DATA_TYPE *A_gpu;
    DATA_TYPE *B_gpu;
    DATA_TYPE *C_gpu;

    // Host buffers. NOTE(review): malloc results are unchecked — at the
    // larger run sizes these can return NULL; confirm intended behavior.
    A = (DATA_TYPE*)malloc(NI*NK*sizeof(DATA_TYPE));
    B = (DATA_TYPE*)malloc(NK*NJ*sizeof(DATA_TYPE));
    C = (DATA_TYPE*)malloc(NI*NJ*sizeof(DATA_TYPE));
    C_ref = (DATA_TYPE *)malloc(NI * NJ * sizeof(DATA_TYPE));

    //cudaMallocManaged(&A_gpu, sizeof(DATA_TYPE) * NI * NK);
    //cudaMallocManaged(&B_gpu, sizeof(DATA_TYPE) * NK * NJ);
    //cudaMallocManaged(&C_gpu, sizeof(DATA_TYPE) * NI * NJ);
    init(A, B, C, C_ref);

    GPU_argv_init();

    // Measured region: device allocation, transfers, kernel, and frees.
    //initTrace();
    startCPU();

    cudaMalloc(&A_gpu, sizeof(DATA_TYPE) * NI * NK);
    cudaMalloc(&B_gpu, sizeof(DATA_TYPE) * NK * NJ);
    cudaMalloc(&C_gpu, sizeof(DATA_TYPE) * NI * NJ);

    gemmCuda(A, B, C, A_gpu, B_gpu, C_gpu);

    cudaFree(A_gpu);
    cudaFree(B_gpu);
    cudaFree(C_gpu);
    endCPU();
    //finiTrace();

    // t_start = rtclock();
    // gemm(A, B, C_ref);
    // t_end = rtclock();
    // fprintf(stdout, "CPU Runtime: %0.6lfs\n", t_end - t_start);

    // compareResults(C, C_ref);
    free(A);
    free(B);
    free(C);
    free(C_ref);
    return 0;
}
|
| 277 |
+
|
workloads/micro/async/gemm_perf/run.sh
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#./gemm 1024 1024 1024
|
| 2 |
+
./gemm 32768 32768 32768
|
| 3 |
+
#./gemm 512 512 512
|
workloads/micro/async/gemm_perf/run_large.sh
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
./gemm 8192 8192 8192
|
workloads/micro/async/gemm_perf/run_medium.sh
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
./gemm 4096 4096 4096
|
workloads/micro/async/gemm_perf/run_mega.sh
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
./gemm 32768 32768 32768
|
workloads/micro/async/gemm_perf/run_small.sh
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
./gemm 1024 1024 1024
|
workloads/micro/async/gemm_perf/run_super.sh
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
./gemm 16384 16384 16384
|
workloads/micro/async/gemm_perf/run_tiny.sh
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
./gemm 512 512 512
|
workloads/micro/async/gemv/Makefile
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
include ../../../common/make.config
|
| 2 |
+
|
| 3 |
+
NVCCCFLAGS = -I$(CUPTI_INCLUDE) -L$(CUPTI_LIB_DIR) -std=c++11 -lcuda -lcupti -arch=sm_80 -O3
|
| 4 |
+
NVCC = $(CUDA_DIR)/bin/nvcc
|
| 5 |
+
|
| 6 |
+
EXECUTABLE := gemv
|
| 7 |
+
CUFILES := gemv.cu $(CUPTI_ADD_COMMON)/cupti_add.cpp $(CUPTI_ADD_COMMON)/cpu_timestamps.cpp
|
| 8 |
+
|
| 9 |
+
all:
|
| 10 |
+
$(NVCC) ${NVCCCFLAGS} ${CUFILES} ${DEF} -o ${EXECUTABLE}
|
| 11 |
+
clean:
|
| 12 |
+
rm -f *.o gemv
|
workloads/micro/async/gemv/gemv.cu
ADDED
|
@@ -0,0 +1,269 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* gemm.cu: This file is part of the PolyBench/GPU 1.0 test suite.
|
| 3 |
+
*
|
| 4 |
+
*
|
| 5 |
+
* Contact: Scott Grauer-Gray <sgrauerg@gmail.com>
|
| 6 |
+
* Louis-Noel Pouchet <pouchet@cse.ohio-state.edu>
|
| 7 |
+
* Web address: http://www.cse.ohio-state.edu/~pouchet/software/polybench/GPU
|
| 8 |
+
*/
|
| 9 |
+
|
| 10 |
+
#include <unistd.h>
|
| 11 |
+
#include <stdio.h>
|
| 12 |
+
#include <time.h>
|
| 13 |
+
#include <sys/time.h>
|
| 14 |
+
#include <stdlib.h>
|
| 15 |
+
#include <stdarg.h>
|
| 16 |
+
#include <string.h>
|
| 17 |
+
#include <cuda.h>
|
| 18 |
+
#include "../../../common/cupti_add.h"
|
| 19 |
+
#include "../../../common/cpu_timestamps.h"
|
| 20 |
+
|
| 21 |
+
#include <cooperative_groups.h>
|
| 22 |
+
#include <cooperative_groups/memcpy_async.h>
|
| 23 |
+
|
| 24 |
+
using namespace nvcuda::experimental;
|
| 25 |
+
|
| 26 |
+
#define PREFETCH_COUNT 2
|
| 27 |
+
|
| 28 |
+
#define SMALL_FLOAT_VAL 0.00000001f
|
| 29 |
+
|
| 30 |
+
/*
 * Wall-clock time in seconds with microsecond resolution.
 * Fix: gettimeofday() returns int; the original stored it in a uint64_t and
 * printed it with "%d" (format/type mismatch; -1 wraps to a huge value).
 */
double rtclock()
{
    struct timezone Tzp;
    struct timeval Tp;
    int stat;
    stat = gettimeofday(&Tp, &Tzp);
    if (stat != 0)
        printf("Error return from gettimeofday: %d", stat);
    return (Tp.tv_sec + Tp.tv_usec * 1.0e-6);
}

/* Absolute value of a single-precision float. */
float absVal(float a)
{
    if (a < 0)
    {
        return (a * -1);
    }
    else
    {
        return a;
    }
}

/*
 * Relative difference between val1 and val2, in percent.
 * Two values both below 0.01 in magnitude are treated as equal;
 * SMALL_FLOAT_VAL keeps the denominator nonzero when val1 == 0.
 */
float percentDiff(double val1, double val2)
{
    if ((absVal(val1) < 0.01) && (absVal(val2) < 0.01))
    {
        return 0.0f;
    }
    else
    {
        return 100.0f * (absVal(absVal(val1 - val2) / absVal(val1 + SMALL_FLOAT_VAL)));
    }
}
|
| 65 |
+
|
| 66 |
+
// define the error threshold for the results "not matching"
|
| 67 |
+
#define PERCENT_DIFF_ERROR_THRESHOLD 0.05
|
| 68 |
+
|
| 69 |
+
/* Problem size */
|
| 70 |
+
#define SIZE 40960
|
| 71 |
+
uint64_t NI;
|
| 72 |
+
uint64_t NJ;
|
| 73 |
+
|
| 74 |
+
/* Thread block dimensions */
|
| 75 |
+
#define DIM_THREAD_BLOCK 256
|
| 76 |
+
|
| 77 |
+
#define BATCH_SIZE 16
|
| 78 |
+
|
| 79 |
+
/* Declared constant values for ALPHA and BETA (same as values in PolyBench 2.0) */
|
| 80 |
+
#define ALPHA 1.1f
|
| 81 |
+
#define BETA 1.1f
|
| 82 |
+
|
| 83 |
+
/* Can switch DATA_TYPE between float and double */
|
| 84 |
+
typedef float DATA_TYPE;
|
| 85 |
+
// typedef uint64_t DATA_TYPE;
|
| 86 |
+
|
| 87 |
+
/*
 * Reference CPU GEMV: C = ALPHA * A * B + BETA * C, where A is NI x NJ
 * (row-major), B has NJ entries, and C has NI entries (sizes from globals).
 */
void gemv(DATA_TYPE *A, DATA_TYPE *B, DATA_TYPE *C)
{
    for (uint64_t i = 0; i < NI; i++)
    {
        /* Same arithmetic order as the in-place version: scale, then
           accumulate over j, so float rounding is identical. */
        DATA_TYPE acc = C[i] * BETA;
        for (uint64_t j = 0; j < NJ; j++)
        {
            acc += ALPHA * A[i * NJ + j] * B[j];
        }
        C[i] = acc;
    }
}

/*
 * Deterministic initialization: fills A and B with index-derived patterns
 * and gives C and C_ref identical contents for the later comparison.
 */
void init(DATA_TYPE *A, DATA_TYPE *B, DATA_TYPE *C, DATA_TYPE *C_ref)
{
    for (uint64_t i = 0; i < NI; i++)
        for (uint64_t j = 0; j < NJ; j++)
            A[i * NJ + j] = ((DATA_TYPE)i * j) / NI;

    for (uint64_t j = 0; j < NJ; j++)
        B[j] = ((DATA_TYPE)j + 1) / NJ;

    for (uint64_t i = 0; i < NI; i++)
    {
        DATA_TYPE v = ((DATA_TYPE)i + 2) / NI;
        C[i] = v;
        C_ref[i] = v;
    }
}
|
| 118 |
+
|
| 119 |
+
/*
 * Count elementwise mismatches between C and C_outputFromGpu that exceed
 * PERCENT_DIFF_ERROR_THRESHOLD, printing each offender and a final summary.
 * Fix: the index and the fail counter are uint64_t, but the original printed
 * them with "%d" — undefined behavior per the C standard. Use "%lu".
 */
void compareResults(DATA_TYPE *C, DATA_TYPE *C_outputFromGpu)
{
    uint64_t i, fail;
    fail = 0;

    // Compare C1 and C2
    for (i = 0; i < NI; i++)
    {
        if (percentDiff(C[i], C_outputFromGpu[i]) > PERCENT_DIFF_ERROR_THRESHOLD)
        {
            fail++;
            printf("%lu, GPU is %f, CPU is %f.\n", i, C[i], C_outputFromGpu[i]);
        }
    }

    // Print results
    printf("Non-Matching CPU-GPU Outputs Beyond Error Threshold of %4.2f Percent: %lu\n", PERCENT_DIFF_ERROR_THRESHOLD, fail);
}
|
| 137 |
+
|
| 138 |
+
// GEMV kernel: c = ALPHA * a * b + BETA * c, one output row per thread.
// Batches of BATCH_SIZE entries of the vector b are staged into shared
// memory via the experimental nvcuda pipeline (memcpy_async), double-buffered
// across PREFETCH_COUNT stages. NOTE(review): each thread stages its own
// private copy of the same b batch (s_b[..][tx][..]) — redundant across the
// block; confirm whether that duplication is intentional. No bounds checks:
// assumes NJ is a multiple of BATCH_SIZE and the grid exactly covers NI rows.
__global__ void gemv_kernel(DATA_TYPE *a, DATA_TYPE *b, DATA_TYPE *c, uint64_t NI, uint64_t NJ)
{
    cooperative_groups::thread_block block = cooperative_groups::this_thread_block();
    pipeline pipe;  // per-thread pipeline for the async global->shared copies
    uint64_t row = blockIdx.x * blockDim.x + threadIdx.x;
    uint64_t tx = threadIdx.x;

    // Double-buffered staging of b: PREFETCH_COUNT stages, one BATCH_SIZE
    // slice per thread.
    __shared__ DATA_TYPE s_b[PREFETCH_COUNT][DIM_THREAD_BLOCK][BATCH_SIZE];

    // Start the accumulator from the scaled existing c value.
    DATA_TYPE tmp = BETA * c[row];
    __syncthreads();

    uint64_t fetch = 0;                    // index of the next batch to prefetch
    uint64_t end_tile = NJ / BATCH_SIZE;   // number of batches along b

    for (uint64_t compute = fetch; compute < end_tile; compute++)
    {
        // Keep up to PREFETCH_COUNT batches in flight ahead of the compute stage.
        for (; fetch < end_tile && fetch < compute + PREFETCH_COUNT; fetch++)
        {
            uint64_t base_index = fetch * BATCH_SIZE;
            for (uint64_t k = 0; k < BATCH_SIZE; k++)
            {
                memcpy_async(s_b[fetch % PREFETCH_COUNT][tx][k], b[base_index + k], pipe);
            }
            pipe.commit();  // close this stage's batch of copies
        }
        // After the last real batch is issued, pad with empty commits once so
        // wait_prior<PREFETCH_COUNT-1> still drains the remaining stages.
        if (fetch == end_tile)
        {
            for (uint64_t i = 0; i < PREFETCH_COUNT - 1; ++i)
            {
                pipe.commit();
            }
            ++fetch;
        }
        pipe.wait_prior<PREFETCH_COUNT - 1>();  // block until the 'compute' batch has landed
        block.sync();

        // Dot this row's a-slice against the staged b batch.
        uint64_t base_index = compute * BATCH_SIZE;
        for (uint64_t k = 0; k < BATCH_SIZE; k++)
        {
            tmp += ALPHA * a[row * NJ + base_index + k] * s_b[compute % PREFETCH_COUNT][tx][k];
        }
        block.sync();  // keep the stage intact until everyone has read it
    }
    c[row] = tmp;
}
|
| 184 |
+
|
| 185 |
+
/*
 * Copy A, B, C to the device, launch gemv_kernel, and copy C back.
 * Removed the unused t_start/t_end locals.
 * NOTE: grid uses integer division — rows beyond the last full block are
 * silently dropped, so NI must be a multiple of DIM_THREAD_BLOCK (rounding
 * up instead would make the unguarded kernel read/write out of range).
 */
void gemvCuda(DATA_TYPE *A, DATA_TYPE *B, DATA_TYPE *C, DATA_TYPE *A_gpu, DATA_TYPE *B_gpu, DATA_TYPE *C_gpu)
{
    dim3 block(DIM_THREAD_BLOCK);
    dim3 grid(NI / (DIM_THREAD_BLOCK));

    cudaMemcpy(A_gpu, A, sizeof(DATA_TYPE) * NI * NJ, cudaMemcpyHostToDevice);
    cudaMemcpy(B_gpu, B, sizeof(DATA_TYPE) * NJ, cudaMemcpyHostToDevice);
    cudaMemcpy(C_gpu, C, sizeof(DATA_TYPE) * NI, cudaMemcpyHostToDevice);
    gemv_kernel<<<grid, block>>>(A_gpu, B_gpu, C_gpu, NI, NJ);
    cudaDeviceSynchronize();
    cudaMemcpy(C, C_gpu, sizeof(DATA_TYPE) * NI, cudaMemcpyDeviceToHost);
}
|
| 203 |
+
|
| 204 |
+
// Entry point: parses optional "NI NJ" sizes from argv (defaulting to SIZE),
// allocates host and device buffers, runs the GPU gemv once inside the
// CPU-timestamp / CUPTI trace window, and releases all resources.
// Returns 0 on success, 1 on host allocation failure.
int main(int argc, char *argv[])
{
    // Emit raw cycle/timestamp counters first so traces can be aligned.
    uint64_t start_tsc = rdtsc();
    uint64_t start_tsp = rdtsp();
    printf("start_tsc %lu start_tsp %lu\n", start_tsc, start_tsp);

    // Problem size: "prog NI NJ" overrides the compiled-in SIZE default.
    if (argc >= 3)
    {
        NI = atoll(argv[1]);
        NJ = atoll(argv[2]);
    }
    else
    {
        NI = SIZE;
        NJ = SIZE;
    }

    DATA_TYPE *A = (DATA_TYPE *)malloc(NI * NJ * sizeof(DATA_TYPE)); // NI x NJ matrix
    DATA_TYPE *B = (DATA_TYPE *)malloc(NJ * sizeof(DATA_TYPE));      // input vector
    DATA_TYPE *C = (DATA_TYPE *)malloc(NI * sizeof(DATA_TYPE));      // output vector
    DATA_TYPE *C_ref = (DATA_TYPE *)malloc(NI * sizeof(DATA_TYPE));  // CPU reference (check disabled below)
    // Fail fast instead of letting init() dereference a NULL buffer.
    if (!A || !B || !C || !C_ref)
    {
        fprintf(stderr, "host allocation failed (NI=%lu NJ=%lu)\n",
                (unsigned long)NI, (unsigned long)NJ);
        free(A);
        free(B);
        free(C);
        free(C_ref);
        return 1;
    }

    DATA_TYPE *A_gpu;
    DATA_TYPE *B_gpu;
    DATA_TYPE *C_gpu;

    init(A, B, C, C_ref);

    GPU_argv_init();
    initTrace(); // CUPTI activity tracing on
    startCPU();  // CPU-side timestamp capture on

    cudaMalloc(&A_gpu, sizeof(DATA_TYPE) * NI * NJ);
    cudaMalloc(&B_gpu, sizeof(DATA_TYPE) * NJ);
    cudaMalloc(&C_gpu, sizeof(DATA_TYPE) * NI);

    gemvCuda(A, B, C, A_gpu, B_gpu, C_gpu);

    cudaFree(A_gpu);
    cudaFree(B_gpu);
    cudaFree(C_gpu);
    endCPU();
    finiTrace();

    // Optional CPU reference check, kept disabled to match the measured run:
    // gemv(A, B, C_ref);
    // compareResults(C, C_ref);

    free(A);
    free(B);
    free(C);
    free(C_ref);
    return 0;
}
|