|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include <cuda.h> |
|
|
#include <cuda_runtime.h> |
|
|
#include <cupti.h> |
|
|
#include <stdio.h> |
|
|
#include <stdlib.h> |
|
|
|
|
|
#define RUNTIME_API_CALL(apiFuncCall) \ |
|
|
do { \ |
|
|
cudaError_t _status = apiFuncCall |
|
|
if (_status != cudaSuccess) { \ |
|
|
fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \ |
|
|
__FILE__, __LINE__, #apiFuncCall, cudaGetErrorString(_status));\ |
|
|
exit(EXIT_FAILURE) |
|
|
} \ |
|
|
} while (0) |
|
|
|
|
|
#define CUPTI_CALL(call) \ |
|
|
do { \ |
|
|
CUptiResult _status = call |
|
|
if (_status != CUPTI_SUCCESS) { \ |
|
|
const char *errstr |
|
|
cuptiGetResultString(_status, &errstr) |
|
|
fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \ |
|
|
__FILE__, __LINE__, #call, errstr); \ |
|
|
exit(EXIT_FAILURE) |
|
|
} \ |
|
|
} while (0) |
|
|
|
|
|
static CUpti_SubscriberHandle g_subscriber |
|
|
|
|
|
#define BUF_SIZE (32 * 1024) |
|
|
#define ALIGN_SIZE (8) |
|
|
#define ALIGN_BUFFER(buffer, align) \ |
|
|
(((uintptr_t) (buffer) & ((align)-1)) ? ((buffer) + (align) - ((uintptr_t) (buffer) & ((align)-1))) : (buffer)) |
|
|
|
|
|
const int TILE_DIM = 32 |
|
|
const int BLOCK_ROWS = 8 |
|
|
|
|
|
__global__ |
|
|
void transpose(float *d_Outdata, const float *d_Indata) |
|
|
{ |
|
|
__shared__ float tile[TILE_DIM][TILE_DIM+1] |
|
|
|
|
|
int x = blockIdx.x * TILE_DIM + threadIdx.x |
|
|
int y = blockIdx.y * TILE_DIM + threadIdx.y |
|
|
int width = gridDim.x * TILE_DIM |
|
|
|
|
|
for (int j = 0 |
|
|
tile[threadIdx.y+j][threadIdx.x] = d_Indata[(y+j)*width + x] |
|
|
__syncthreads() |
|
|
|
|
|
x = blockIdx.y * TILE_DIM + threadIdx.x |
|
|
y = blockIdx.x * TILE_DIM + threadIdx.y |
|
|
|
|
|
for (int j = 0 |
|
|
d_Outdata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j] |
|
|
} |
|
|
|
|
|
static void |
|
|
printActivity(CUpti_Activity *record) |
|
|
{ |
|
|
switch (record->kind) { |
|
|
|
|
|
|
|
|
case CUPTI_ACTIVITY_KIND_SOURCE_LOCATOR: |
|
|
{ |
|
|
CUpti_ActivitySourceLocator *sourceLocator = (CUpti_ActivitySourceLocator *)record |
|
|
printf("SOURCE_LOCATOR SrcLctrId %d, File %s Line %d\n", sourceLocator->id, sourceLocator->fileName, sourceLocator->lineNumber) |
|
|
break |
|
|
} |
|
|
|
|
|
|
|
|
case CUPTI_ACTIVITY_KIND_INSTRUCTION_EXECUTION: |
|
|
{ |
|
|
CUpti_ActivityInstructionExecution *sourceRecord = (CUpti_ActivityInstructionExecution *)record |
|
|
printf("INSTRUCTION_EXECUTION srcLctr %u, corr %u, functionId %u, pc %x\n", |
|
|
sourceRecord->sourceLocatorId, sourceRecord->correlationId, sourceRecord->functionId, |
|
|
sourceRecord->pcOffset) |
|
|
|
|
|
printf("notPredOffthread_inst_executed %llu, thread_inst_executed %llu, inst_executed %u\n\n", |
|
|
(unsigned long long)sourceRecord->notPredOffThreadsExecuted, |
|
|
(unsigned long long)sourceRecord->threadsExecuted, sourceRecord->executed) |
|
|
break |
|
|
} |
|
|
|
|
|
case CUPTI_ACTIVITY_KIND_FUNCTION: |
|
|
{ |
|
|
CUpti_ActivityFunction *fResult = (CUpti_ActivityFunction *)record |
|
|
printf("FUCTION functionId %u, moduleId %u, name %s\n", |
|
|
fResult->id, |
|
|
fResult->moduleId, |
|
|
fResult->name) |
|
|
break |
|
|
} |
|
|
default: |
|
|
printf(" <unknown>\n") |
|
|
exit(EXIT_FAILURE) |
|
|
break |
|
|
} |
|
|
} |
|
|
|
|
|
static void CUPTIAPI |
|
|
bufferRequested(uint8_t **buffer, size_t *size, size_t *maxNumRecords) |
|
|
{ |
|
|
uint8_t *b |
|
|
|
|
|
*size = BUF_SIZE |
|
|
b = (uint8_t *)malloc(*size + ALIGN_SIZE) |
|
|
|
|
|
*buffer = ALIGN_BUFFER(b, ALIGN_SIZE) |
|
|
*maxNumRecords = 0 |
|
|
|
|
|
if (*buffer == NULL) { |
|
|
printf("Error: out of memory\n") |
|
|
exit(EXIT_FAILURE) |
|
|
} |
|
|
} |
|
|
|
|
|
static void CUPTIAPI |
|
|
bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, size_t size, size_t validSize) |
|
|
{ |
|
|
CUptiResult status |
|
|
CUpti_Activity *record = NULL |
|
|
|
|
|
do { |
|
|
status = cuptiActivityGetNextRecord(buffer, validSize, &record) |
|
|
if(status == CUPTI_SUCCESS) { |
|
|
printActivity(record) |
|
|
} |
|
|
else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) { |
|
|
break |
|
|
} |
|
|
else { |
|
|
CUPTI_CALL(status) |
|
|
} |
|
|
} while (1) |
|
|
|
|
|
size_t dropped |
|
|
CUPTI_CALL(cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped)) |
|
|
if (dropped != 0) { |
|
|
printf("Dropped %u activity records\n", (unsigned int)dropped) |
|
|
} |
|
|
|
|
|
free(buffer) |
|
|
} |
|
|
|
|
|
#define DUMP_CUBIN 0 |
|
|
|
|
|
void CUPTIAPI dumpCudaModule(CUpti_CallbackId cbid, void *resourceDescriptor) |
|
|
{ |
|
|
#if DUMP_CUBIN |
|
|
const char *pCubin |
|
|
size_t cubinSize |
|
|
|
|
|
|
|
|
CUpti_ModuleResourceData *moduleResourceData = (CUpti_ModuleResourceData *)resourceDescriptor |
|
|
#endif |
|
|
|
|
|
if (cbid == CUPTI_CBID_RESOURCE_MODULE_LOADED) { |
|
|
#if DUMP_CUBIN |
|
|
|
|
|
|
|
|
pCubin = moduleResourceData->pCubin |
|
|
cubinSize = moduleResourceData->cubinSize |
|
|
|
|
|
FILE *cubin |
|
|
cubin = fopen("sass_source_map.cubin", "wb") |
|
|
fwrite(pCubin, sizeof(uint8_t), cubinSize, cubin) |
|
|
fclose(cubin) |
|
|
#endif |
|
|
} |
|
|
else if (cbid == CUPTI_CBID_RESOURCE_MODULE_UNLOAD_STARTING) { |
|
|
|
|
|
} |
|
|
} |
|
|
|
|
|
static void |
|
|
handleResource(CUpti_CallbackId cbid, const CUpti_ResourceData *resourceData) |
|
|
{ |
|
|
if (cbid == CUPTI_CBID_RESOURCE_MODULE_LOADED) { |
|
|
dumpCudaModule(cbid, resourceData->resourceDescriptor) |
|
|
} |
|
|
else if (cbid == CUPTI_CBID_RESOURCE_MODULE_UNLOAD_STARTING) { |
|
|
dumpCudaModule(cbid, resourceData->resourceDescriptor) |
|
|
} |
|
|
} |
|
|
|
|
|
static void CUPTIAPI |
|
|
traceCallback(void *userdata, CUpti_CallbackDomain domain, |
|
|
CUpti_CallbackId cbid, const void *cbdata) |
|
|
{ |
|
|
if (domain == CUPTI_CB_DOMAIN_RESOURCE) { |
|
|
handleResource(cbid, (CUpti_ResourceData *)cbdata) |
|
|
} |
|
|
} |
|
|
|
|
|
void |
|
|
initTrace() |
|
|
{ |
|
|
|
|
|
CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_INSTRUCTION_EXECUTION)) |
|
|
CUPTI_CALL(cuptiSubscribe(&g_subscriber, (CUpti_CallbackFunc)traceCallback, NULL)) |
|
|
CUPTI_CALL(cuptiEnableDomain(1, g_subscriber, CUPTI_CB_DOMAIN_RESOURCE)) |
|
|
CUPTI_CALL(cuptiActivityRegisterCallbacks(bufferRequested, bufferCompleted)) |
|
|
} |
|
|
|
|
|
void |
|
|
finiTrace() |
|
|
{ |
|
|
CUPTI_CALL(cuptiActivityFlushAll(0)) |
|
|
CUPTI_CALL(cuptiUnsubscribe(g_subscriber)) |
|
|
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_INSTRUCTION_EXECUTION)) |
|
|
} |
|
|
|
|
|
int |
|
|
main(int argc, char *argv[]) |
|
|
{ |
|
|
const int nx = 32 |
|
|
const int ny = 32 |
|
|
const int mem_size = nx*ny*sizeof(float) |
|
|
dim3 dimGrid(nx/TILE_DIM, ny/TILE_DIM, 1) |
|
|
dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1) |
|
|
cudaDeviceProp g_deviceProp |
|
|
|
|
|
initTrace() |
|
|
|
|
|
RUNTIME_API_CALL(cudaGetDeviceProperties(&g_deviceProp, 0)) |
|
|
printf("Device Name: %s\n", g_deviceProp.name) |
|
|
|
|
|
float *d_X, *d_Y |
|
|
|
|
|
float *h_X = (float*)malloc(mem_size) |
|
|
float *h_Y = (float*)malloc(mem_size) |
|
|
if (!(h_X && h_Y)) { |
|
|
printf("Malloc failed\n") |
|
|
exit(EXIT_FAILURE) |
|
|
} |
|
|
|
|
|
for (int j = 0 |
|
|
for (int i = 0 |
|
|
h_X[j*nx + i] = (float) (j*nx + i) |
|
|
} |
|
|
} |
|
|
RUNTIME_API_CALL(cudaMalloc(&d_X, mem_size)) |
|
|
RUNTIME_API_CALL(cudaMalloc(&d_Y, mem_size)) |
|
|
|
|
|
RUNTIME_API_CALL(cudaMemcpy(d_X, h_X, mem_size, cudaMemcpyHostToDevice)) |
|
|
|
|
|
transpose<<<dimGrid, dimBlock>>>(d_Y, d_X) |
|
|
|
|
|
RUNTIME_API_CALL(cudaMemcpy(h_Y, d_Y, mem_size, cudaMemcpyDeviceToHost)) |
|
|
|
|
|
free(h_X) |
|
|
free(h_Y) |
|
|
|
|
|
cudaFree(d_X) |
|
|
cudaFree(d_Y) |
|
|
|
|
|
cudaDeviceSynchronize() |
|
|
cudaDeviceReset() |
|
|
|
|
|
finiTrace() |
|
|
exit(EXIT_SUCCESS) |
|
|
} |
|
|
|
|
|
|