MNghia's picture
Add files using upload-large-folder tool
563c80f verified
/*
* Copyright 2014-2017 NVIDIA Corporation. All rights reserved
*
* Sample CUPTI app to print sass to source correlation
*/
#include <cuda.h>
#include <cuda_runtime.h>
#include <cupti.h>
#include <stdio.h>
#include <stdlib.h>
#define RUNTIME_API_CALL(apiFuncCall) \
do { \
cudaError_t _status = apiFuncCall; \
if (_status != cudaSuccess) { \
fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \
__FILE__, __LINE__, #apiFuncCall, cudaGetErrorString(_status));\
exit(EXIT_FAILURE); \
} \
} while (0)
#define CUPTI_CALL(call) \
do { \
CUptiResult _status = call; \
if (_status != CUPTI_SUCCESS) { \
const char *errstr; \
cuptiGetResultString(_status, &errstr); \
fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \
__FILE__, __LINE__, #call, errstr); \
exit(EXIT_FAILURE); \
} \
} while (0)
static CUpti_SubscriberHandle g_subscriber;
#define BUF_SIZE (32 * 1024)
#define ALIGN_SIZE (8)
#define ALIGN_BUFFER(buffer, align) \
(((uintptr_t) (buffer) & ((align)-1)) ? ((buffer) + (align) - ((uintptr_t) (buffer) & ((align)-1))) : (buffer))
const int TILE_DIM = 32;
const int BLOCK_ROWS = 8;
__global__
void transpose(float *d_Outdata, const float *d_Indata)
{
__shared__ float tile[TILE_DIM][TILE_DIM+1];
int x = blockIdx.x * TILE_DIM + threadIdx.x;
int y = blockIdx.y * TILE_DIM + threadIdx.y;
int width = gridDim.x * TILE_DIM;
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
tile[threadIdx.y+j][threadIdx.x] = d_Indata[(y+j)*width + x];
__syncthreads();
x = blockIdx.y * TILE_DIM + threadIdx.x;
y = blockIdx.x * TILE_DIM + threadIdx.y;
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
d_Outdata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j];
}
static void
printActivity(CUpti_Activity *record)
{
switch (record->kind) {
// The activity record for source locator contains the ID for the source path,
// path for the file and the line number in the source.
case CUPTI_ACTIVITY_KIND_SOURCE_LOCATOR:
{
CUpti_ActivitySourceLocator *sourceLocator = (CUpti_ActivitySourceLocator *)record;
printf("SOURCE_LOCATOR SrcLctrId %d, File %s Line %d\n", sourceLocator->id, sourceLocator->fileName, sourceLocator->lineNumber);
break;
}
// The activity record for instruction execution corresponds to a PC of the generated code, it contains the ID for source locator
// the correlation ID of the kernel to which this record is associated, function ID and pc offset for the instruction.
case CUPTI_ACTIVITY_KIND_INSTRUCTION_EXECUTION:
{
CUpti_ActivityInstructionExecution *sourceRecord = (CUpti_ActivityInstructionExecution *)record;
printf("INSTRUCTION_EXECUTION srcLctr %u, corr %u, functionId %u, pc %x\n",
sourceRecord->sourceLocatorId, sourceRecord->correlationId, sourceRecord->functionId,
sourceRecord->pcOffset);
// number of threads that executed this instruction and number of times the instruction was executed
printf("notPredOffthread_inst_executed %llu, thread_inst_executed %llu, inst_executed %u\n\n",
(unsigned long long)sourceRecord->notPredOffThreadsExecuted,
(unsigned long long)sourceRecord->threadsExecuted, sourceRecord->executed);
break;
}
// function name and corresponding module information
case CUPTI_ACTIVITY_KIND_FUNCTION:
{
CUpti_ActivityFunction *fResult = (CUpti_ActivityFunction *)record;
printf("FUCTION functionId %u, moduleId %u, name %s\n",
fResult->id,
fResult->moduleId,
fResult->name);
break;
}
default:
printf(" <unknown>\n");
exit(EXIT_FAILURE);
break;
}
}
static void CUPTIAPI
bufferRequested(uint8_t **buffer, size_t *size, size_t *maxNumRecords)
{
uint8_t *b;
*size = BUF_SIZE;
b = (uint8_t *)malloc(*size + ALIGN_SIZE);
*buffer = ALIGN_BUFFER(b, ALIGN_SIZE);
*maxNumRecords = 0;
if (*buffer == NULL) {
printf("Error: out of memory\n");
exit(EXIT_FAILURE);
}
}
static void CUPTIAPI
bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, size_t size, size_t validSize)
{
CUptiResult status;
CUpti_Activity *record = NULL;
do {
status = cuptiActivityGetNextRecord(buffer, validSize, &record);
if(status == CUPTI_SUCCESS) {
printActivity(record);
}
else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
break;
}
else {
CUPTI_CALL(status);
}
} while (1);
size_t dropped;
CUPTI_CALL(cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped));
if (dropped != 0) {
printf("Dropped %u activity records\n", (unsigned int)dropped);
}
free(buffer);
}
#define DUMP_CUBIN 0
void CUPTIAPI dumpCudaModule(CUpti_CallbackId cbid, void *resourceDescriptor)
{
#if DUMP_CUBIN
const char *pCubin;
size_t cubinSize;
// dump the cubin at MODULE_LOADED_STARTING
CUpti_ModuleResourceData *moduleResourceData = (CUpti_ModuleResourceData *)resourceDescriptor;
#endif
if (cbid == CUPTI_CBID_RESOURCE_MODULE_LOADED) {
#if DUMP_CUBIN
// You can use nvdisasm to dump the SASS from the cubin.
// Try nvdisasm -b -fun <function_id> sass_to_source.cubin
pCubin = moduleResourceData->pCubin;
cubinSize = moduleResourceData->cubinSize;
FILE *cubin;
cubin = fopen("sass_source_map.cubin", "wb");
fwrite(pCubin, sizeof(uint8_t), cubinSize, cubin);
fclose(cubin);
#endif
}
else if (cbid == CUPTI_CBID_RESOURCE_MODULE_UNLOAD_STARTING) {
// You can dump the cubin either at MODULE_LOADED or MODULE_UNLOAD_STARTING
}
}
static void
handleResource(CUpti_CallbackId cbid, const CUpti_ResourceData *resourceData)
{
if (cbid == CUPTI_CBID_RESOURCE_MODULE_LOADED) {
dumpCudaModule(cbid, resourceData->resourceDescriptor);
}
else if (cbid == CUPTI_CBID_RESOURCE_MODULE_UNLOAD_STARTING) {
dumpCudaModule(cbid, resourceData->resourceDescriptor);
}
}
static void CUPTIAPI
traceCallback(void *userdata, CUpti_CallbackDomain domain,
CUpti_CallbackId cbid, const void *cbdata)
{
if (domain == CUPTI_CB_DOMAIN_RESOURCE) {
handleResource(cbid, (CUpti_ResourceData *)cbdata);
}
}
void
initTrace()
{
// do cupti calls before any CUDA call
CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_INSTRUCTION_EXECUTION));
CUPTI_CALL(cuptiSubscribe(&g_subscriber, (CUpti_CallbackFunc)traceCallback, NULL));
CUPTI_CALL(cuptiEnableDomain(1, g_subscriber, CUPTI_CB_DOMAIN_RESOURCE));
CUPTI_CALL(cuptiActivityRegisterCallbacks(bufferRequested, bufferCompleted));
}
void
finiTrace()
{
CUPTI_CALL(cuptiActivityFlushAll(0));
CUPTI_CALL(cuptiUnsubscribe(g_subscriber));
CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_INSTRUCTION_EXECUTION));
}
int
main(int argc, char *argv[])
{
const int nx = 32;
const int ny = 32;
const int mem_size = nx*ny*sizeof(float);
dim3 dimGrid(nx/TILE_DIM, ny/TILE_DIM, 1);
dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1);
cudaDeviceProp g_deviceProp;
initTrace();
RUNTIME_API_CALL(cudaGetDeviceProperties(&g_deviceProp, 0));
printf("Device Name: %s\n", g_deviceProp.name);
float *d_X, *d_Y;
float *h_X = (float*)malloc(mem_size);
float *h_Y = (float*)malloc(mem_size);
if (!(h_X && h_Y)) {
printf("Malloc failed\n");
exit(EXIT_FAILURE);
}
// initialization of host data
for (int j = 0; j < ny; j++) {
for (int i = 0; i < nx; i++) {
h_X[j*nx + i] = (float) (j*nx + i);
}
}
RUNTIME_API_CALL(cudaMalloc(&d_X, mem_size));
RUNTIME_API_CALL(cudaMalloc(&d_Y, mem_size));
RUNTIME_API_CALL(cudaMemcpy(d_X, h_X, mem_size, cudaMemcpyHostToDevice));
transpose<<<dimGrid, dimBlock>>>(d_Y, d_X);
RUNTIME_API_CALL(cudaMemcpy(h_Y, d_Y, mem_size, cudaMemcpyDeviceToHost));
free(h_X);
free(h_Y);
cudaFree(d_X);
cudaFree(d_Y);
cudaDeviceSynchronize();
cudaDeviceReset();
finiTrace();
exit(EXIT_SUCCESS);
}