/* * Copyright 2014-2017 NVIDIA Corporation. All rights reserved * * Sample CUPTI app to print sass to source correlation */ #include #include #include #include #include #define RUNTIME_API_CALL(apiFuncCall) \ do { \ cudaError_t _status = apiFuncCall; \ if (_status != cudaSuccess) { \ fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \ __FILE__, __LINE__, #apiFuncCall, cudaGetErrorString(_status));\ exit(EXIT_FAILURE); \ } \ } while (0) #define CUPTI_CALL(call) \ do { \ CUptiResult _status = call; \ if (_status != CUPTI_SUCCESS) { \ const char *errstr; \ cuptiGetResultString(_status, &errstr); \ fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \ __FILE__, __LINE__, #call, errstr); \ exit(EXIT_FAILURE); \ } \ } while (0) static CUpti_SubscriberHandle g_subscriber; #define BUF_SIZE (32 * 1024) #define ALIGN_SIZE (8) #define ALIGN_BUFFER(buffer, align) \ (((uintptr_t) (buffer) & ((align)-1)) ? ((buffer) + (align) - ((uintptr_t) (buffer) & ((align)-1))) : (buffer)) const int TILE_DIM = 32; const int BLOCK_ROWS = 8; __global__ void transpose(float *d_Outdata, const float *d_Indata) { __shared__ float tile[TILE_DIM][TILE_DIM+1]; int x = blockIdx.x * TILE_DIM + threadIdx.x; int y = blockIdx.y * TILE_DIM + threadIdx.y; int width = gridDim.x * TILE_DIM; for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) tile[threadIdx.y+j][threadIdx.x] = d_Indata[(y+j)*width + x]; __syncthreads(); x = blockIdx.y * TILE_DIM + threadIdx.x; y = blockIdx.x * TILE_DIM + threadIdx.y; for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) d_Outdata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j]; } static void printActivity(CUpti_Activity *record) { switch (record->kind) { // The activity record for source locator contains the ID for the source path, // path for the file and the line number in the source. case CUPTI_ACTIVITY_KIND_SOURCE_LOCATOR: { CUpti_ActivitySourceLocator *sourceLocator = (CUpti_ActivitySourceLocator *)record; printf("SOURCE_LOCATOR SrcLctrId %d, File %s Line %d\n", sourceLocator->id, sourceLocator->fileName, sourceLocator->lineNumber); break; } // The activity record for instruction execution corresponds to a PC of the generated code, it contains the ID for source locator // the correlation ID of the kernel to which this record is associated, function ID and pc offset for the instruction. case CUPTI_ACTIVITY_KIND_INSTRUCTION_EXECUTION: { CUpti_ActivityInstructionExecution *sourceRecord = (CUpti_ActivityInstructionExecution *)record; printf("INSTRUCTION_EXECUTION srcLctr %u, corr %u, functionId %u, pc %x\n", sourceRecord->sourceLocatorId, sourceRecord->correlationId, sourceRecord->functionId, sourceRecord->pcOffset); // number of threads that executed this instruction and number of times the instruction was executed printf("notPredOffthread_inst_executed %llu, thread_inst_executed %llu, inst_executed %u\n\n", (unsigned long long)sourceRecord->notPredOffThreadsExecuted, (unsigned long long)sourceRecord->threadsExecuted, sourceRecord->executed); break; } // function name and corresponding module information case CUPTI_ACTIVITY_KIND_FUNCTION: { CUpti_ActivityFunction *fResult = (CUpti_ActivityFunction *)record; printf("FUCTION functionId %u, moduleId %u, name %s\n", fResult->id, fResult->moduleId, fResult->name); break; } default: printf(" \n"); exit(EXIT_FAILURE); break; } } static void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size, size_t *maxNumRecords) { uint8_t *b; *size = BUF_SIZE; b = (uint8_t *)malloc(*size + ALIGN_SIZE); *buffer = ALIGN_BUFFER(b, ALIGN_SIZE); *maxNumRecords = 0; if (*buffer == NULL) { printf("Error: out of memory\n"); exit(EXIT_FAILURE); } } static void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, size_t size, size_t validSize) { CUptiResult status; CUpti_Activity *record = NULL; do { status = cuptiActivityGetNextRecord(buffer, validSize, &record); if(status == CUPTI_SUCCESS) { printActivity(record); } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) { break; } else { CUPTI_CALL(status); } } while (1); size_t dropped; CUPTI_CALL(cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped)); if (dropped != 0) { printf("Dropped %u activity records\n", (unsigned int)dropped); } free(buffer); } #define DUMP_CUBIN 0 void CUPTIAPI dumpCudaModule(CUpti_CallbackId cbid, void *resourceDescriptor) { #if DUMP_CUBIN const char *pCubin; size_t cubinSize; // dump the cubin at MODULE_LOADED_STARTING CUpti_ModuleResourceData *moduleResourceData = (CUpti_ModuleResourceData *)resourceDescriptor; #endif if (cbid == CUPTI_CBID_RESOURCE_MODULE_LOADED) { #if DUMP_CUBIN // You can use nvdisasm to dump the SASS from the cubin. // Try nvdisasm -b -fun sass_to_source.cubin pCubin = moduleResourceData->pCubin; cubinSize = moduleResourceData->cubinSize; FILE *cubin; cubin = fopen("sass_source_map.cubin", "wb"); fwrite(pCubin, sizeof(uint8_t), cubinSize, cubin); fclose(cubin); #endif } else if (cbid == CUPTI_CBID_RESOURCE_MODULE_UNLOAD_STARTING) { // You can dump the cubin either at MODULE_LOADED or MODULE_UNLOAD_STARTING } } static void handleResource(CUpti_CallbackId cbid, const CUpti_ResourceData *resourceData) { if (cbid == CUPTI_CBID_RESOURCE_MODULE_LOADED) { dumpCudaModule(cbid, resourceData->resourceDescriptor); } else if (cbid == CUPTI_CBID_RESOURCE_MODULE_UNLOAD_STARTING) { dumpCudaModule(cbid, resourceData->resourceDescriptor); } } static void CUPTIAPI traceCallback(void *userdata, CUpti_CallbackDomain domain, CUpti_CallbackId cbid, const void *cbdata) { if (domain == CUPTI_CB_DOMAIN_RESOURCE) { handleResource(cbid, (CUpti_ResourceData *)cbdata); } } void initTrace() { // do cupti calls before any CUDA call CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_INSTRUCTION_EXECUTION)); CUPTI_CALL(cuptiSubscribe(&g_subscriber, (CUpti_CallbackFunc)traceCallback, NULL)); CUPTI_CALL(cuptiEnableDomain(1, g_subscriber, CUPTI_CB_DOMAIN_RESOURCE)); CUPTI_CALL(cuptiActivityRegisterCallbacks(bufferRequested, bufferCompleted)); } void finiTrace() { CUPTI_CALL(cuptiActivityFlushAll(0)); CUPTI_CALL(cuptiUnsubscribe(g_subscriber)); CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_INSTRUCTION_EXECUTION)); } int main(int argc, char *argv[]) { const int nx = 32; const int ny = 32; const int mem_size = nx*ny*sizeof(float); dim3 dimGrid(nx/TILE_DIM, ny/TILE_DIM, 1); dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1); cudaDeviceProp g_deviceProp; initTrace(); RUNTIME_API_CALL(cudaGetDeviceProperties(&g_deviceProp, 0)); printf("Device Name: %s\n", g_deviceProp.name); float *d_X, *d_Y; float *h_X = (float*)malloc(mem_size); float *h_Y = (float*)malloc(mem_size); if (!(h_X && h_Y)) { printf("Malloc failed\n"); exit(EXIT_FAILURE); } // initialization of host data for (int j = 0; j < ny; j++) { for (int i = 0; i < nx; i++) { h_X[j*nx + i] = (float) (j*nx + i); } } RUNTIME_API_CALL(cudaMalloc(&d_X, mem_size)); RUNTIME_API_CALL(cudaMalloc(&d_Y, mem_size)); RUNTIME_API_CALL(cudaMemcpy(d_X, h_X, mem_size, cudaMemcpyHostToDevice)); transpose<<>>(d_Y, d_X); RUNTIME_API_CALL(cudaMemcpy(h_Y, d_Y, mem_size, cudaMemcpyDeviceToHost)); free(h_X); free(h_Y); cudaFree(d_X); cudaFree(d_Y); cudaDeviceSynchronize(); cudaDeviceReset(); finiTrace(); exit(EXIT_SUCCESS); }