/* * Copyright 2011-2020 NVIDIA Corporation. All rights reserved * * Sample CUPTI app to print a trace of CUDA API and GPU activity * using asynchronous handling of activity buffers. * */ #include #include #include #include #ifndef EXIT_WAIVED #define EXIT_WAIVED 2 #endif #define CUPTI_CALL(call) \ do { \ CUptiResult _status = call; \ if (_status != CUPTI_SUCCESS) { \ const char *errstr; \ cuptiGetResultString(_status, &errstr); \ fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \ __FILE__, __LINE__, #call, errstr); \ exit(EXIT_FAILURE); \ } \ } while (0) #define BUF_SIZE (32 * 1024) #define ALIGN_SIZE (8) #define ALIGN_BUFFER(buffer, align) \ (((uintptr_t) (buffer) & ((align)-1)) ? ((buffer) + (align) - ((uintptr_t) (buffer) & ((align)-1))) : (buffer)) // Timestamp at trace initialization time. Used to normalized other // timestamps static uint64_t startTimestamp; static const char * getMemcpyKindString(CUpti_ActivityMemcpyKind kind) { switch (kind) { case CUPTI_ACTIVITY_MEMCPY_KIND_HTOD: return "HtoD"; case CUPTI_ACTIVITY_MEMCPY_KIND_DTOH: return "DtoH"; case CUPTI_ACTIVITY_MEMCPY_KIND_HTOA: return "HtoA"; case CUPTI_ACTIVITY_MEMCPY_KIND_ATOH: return "AtoH"; case CUPTI_ACTIVITY_MEMCPY_KIND_ATOA: return "AtoA"; case CUPTI_ACTIVITY_MEMCPY_KIND_ATOD: return "AtoD"; case CUPTI_ACTIVITY_MEMCPY_KIND_DTOA: return "DtoA"; case CUPTI_ACTIVITY_MEMCPY_KIND_DTOD: return "DtoD"; case CUPTI_ACTIVITY_MEMCPY_KIND_HTOH: return "HtoH"; default: break; } return ""; } const char * getActivityOverheadKindString(CUpti_ActivityOverheadKind kind) { switch (kind) { case CUPTI_ACTIVITY_OVERHEAD_DRIVER_COMPILER: return "COMPILER"; case CUPTI_ACTIVITY_OVERHEAD_CUPTI_BUFFER_FLUSH: return "BUFFER_FLUSH"; case CUPTI_ACTIVITY_OVERHEAD_CUPTI_INSTRUMENTATION: return "INSTRUMENTATION"; case CUPTI_ACTIVITY_OVERHEAD_CUPTI_RESOURCE: return "RESOURCE"; default: break; } return ""; } const char * getActivityObjectKindString(CUpti_ActivityObjectKind kind) { switch (kind) { case CUPTI_ACTIVITY_OBJECT_PROCESS: return "PROCESS"; case CUPTI_ACTIVITY_OBJECT_THREAD: return "THREAD"; case CUPTI_ACTIVITY_OBJECT_DEVICE: return "DEVICE"; case CUPTI_ACTIVITY_OBJECT_CONTEXT: return "CONTEXT"; case CUPTI_ACTIVITY_OBJECT_STREAM: return "STREAM"; default: break; } return ""; } uint32_t getActivityObjectKindId(CUpti_ActivityObjectKind kind, CUpti_ActivityObjectKindId *id) { switch (kind) { case CUPTI_ACTIVITY_OBJECT_PROCESS: return id->pt.processId; case CUPTI_ACTIVITY_OBJECT_THREAD: return id->pt.threadId; case CUPTI_ACTIVITY_OBJECT_DEVICE: return id->dcs.deviceId; case CUPTI_ACTIVITY_OBJECT_CONTEXT: return id->dcs.contextId; case CUPTI_ACTIVITY_OBJECT_STREAM: return id->dcs.streamId; default: break; } return 0xffffffff; } static const char * getComputeApiKindString(CUpti_ActivityComputeApiKind kind) { switch (kind) { case CUPTI_ACTIVITY_COMPUTE_API_CUDA: return "CUDA"; case CUPTI_ACTIVITY_COMPUTE_API_CUDA_MPS: return "CUDA_MPS"; default: break; } return ""; } static void printActivity(CUpti_Activity *record) { switch (record->kind) { case CUPTI_ACTIVITY_KIND_DEVICE: { CUpti_ActivityDevice4 *device = (CUpti_ActivityDevice4 *) record; printf("DEVICE %s (%u), capability %u.%u, global memory (bandwidth %u GB/s, size %u MB), " "multiprocessors %u, clock %u MHz\n", device->name, device->id, device->computeCapabilityMajor, device->computeCapabilityMinor, (unsigned int) (device->globalMemoryBandwidth / 1024 / 1024), (unsigned int) (device->globalMemorySize / 1024 / 1024), device->numMultiprocessors, (unsigned int) (device->coreClockRate / 1000)); break; } case CUPTI_ACTIVITY_KIND_DEVICE_ATTRIBUTE: { CUpti_ActivityDeviceAttribute *attribute = (CUpti_ActivityDeviceAttribute *)record; printf("DEVICE_ATTRIBUTE %u, device %u, value=0x%llx\n", attribute->attribute.cupti, attribute->deviceId, (unsigned long long)attribute->value.vUint64); break; } case CUPTI_ACTIVITY_KIND_CONTEXT: { CUpti_ActivityContext *context = (CUpti_ActivityContext *) record; printf("CONTEXT %u, device %u, compute API %s, NULL stream %d\n", context->contextId, context->deviceId, getComputeApiKindString((CUpti_ActivityComputeApiKind) context->computeApiKind), (int) context->nullStreamId); break; } case CUPTI_ACTIVITY_KIND_MEMCPY: { CUpti_ActivityMemcpy5 *memcpy = (CUpti_ActivityMemcpy5 *) record; printf("MEMCPY %s [ %llu - %llu ] device %u, context %u, stream %u, size %llu, correlation %u\n", getMemcpyKindString((CUpti_ActivityMemcpyKind)memcpy->copyKind), (unsigned long long) (memcpy->start - startTimestamp), (unsigned long long) (memcpy->end - startTimestamp), memcpy->deviceId, memcpy->contextId, memcpy->streamId, (unsigned long long)memcpy->bytes, memcpy->correlationId); break; } case CUPTI_ACTIVITY_KIND_MEMSET: { CUpti_ActivityMemset4 *memset = (CUpti_ActivityMemset4 *) record; printf("MEMSET value=%u [ %llu - %llu ] device %u, context %u, stream %u, correlation %u\n", memset->value, (unsigned long long) (memset->start - startTimestamp), (unsigned long long) (memset->end - startTimestamp), memset->deviceId, memset->contextId, memset->streamId, memset->correlationId); break; } case CUPTI_ACTIVITY_KIND_KERNEL: case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL: { const char* kindString = (record->kind == CUPTI_ACTIVITY_KIND_KERNEL) ? "KERNEL" : "CONC KERNEL"; CUpti_ActivityKernel8 *kernel = (CUpti_ActivityKernel8 *) record; printf("%s \"%s\" [ %llu - %llu ] device %u, context %u, stream %u, correlation %u\n", kindString, kernel->name, (unsigned long long) (kernel->start - startTimestamp), (unsigned long long) (kernel->end - startTimestamp), kernel->deviceId, kernel->contextId, kernel->streamId, kernel->correlationId); printf(" grid [%u,%u,%u], block [%u,%u,%u], shared memory (static %u, dynamic %u)\n", kernel->gridX, kernel->gridY, kernel->gridZ, kernel->blockX, kernel->blockY, kernel->blockZ, kernel->staticSharedMemory, kernel->dynamicSharedMemory); break; } case CUPTI_ACTIVITY_KIND_DRIVER: { CUpti_ActivityAPI *api = (CUpti_ActivityAPI *) record; printf("DRIVER cbid=%u [ %llu - %llu ] process %u, thread %u, correlation %u\n", api->cbid, (unsigned long long) (api->start - startTimestamp), (unsigned long long) (api->end - startTimestamp), api->processId, api->threadId, api->correlationId); break; } case CUPTI_ACTIVITY_KIND_RUNTIME: { CUpti_ActivityAPI *api = (CUpti_ActivityAPI *) record; printf("RUNTIME cbid=%u [ %llu - %llu ] process %u, thread %u, correlation %u\n", api->cbid, (unsigned long long) (api->start - startTimestamp), (unsigned long long) (api->end - startTimestamp), api->processId, api->threadId, api->correlationId); break; } case CUPTI_ACTIVITY_KIND_NAME: { CUpti_ActivityName *name = (CUpti_ActivityName *) record; switch (name->objectKind) { case CUPTI_ACTIVITY_OBJECT_CONTEXT: printf("NAME %s %u %s id %u, name %s\n", getActivityObjectKindString(name->objectKind), getActivityObjectKindId(name->objectKind, &name->objectId), getActivityObjectKindString(CUPTI_ACTIVITY_OBJECT_DEVICE), getActivityObjectKindId(CUPTI_ACTIVITY_OBJECT_DEVICE, &name->objectId), name->name); break; case CUPTI_ACTIVITY_OBJECT_STREAM: printf("NAME %s %u %s %u %s id %u, name %s\n", getActivityObjectKindString(name->objectKind), getActivityObjectKindId(name->objectKind, &name->objectId), getActivityObjectKindString(CUPTI_ACTIVITY_OBJECT_CONTEXT), getActivityObjectKindId(CUPTI_ACTIVITY_OBJECT_CONTEXT, &name->objectId), getActivityObjectKindString(CUPTI_ACTIVITY_OBJECT_DEVICE), getActivityObjectKindId(CUPTI_ACTIVITY_OBJECT_DEVICE, &name->objectId), name->name); break; default: printf("NAME %s id %u, name %s\n", getActivityObjectKindString(name->objectKind), getActivityObjectKindId(name->objectKind, &name->objectId), name->name); break; } break; } case CUPTI_ACTIVITY_KIND_MARKER: { CUpti_ActivityMarker2 *marker = (CUpti_ActivityMarker2 *) record; printf("MARKER id %u [ %llu ], name %s, domain %s\n", marker->id, (unsigned long long) marker->timestamp, marker->name, marker->domain); break; } case CUPTI_ACTIVITY_KIND_MARKER_DATA: { CUpti_ActivityMarkerData *marker = (CUpti_ActivityMarkerData *) record; printf("MARKER_DATA id %u, color 0x%x, category %u, payload %llu/%f\n", marker->id, marker->color, marker->category, (unsigned long long) marker->payload.metricValueUint64, marker->payload.metricValueDouble); break; } case CUPTI_ACTIVITY_KIND_OVERHEAD: { CUpti_ActivityOverhead *overhead = (CUpti_ActivityOverhead *) record; printf("OVERHEAD %s [ %llu, %llu ] %s id %u\n", getActivityOverheadKindString(overhead->overheadKind), (unsigned long long) overhead->start - startTimestamp, (unsigned long long) overhead->end - startTimestamp, getActivityObjectKindString(overhead->objectKind), getActivityObjectKindId(overhead->objectKind, &overhead->objectId)); break; } default: printf(" \n"); break; } } void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size, size_t *maxNumRecords) { uint8_t *bfr = (uint8_t *) malloc(BUF_SIZE + ALIGN_SIZE); if (bfr == NULL) { printf("Error: out of memory\n"); exit(EXIT_FAILURE); } *size = BUF_SIZE; *buffer = ALIGN_BUFFER(bfr, ALIGN_SIZE); *maxNumRecords = 0; } void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, size_t size, size_t validSize) { CUptiResult status; CUpti_Activity *record = NULL; if (validSize > 0) { do { status = cuptiActivityGetNextRecord(buffer, validSize, &record); if (status == CUPTI_SUCCESS) { printActivity(record); } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) break; else { CUPTI_CALL(status); } } while (1); // report any records dropped from the queue size_t dropped; CUPTI_CALL(cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped)); if (dropped != 0) { printf("Dropped %u activity records\n", (unsigned int) dropped); } } free(buffer); } void initTrace() { size_t attrValue = 0, attrValueSize = sizeof(size_t); // Device activity record is created when CUDA initializes, so we // want to enable it before cuInit() or any CUDA runtime call. CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE)); // Enable all other activity record kinds. CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONTEXT)); CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER)); CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME)); CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY)); CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET)); CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_NAME)); CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MARKER)); CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL)); CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OVERHEAD)); // Register callbacks for buffer requests and for buffers completed by CUPTI. CUPTI_CALL(cuptiActivityRegisterCallbacks(bufferRequested, bufferCompleted)); // Optionally get and set activity attributes. // Attributes can be set by the CUPTI client to change behavior of the activity API. // Some attributes require to be set before any CUDA context is created to be effective, // e.g. to be applied to all device buffer allocations (see documentation). CUPTI_CALL(cuptiActivityGetAttribute(CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE, &attrValueSize, &attrValue)); printf("%s = %llu B\n", "CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE", (long long unsigned)attrValue); attrValue *= 2; CUPTI_CALL(cuptiActivitySetAttribute(CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE, &attrValueSize, &attrValue)); CUPTI_CALL(cuptiActivityGetAttribute(CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT, &attrValueSize, &attrValue)); printf("%s = %llu\n", "CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT", (long long unsigned)attrValue); attrValue *= 2; CUPTI_CALL(cuptiActivitySetAttribute(CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT, &attrValueSize, &attrValue)); CUPTI_CALL(cuptiGetTimestamp(&startTimestamp)); } void finiTrace() { // Force flush any remaining activity buffers before termination of the application CUPTI_CALL(cuptiActivityFlushAll(1)); }