/* * Copyright 2014-2017 NVIDIA Corporation. All rights reserved. * * Sample app to demonstrate use of CUPTI library to obtain profiler * event values on a multi-gpu setup without serializing the compute * work on the devices. */ #include #include #include #include #ifndef EXIT_WAIVED #define EXIT_WAIVED 2 #endif #define CUPTI_CALL(call) \ do { \ CUptiResult _status = call; \ if (_status != CUPTI_SUCCESS) { \ const char *errstr; \ cuptiGetResultString(_status, &errstr); \ fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \ __FILE__, __LINE__, #call, errstr); \ exit(EXIT_FAILURE); \ } \ } while (0) #define DRIVER_API_CALL(apiFuncCall) \ do { \ CUresult _status = apiFuncCall; \ if (_status != CUDA_SUCCESS) { \ fprintf(stderr, "%s:%d: error: function %s failed with error %d.\n", \ __FILE__, __LINE__, #apiFuncCall, _status); \ exit(EXIT_FAILURE); \ } \ } while (0) #define RUNTIME_API_CALL(apiFuncCall) \ do { \ cudaError_t _status = apiFuncCall; \ if (_status != cudaSuccess) { \ fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \ __FILE__, __LINE__, #apiFuncCall, cudaGetErrorString(_status));\ exit(EXIT_FAILURE); \ } \ } while (0) #define MAX_DEVICES 32 #define BLOCK_X 32 #define GRID_X 32 // default event #define EVENT_NAME "inst_executed" // dummy kernel __global__ void kernel() { uint64_t i = 0; volatile uint64_t limit = 1024 * 128; for (i = 0; i < limit; i++) { } } int main(int argc, char *argv[]) { int deviceCount; char deviceName[256]; CUdevice device[MAX_DEVICES]; CUcontext context[MAX_DEVICES]; CUpti_EventGroup eventGroup[MAX_DEVICES]; CUpti_EventID eventId[MAX_DEVICES]; size_t bytesRead, valueSize; uint32_t numInstances = 0, j = 0; uint64_t *eventValues = NULL, eventVal = 0; const char *eventName; int i = 0; uint32_t profile_all = 1; printf("Usage: %s [event_name]\n", argv[0]); DRIVER_API_CALL(cuInit(0)); DRIVER_API_CALL(cuDeviceGetCount(&deviceCount)); if (deviceCount == 0) { printf("There is no device supporting CUDA.\n"); exit(EXIT_WAIVED); } if (deviceCount < 2) { printf("This multi-gpu test is waived on single gpu setup.\n"); exit(EXIT_WAIVED); } if (deviceCount > MAX_DEVICES) { printf("Found more devices (%d) than handled in the test (%d)\n", deviceCount, MAX_DEVICES); exit(EXIT_WAIVED); } if (argc > 1) { eventName = argv[1]; } else { eventName = EVENT_NAME; } for (i = 0; i < deviceCount; i++) { DRIVER_API_CALL(cuDeviceGet(&device[i], i)); DRIVER_API_CALL(cuDeviceGetName(deviceName, 256, device[i])); printf("CUDA Device Name: %s\n", deviceName); } // create one context per device for (i = 0; i < deviceCount; i++) { RUNTIME_API_CALL(cudaSetDevice(i)); DRIVER_API_CALL(cuCtxCreate(&(context[i]), 0, device[i])); DRIVER_API_CALL(cuCtxPopCurrent(&(context[i]))); } // enable event profiling on each device for (i = 0; i < deviceCount; i++) { RUNTIME_API_CALL(cudaSetDevice(i)); DRIVER_API_CALL(cuCtxPushCurrent(context[i])); CUPTI_CALL(cuptiSetEventCollectionMode(context[i], CUPTI_EVENT_COLLECTION_MODE_KERNEL)); CUPTI_CALL(cuptiEventGroupCreate(context[i], &eventGroup[i], 0)); CUPTI_CALL(cuptiEventGetIdFromName(device[i], eventName, &eventId[i])); CUPTI_CALL(cuptiEventGroupAddEvent(eventGroup[i], eventId[i])); CUPTI_CALL(cuptiEventGroupSetAttribute(eventGroup[i], CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES, sizeof(profile_all), &profile_all)); CUPTI_CALL(cuptiEventGroupEnable(eventGroup[i])); DRIVER_API_CALL(cuCtxPopCurrent(&context[i])); } // launch kernel on each device for (i = 0; i < deviceCount; i++) { RUNTIME_API_CALL(cudaSetDevice(i)); DRIVER_API_CALL(cuCtxPushCurrent(context[i])); kernel<<>>(); // don't do any sync here, it's done once // work is queued on all devices DRIVER_API_CALL(cuCtxPopCurrent(&context[i])); } // sync each context now for (i = 0; i < deviceCount; i++) { RUNTIME_API_CALL(cudaSetDevice(i)); DRIVER_API_CALL(cuCtxPushCurrent(context[i])); DRIVER_API_CALL(cuCtxSynchronize()); DRIVER_API_CALL(cuCtxPopCurrent(&context[i])); } // read events for (i = 0; i < deviceCount; i++) { RUNTIME_API_CALL(cudaSetDevice(i)); DRIVER_API_CALL(cuCtxPushCurrent(context[i])); valueSize = sizeof(numInstances); CUPTI_CALL(cuptiEventGroupGetAttribute(eventGroup[i], CUPTI_EVENT_GROUP_ATTR_INSTANCE_COUNT, &valueSize, &numInstances)); bytesRead = sizeof(uint64_t) * numInstances; eventValues = (uint64_t *) malloc(bytesRead); if (eventValues == NULL) { printf("%s:%d: Failed to allocate memory.\n", __FILE__, __LINE__); exit(EXIT_FAILURE); } CUPTI_CALL(cuptiEventGroupReadEvent(eventGroup[i], CUPTI_EVENT_READ_FLAG_NONE, eventId[i], &bytesRead, eventValues)); if (bytesRead != (sizeof(uint64_t) * numInstances)) { printf("Failed to read value for \"%s\"\n", eventName); exit(EXIT_FAILURE); } for (j = 0; j < numInstances; j++) { eventVal += eventValues[j]; } printf("[%d] %s: %llu\n", i, eventName, (unsigned long long)eventVal); CUPTI_CALL(cuptiEventGroupDisable(eventGroup[i])); CUPTI_CALL(cuptiEventGroupDestroy(eventGroup[i])); DRIVER_API_CALL(cuCtxPopCurrent(&context[i])); } exit(EXIT_SUCCESS); }