|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
do { \ |
|
|
CUptiResult _status = call; \ |
|
|
if (_status != CUPTI_SUCCESS) { \ |
|
|
const char *errstr; \ |
|
|
cuptiGetResultString(_status, &errstr); \ |
|
|
fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \ |
|
|
__FILE__, __LINE__, |
|
|
exit(EXIT_FAILURE); \ |
|
|
} \ |
|
|
} while (0) |
|
|
|
|
|
|
|
|
do { \ |
|
|
CUresult _status = apiFuncCall; \ |
|
|
if (_status != CUDA_SUCCESS) { \ |
|
|
fprintf(stderr, "%s:%d: error: function %s failed with error %d.\n", \ |
|
|
__FILE__, __LINE__, |
|
|
exit(EXIT_FAILURE); \ |
|
|
} \ |
|
|
} while (0) |
|
|
|
|
|
|
|
|
do { \ |
|
|
cudaError_t _status = apiFuncCall; \ |
|
|
if (_status != cudaSuccess) { \ |
|
|
fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \ |
|
|
__FILE__, __LINE__, |
|
|
exit(EXIT_FAILURE); \ |
|
|
} \ |
|
|
} while (0) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// default event |
|
|
|
|
|
|
|
|
// dummy kernel |
|
|
__global__ void kernel() { |
|
|
uint64_t i = 0; |
|
|
volatile uint64_t limit = 1024 * 128; |
|
|
for (i = 0; i < limit; i++) { |
|
|
} |
|
|
} |
|
|
|
|
|
int |
|
|
main(int argc, char *argv[]) |
|
|
{ |
|
|
int deviceCount; |
|
|
char deviceName[256]; |
|
|
CUdevice device[MAX_DEVICES]; |
|
|
CUcontext context[MAX_DEVICES]; |
|
|
CUpti_EventGroup eventGroup[MAX_DEVICES]; |
|
|
CUpti_EventID eventId[MAX_DEVICES]; |
|
|
size_t bytesRead, valueSize; |
|
|
uint32_t numInstances = 0, j = 0; |
|
|
uint64_t *eventValues = NULL, eventVal = 0; |
|
|
const char *eventName; |
|
|
int i = 0; |
|
|
uint32_t profile_all = 1; |
|
|
|
|
|
printf("Usage: %s [event_name]\n", argv[0]); |
|
|
|
|
|
DRIVER_API_CALL(cuInit(0)); |
|
|
|
|
|
DRIVER_API_CALL(cuDeviceGetCount(&deviceCount)); |
|
|
if (deviceCount == 0) { |
|
|
printf("There is no device supporting CUDA.\n"); |
|
|
exit(EXIT_WAIVED); |
|
|
} |
|
|
|
|
|
if (deviceCount < 2) { |
|
|
printf("This multi-gpu test is waived on single gpu setup.\n"); |
|
|
exit(EXIT_WAIVED); |
|
|
} |
|
|
|
|
|
if (deviceCount > MAX_DEVICES) { |
|
|
printf("Found more devices (%d) than handled in the test (%d)\n", |
|
|
deviceCount, MAX_DEVICES); |
|
|
exit(EXIT_WAIVED); |
|
|
} |
|
|
|
|
|
if (argc > 1) { |
|
|
eventName = argv[1]; |
|
|
} |
|
|
else { |
|
|
eventName = EVENT_NAME; |
|
|
} |
|
|
|
|
|
for (i = 0; i < deviceCount; i++) { |
|
|
DRIVER_API_CALL(cuDeviceGet(&device[i], i)); |
|
|
|
|
|
DRIVER_API_CALL(cuDeviceGetName(deviceName, 256, device[i])); |
|
|
|
|
|
printf("CUDA Device Name: %s\n", deviceName); |
|
|
} |
|
|
|
|
|
// create one context per device |
|
|
for (i = 0; i < deviceCount; i++) { |
|
|
RUNTIME_API_CALL(cudaSetDevice(i)); |
|
|
|
|
|
DRIVER_API_CALL(cuCtxCreate(&(context[i]), 0, device[i])); |
|
|
|
|
|
DRIVER_API_CALL(cuCtxPopCurrent(&(context[i]))); |
|
|
} |
|
|
|
|
|
// enable event profiling on each device |
|
|
for (i = 0; i < deviceCount; i++) { |
|
|
RUNTIME_API_CALL(cudaSetDevice(i)); |
|
|
|
|
|
DRIVER_API_CALL(cuCtxPushCurrent(context[i])); |
|
|
|
|
|
CUPTI_CALL(cuptiSetEventCollectionMode(context[i], |
|
|
CUPTI_EVENT_COLLECTION_MODE_KERNEL)); |
|
|
CUPTI_CALL(cuptiEventGroupCreate(context[i], &eventGroup[i], 0)); |
|
|
CUPTI_CALL(cuptiEventGetIdFromName(device[i], eventName, &eventId[i])); |
|
|
CUPTI_CALL(cuptiEventGroupAddEvent(eventGroup[i], eventId[i])); |
|
|
CUPTI_CALL(cuptiEventGroupSetAttribute(eventGroup[i], |
|
|
CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES, |
|
|
sizeof(profile_all), &profile_all)); |
|
|
CUPTI_CALL(cuptiEventGroupEnable(eventGroup[i])); |
|
|
|
|
|
DRIVER_API_CALL(cuCtxPopCurrent(&context[i])); |
|
|
} |
|
|
|
|
|
// launch kernel on each device |
|
|
for (i = 0; i < deviceCount; i++) { |
|
|
RUNTIME_API_CALL(cudaSetDevice(i)); |
|
|
|
|
|
DRIVER_API_CALL(cuCtxPushCurrent(context[i])); |
|
|
|
|
|
kernel<<<GRID_X, BLOCK_X>>>(); |
|
|
|
|
|
// don't do any sync here, it's done once |
|
|
// work is queued on all devices |
|
|
|
|
|
DRIVER_API_CALL(cuCtxPopCurrent(&context[i])); |
|
|
} |
|
|
|
|
|
// sync each context now |
|
|
for (i = 0; i < deviceCount; i++) { |
|
|
RUNTIME_API_CALL(cudaSetDevice(i)); |
|
|
|
|
|
DRIVER_API_CALL(cuCtxPushCurrent(context[i])); |
|
|
|
|
|
DRIVER_API_CALL(cuCtxSynchronize()); |
|
|
|
|
|
DRIVER_API_CALL(cuCtxPopCurrent(&context[i])); |
|
|
} |
|
|
|
|
|
// read events |
|
|
for (i = 0; i < deviceCount; i++) { |
|
|
RUNTIME_API_CALL(cudaSetDevice(i)); |
|
|
|
|
|
DRIVER_API_CALL(cuCtxPushCurrent(context[i])); |
|
|
|
|
|
valueSize = sizeof(numInstances); |
|
|
CUPTI_CALL(cuptiEventGroupGetAttribute(eventGroup[i], |
|
|
CUPTI_EVENT_GROUP_ATTR_INSTANCE_COUNT, |
|
|
&valueSize, &numInstances)); |
|
|
|
|
|
bytesRead = sizeof(uint64_t) * numInstances; |
|
|
eventValues = (uint64_t *) malloc(bytesRead); |
|
|
if (eventValues == NULL) { |
|
|
printf("%s:%d: Failed to allocate memory.\n", __FILE__, __LINE__); |
|
|
exit(EXIT_FAILURE); |
|
|
} |
|
|
|
|
|
CUPTI_CALL(cuptiEventGroupReadEvent(eventGroup[i], |
|
|
CUPTI_EVENT_READ_FLAG_NONE, |
|
|
eventId[i], &bytesRead, eventValues)); |
|
|
|
|
|
if (bytesRead != (sizeof(uint64_t) * numInstances)) { |
|
|
printf("Failed to read value for \"%s\"\n", eventName); |
|
|
exit(EXIT_FAILURE); |
|
|
} |
|
|
|
|
|
for (j = 0; j < numInstances; j++) { |
|
|
eventVal += eventValues[j]; |
|
|
} |
|
|
|
|
|
printf("[%d] %s: %llu\n", i, eventName, (unsigned long long)eventVal); |
|
|
|
|
|
CUPTI_CALL(cuptiEventGroupDisable(eventGroup[i])); |
|
|
CUPTI_CALL(cuptiEventGroupDestroy(eventGroup[i])); |
|
|
|
|
|
DRIVER_API_CALL(cuCtxPopCurrent(&context[i])); |
|
|
} |
|
|
|
|
|
exit(EXIT_SUCCESS); |
|
|
} |
|
|
|