soft_gripper_envs / samples /event_multi_gpu /event_multi_gpu.cu

Add files using upload-large-folder tool

563c80f verified 5 months ago

7.3 kB

	/*
	* Copyright 2014-2017 NVIDIA Corporation. All rights reserved.
	*
	* Sample app to demonstrate use of CUPTI library to obtain profiler
	* event values on a multi-gpu setup without serializing the compute
	* work on the devices.
	*/

	#include <stdio.h>
	#include <cuda_runtime_api.h>
	#include <cupti_events.h>
	#include <stdlib.h>

	#ifndef EXIT_WAIVED
	#define EXIT_WAIVED 2
	#endif

	#define CUPTI_CALL(call) \
	do { \
	CUptiResult _status = call; \
	if (_status != CUPTI_SUCCESS) { \
	const char *errstr; \
	cuptiGetResultString(_status, &errstr); \
	fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \
	__FILE__, __LINE__, #call, errstr); \
	exit(EXIT_FAILURE); \
	} \
	} while (0)

	#define DRIVER_API_CALL(apiFuncCall) \
	do { \
	CUresult _status = apiFuncCall; \
	if (_status != CUDA_SUCCESS) { \
	fprintf(stderr, "%s:%d: error: function %s failed with error %d.\n", \
	__FILE__, __LINE__, #apiFuncCall, _status); \
	exit(EXIT_FAILURE); \
	} \
	} while (0)

	#define RUNTIME_API_CALL(apiFuncCall) \
	do { \
	cudaError_t _status = apiFuncCall; \
	if (_status != cudaSuccess) { \
	fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \
	__FILE__, __LINE__, #apiFuncCall, cudaGetErrorString(_status));\
	exit(EXIT_FAILURE); \
	} \
	} while (0)

	#define MAX_DEVICES 32
	#define BLOCK_X 32
	#define GRID_X 32

	// default event
	#define EVENT_NAME "inst_executed"

	// dummy kernel
	__global__ void kernel() {
	uint64_t i = 0;
	volatile uint64_t limit = 1024 * 128;
	for (i = 0; i < limit; i++) {
	}
	}

	int
	main(int argc, char *argv[])
	{
	int deviceCount;
	char deviceName[256];
	CUdevice device[MAX_DEVICES];
	CUcontext context[MAX_DEVICES];
	CUpti_EventGroup eventGroup[MAX_DEVICES];
	CUpti_EventID eventId[MAX_DEVICES];
	size_t bytesRead, valueSize;
	uint32_t numInstances = 0, j = 0;
	uint64_t *eventValues = NULL, eventVal = 0;
	const char *eventName;
	int i = 0;
	uint32_t profile_all = 1;

	printf("Usage: %s [event_name]\n", argv[0]);

	DRIVER_API_CALL(cuInit(0));

	DRIVER_API_CALL(cuDeviceGetCount(&deviceCount));
	if (deviceCount == 0) {
	printf("There is no device supporting CUDA.\n");
	exit(EXIT_WAIVED);
	}

	if (deviceCount < 2) {
	printf("This multi-gpu test is waived on single gpu setup.\n");
	exit(EXIT_WAIVED);
	}

	if (deviceCount > MAX_DEVICES) {
	printf("Found more devices (%d) than handled in the test (%d)\n",
	deviceCount, MAX_DEVICES);
	exit(EXIT_WAIVED);
	}

	if (argc > 1) {
	eventName = argv[1];
	}
	else {
	eventName = EVENT_NAME;
	}

	for (i = 0; i < deviceCount; i++) {
	DRIVER_API_CALL(cuDeviceGet(&device[i], i));

	DRIVER_API_CALL(cuDeviceGetName(deviceName, 256, device[i]));

	printf("CUDA Device Name: %s\n", deviceName);
	}

	// create one context per device
	for (i = 0; i < deviceCount; i++) {
	RUNTIME_API_CALL(cudaSetDevice(i));

	DRIVER_API_CALL(cuCtxCreate(&(context[i]), 0, device[i]));

	DRIVER_API_CALL(cuCtxPopCurrent(&(context[i])));
	}

	// enable event profiling on each device
	for (i = 0; i < deviceCount; i++) {
	RUNTIME_API_CALL(cudaSetDevice(i));

	DRIVER_API_CALL(cuCtxPushCurrent(context[i]));

	CUPTI_CALL(cuptiSetEventCollectionMode(context[i],
	CUPTI_EVENT_COLLECTION_MODE_KERNEL));
	CUPTI_CALL(cuptiEventGroupCreate(context[i], &eventGroup[i], 0));
	CUPTI_CALL(cuptiEventGetIdFromName(device[i], eventName, &eventId[i]));
	CUPTI_CALL(cuptiEventGroupAddEvent(eventGroup[i], eventId[i]));
	CUPTI_CALL(cuptiEventGroupSetAttribute(eventGroup[i],
	CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES,
	sizeof(profile_all), &profile_all));
	CUPTI_CALL(cuptiEventGroupEnable(eventGroup[i]));

	DRIVER_API_CALL(cuCtxPopCurrent(&context[i]));
	}

	// launch kernel on each device
	for (i = 0; i < deviceCount; i++) {
	RUNTIME_API_CALL(cudaSetDevice(i));

	DRIVER_API_CALL(cuCtxPushCurrent(context[i]));

	kernel<<<GRID_X, BLOCK_X>>>();

	// don't do any sync here, it's done once
	// work is queued on all devices

	DRIVER_API_CALL(cuCtxPopCurrent(&context[i]));
	}

	// sync each context now
	for (i = 0; i < deviceCount; i++) {
	RUNTIME_API_CALL(cudaSetDevice(i));

	DRIVER_API_CALL(cuCtxPushCurrent(context[i]));

	DRIVER_API_CALL(cuCtxSynchronize());

	DRIVER_API_CALL(cuCtxPopCurrent(&context[i]));
	}

	// read events
	for (i = 0; i < deviceCount; i++) {
	RUNTIME_API_CALL(cudaSetDevice(i));

	DRIVER_API_CALL(cuCtxPushCurrent(context[i]));

	valueSize = sizeof(numInstances);
	CUPTI_CALL(cuptiEventGroupGetAttribute(eventGroup[i],
	CUPTI_EVENT_GROUP_ATTR_INSTANCE_COUNT,
	&valueSize, &numInstances));

	bytesRead = sizeof(uint64_t) * numInstances;
	eventValues = (uint64_t *) malloc(bytesRead);
	if (eventValues == NULL) {
	printf("%s:%d: Failed to allocate memory.\n", __FILE__, __LINE__);
	exit(EXIT_FAILURE);
	}

	CUPTI_CALL(cuptiEventGroupReadEvent(eventGroup[i],
	CUPTI_EVENT_READ_FLAG_NONE,
	eventId[i], &bytesRead, eventValues));

	if (bytesRead != (sizeof(uint64_t) * numInstances)) {
	printf("Failed to read value for \"%s\"\n", eventName);
	exit(EXIT_FAILURE);
	}

	for (j = 0; j < numInstances; j++) {
	eventVal += eventValues[j];
	}

	printf("[%d] %s: %llu\n", i, eventName, (unsigned long long)eventVal);

	CUPTI_CALL(cuptiEventGroupDisable(eventGroup[i]));
	CUPTI_CALL(cuptiEventGroupDestroy(eventGroup[i]));

	DRIVER_API_CALL(cuCtxPopCurrent(&context[i]));
	}

	exit(EXIT_SUCCESS);
	}