Add files using upload-large-folder tool

6be3106 verified 6 months ago

6.43 kB

	/*
	* Copyright 2020 NVIDIA Corporation. All rights reserved
	*
	* Sample CUPTI app to print the trace of CUDA graphs and correlate
	* the graph node launch to the node creation API using CUPTI callbacks.
	*/
	#include <cuda.h>
	#include <cuda_runtime.h>
	#include <cupti.h>
	#include <stdio.h>
	#include <string.h>
	#include <stdlib.h>

	#define DRIVER_API_CALL(apiFuncCall) \
	do { \
	CUresult _status = apiFuncCall; \
	if (_status != CUDA_SUCCESS) { \
	const char* errstr; \
	cuGetErrorString(_status, &errstr); \
	fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \
	__FILE__, __LINE__, #apiFuncCall, errstr); \
	exit(EXIT_FAILURE); \
	} \
	} while (0)

	#define RUNTIME_API_CALL(apiFuncCall) \
	do { \
	cudaError_t _status = apiFuncCall; \
	if (_status != cudaSuccess) { \
	fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \
	__FILE__, __LINE__, #apiFuncCall, cudaGetErrorString(_status));\
	exit(EXIT_FAILURE); \
	} \
	} while (0)

	#define CUPTI_CALL(call) \
	do { \
	CUptiResult _status = call; \
	if (_status != CUPTI_SUCCESS) { \
	const char *errstr; \
	cuptiGetResultString(_status, &errstr); \
	fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \
	__FILE__, __LINE__, #call, errstr); \
	exit(EXIT_FAILURE); \
	} \
	} while (0)

	#define COMPUTE_N 50000

	extern void initTrace(void);
	extern void finiTrace(void);

	// Kernels
	__global__ void
	VecAdd(const int* A, const int* B, int* C, int N)
	{
	int i = blockDim.x * blockIdx.x + threadIdx.x;
	if (i < N)
	C[i] = A[i] + B[i];
	}

	__global__ void
	VecSub(const int* A, const int* B, int* C, int N)
	{
	int i = blockDim.x * blockIdx.x + threadIdx.x;
	if (i < N)
	C[i] = A[i] - B[i];
	}

	static void
	do_pass(cudaStream_t stream)
	{
	int h_A, h_B, *h_C;
	int d_A, d_B, *d_C;
	size_t size = COMPUTE_N * sizeof(int);
	int threadsPerBlock = 256;
	int blocksPerGrid = 0;
	cudaKernelNodeParams kernelParams;
	cudaMemcpy3DParms memcpyParams = {0};
	cudaGraph_t graph;
	cudaGraphExec_t graphExec;
	cudaGraphNode_t nodes[5];

	// Allocate input vectors h_A and h_B in host memory
	// don't bother to initialize
	h_A = (int*)malloc(size);
	h_B = (int*)malloc(size);
	h_C = (int*)malloc(size);
	if (!h_A \|\| !h_B \|\| !h_C) {
	printf("Error: out of memory\n");
	exit(EXIT_FAILURE);
	}

	// Allocate vectors in device memory
	RUNTIME_API_CALL(cudaMalloc((void**)&d_A, size));
	RUNTIME_API_CALL(cudaMalloc((void**)&d_B, size));
	RUNTIME_API_CALL(cudaMalloc((void**)&d_C, size));

	RUNTIME_API_CALL(cudaGraphCreate(&graph, 0));

	// Init memcpy params
	memcpyParams.kind = cudaMemcpyHostToDevice;
	memcpyParams.srcPtr.ptr = h_A;
	memcpyParams.dstPtr.ptr = d_A;
	memcpyParams.extent.width = size;
	memcpyParams.extent.height = 1;
	memcpyParams.extent.depth = 1;
	RUNTIME_API_CALL(cudaGraphAddMemcpyNode(&nodes[0], graph, NULL, 0, &memcpyParams));

	memcpyParams.srcPtr.ptr = h_B;
	memcpyParams.dstPtr.ptr = d_B;
	RUNTIME_API_CALL(cudaGraphAddMemcpyNode(&nodes[1], graph, NULL, 0, &memcpyParams));

	// Init kernel params
	int num = COMPUTE_N;
	void* kernelArgs[] = {(void )&d_A, (void )&d_B, (void )&d_C, (void )&num};
	blocksPerGrid = (COMPUTE_N + threadsPerBlock - 1) / threadsPerBlock;
	kernelParams.func = (void *)VecAdd;
	kernelParams.gridDim = dim3(blocksPerGrid, 1, 1);
	kernelParams.blockDim = dim3(threadsPerBlock, 1, 1);
	kernelParams.sharedMemBytes = 0;
	kernelParams.kernelParams = (void **)kernelArgs;
	kernelParams.extra = NULL;

	RUNTIME_API_CALL(cudaGraphAddKernelNode(&nodes[2], graph, &nodes[0], 2, &kernelParams));

	kernelParams.func = (void *)VecSub;
	RUNTIME_API_CALL(cudaGraphAddKernelNode(&nodes[3], graph, &nodes[2], 1, &kernelParams));

	memcpyParams.kind = cudaMemcpyDeviceToHost;
	memcpyParams.srcPtr.ptr = d_C;
	memcpyParams.dstPtr.ptr = h_C;
	memcpyParams.extent.width = size;
	memcpyParams.extent.height = 1;
	memcpyParams.extent.depth = 1;
	RUNTIME_API_CALL(cudaGraphAddMemcpyNode(&nodes[4], graph, &nodes[3], 1, &memcpyParams));

	RUNTIME_API_CALL(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0));

	RUNTIME_API_CALL(cudaGraphLaunch(graphExec, stream));
	RUNTIME_API_CALL(cudaStreamSynchronize(stream));

	free(h_A);
	free(h_B);
	free(h_C);
	RUNTIME_API_CALL(cudaFree(d_A));
	RUNTIME_API_CALL(cudaFree(d_B));
	RUNTIME_API_CALL(cudaFree(d_C));
	}

	int
	main(int argc, char *argv[])
	{
	CUdevice device;
	char deviceName[256];

	// initialize the activity trace
	initTrace();

	DRIVER_API_CALL(cuInit(0));

	DRIVER_API_CALL(cuDeviceGet(&device, 0));
	DRIVER_API_CALL(cuDeviceGetName(deviceName, 256, device));
	printf("Device Name: %s\n", deviceName);

	RUNTIME_API_CALL(cudaSetDevice(0));

	// do pass with user stream
	cudaStream_t stream0;
	RUNTIME_API_CALL(cudaStreamCreate(&stream0));
	do_pass(stream0);

	RUNTIME_API_CALL(cudaDeviceSynchronize());

	// Flush CUPTI buffers before resetting the device.
	// This can also be called in the cudaDeviceReset callback.
	CUPTI_CALL(cuptiActivityFlushAll(0));
	RUNTIME_API_CALL(cudaDeviceReset());

	finiTrace();
	exit(EXIT_SUCCESS);
	}