Add files using upload-large-folder tool

6be3106 verified 8 months ago

4.21 kB

	/*
	* Copyright 2021 NVIDIA Corporation. All rights reserved
	*
	* Sample CUPTI app to show how to do external correlation.
	* The sample pushes external ids in a simple vector addition
	* application showing how you can externally correlate different
	* phases of the code. In this sample it is broken into
	* initialization, execution and cleanup showing how you can
	* correlate all the APIs invloved in these 3 phases in the app.
	*
	* Psuedo code:
	* cuptiActivityPushExternalCorrelationId()
	* ExternalAPI() -> (runs bunch of CUDA APIs/ launches activity on GPU)
	* cuptiActivityPopExternalCorrelationId()
	* All CUDA activity activities within this range will generate external correlation
	* record which then can be used to correlate it with the external API
	*/

	#include "cupti_external_correlation.h"

	__global__ void VecAdd(const int* A, const int* B, int* C, int N) {
	int i = blockDim.x * blockIdx.x + threadIdx.x;
	if (i < N)
	C[i] = A[i] + B[i];
	}

	static void initVec(int *vec, int n) {
	for (int i = 0; i < n; i++)
	vec[i] = i;
	}

	void vectorAdd() {
	CUcontext context = 0;
	CUdevice device = 0;
	int N = 50000;
	size_t size = N * sizeof (int);
	int threadsPerBlock = 0;
	int blocksPerGrid = 0;
	int h_A = 0, h_B = 0, *h_C = 0;
	int d_A = 0, d_B = 0, *d_C = 0;
	uint64_t id = 0;

	DRIVER_API_CALL(cuDeviceGet(&device, 0));

	// Allocate input vectors h_A and h_B in host memory
	h_A = (int*) malloc(size);
	h_B = (int*) malloc(size);
	h_C = (int*) malloc(size);

	if (!h_A \|\| !h_B \|\| !h_C) {
	printf("Error: Out of memory\n");
	return;
	}

	// Initialize input vectors
	initVec(h_A, N);
	initVec(h_B, N);
	memset(h_C, 0, size);

	// Push external id for the initialization: memory allocation and memcpy operations from host to device
	CUPTI_CALL(cuptiActivityPushExternalCorrelationId(CUPTI_EXTERNAL_CORRELATION_KIND_UNKNOWN, static_cast<uint64_t>(INITIALIZATION_EXTERNAL_ID)));

	DRIVER_API_CALL(cuCtxCreate(&context, 0, device));

	// Allocate vectors in device memory
	RUNTIME_API_CALL(cudaMalloc((void**) &d_A, size));
	RUNTIME_API_CALL(cudaMalloc((void**) &d_B, size));
	RUNTIME_API_CALL(cudaMalloc((void**) &d_C, size));

	// Copy vectors from host memory to device memory
	RUNTIME_API_CALL(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
	RUNTIME_API_CALL(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice));

	// Pop the external id
	CUPTI_CALL(cuptiActivityPopExternalCorrelationId(CUPTI_EXTERNAL_CORRELATION_KIND_UNKNOWN, &id));

	// Push external id for the vector addition and copy of results from device to host.
	CUPTI_CALL(cuptiActivityPushExternalCorrelationId(CUPTI_EXTERNAL_CORRELATION_KIND_UNKNOWN, static_cast<uint64_t>(EXECUTION_EXTERNAL_ID)));

	// Invoke kernel
	threadsPerBlock = 256;
	blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

	VecAdd << <blocksPerGrid, threadsPerBlock >> >(d_A, d_B, d_C, N);
	DRIVER_API_CALL(cuCtxSynchronize());

	// Copy result from device memory to host memory
	// h_C contains the result in host memory
	RUNTIME_API_CALL(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost));

	// Pop the external id
	CUPTI_CALL(cuptiActivityPopExternalCorrelationId(CUPTI_EXTERNAL_CORRELATION_KIND_UNKNOWN, &id));

	// Push external id for the cleanup phase in the code
	CUPTI_CALL(cuptiActivityPushExternalCorrelationId(CUPTI_EXTERNAL_CORRELATION_KIND_UNKNOWN, static_cast<uint64_t>(CLEANUP_EXTERNAL_ID)));

	// Free device memory
	RUNTIME_API_CALL(cudaFree(d_A));
	RUNTIME_API_CALL(cudaFree(d_B));
	RUNTIME_API_CALL(cudaFree(d_C));

	// Pop the external id
	CUPTI_CALL(cuptiActivityPopExternalCorrelationId(CUPTI_EXTERNAL_CORRELATION_KIND_UNKNOWN, &id));

	// Free host memory
	if (h_A) {
	free(h_A);
	}
	if (h_B) {
	free(h_B);
	}
	if (h_C) {
	free(h_C);
	}

	DRIVER_API_CALL(cuCtxSynchronize());
	DRIVER_API_CALL(cuCtxDestroy(context));
	}

	int main(int argc, char *argv[]) {
	initTrace();

	DRIVER_API_CALL(cuInit(0));
	vectorAdd();

	finiTrace();

	exit(EXIT_SUCCESS);
	}