soft_gripper_envs / samples /callback_timestamp /callback_timestamp.cu

Add files using upload-large-folder tool

6be3106 verified 5 months ago

10.1 kB

	/*
	* Copyright 2010-2021 NVIDIA Corporation. All rights reserved
	*
	* Sample app to demonstrate use of CUPTI library to obtain timestamps
	* using callbacks for CUDA runtime APIs
	*
	*/

	#include <stdio.h>
	#include <cuda.h>
	#include <cupti.h>
	#include <stdlib.h>

	#ifndef EXIT_WAIVED
	#define EXIT_WAIVED 2
	#endif

	#define DRIVER_API_CALL(apiFuncCall) \
	do { \
	CUresult _status = apiFuncCall; \
	if (_status != CUDA_SUCCESS) { \
	const char* errstr; \
	cuGetErrorString(_status, &errstr); \
	fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \
	__FILE__, __LINE__, #apiFuncCall, errstr); \
	exit(EXIT_FAILURE); \
	} \
	} while (0)

	#define RUNTIME_API_CALL(apiFuncCall) \
	do { \
	cudaError_t _status = apiFuncCall; \
	if (_status != cudaSuccess) { \
	fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \
	__FILE__, __LINE__, #apiFuncCall, cudaGetErrorString(_status));\
	exit(EXIT_FAILURE); \
	} \
	} while (0)

	#define CUPTI_CALL(call) \
	do { \
	CUptiResult _status = call; \
	if (_status != CUPTI_SUCCESS) { \
	const char* errstr; \
	cuptiGetResultString(_status, &errstr); \
	fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \
	__FILE__, __LINE__, #call, errstr); \
	exit(EXIT_FAILURE); \
	} \
	} while (0)

	// Structure to hold data collected by callback
	typedef struct RuntimeApiTrace_st {
	const char *functionName;
	uint64_t startTimestamp;
	uint64_t endTimestamp;
	size_t memcpy_bytes;
	enum cudaMemcpyKind memcpy_kind;
	} RuntimeApiTrace_t;

	enum launchOrder{ MEMCPY_H2D1, MEMCPY_H2D2, MEMCPY_D2H, KERNEL, THREAD_SYNC, LAUNCH_LAST};

	// Vector addition kernel
	__global__ void
	VecAdd(const int* A, const int* B, int* C, int N)
	{
	int i = blockDim.x * blockIdx.x + threadIdx.x;
	if (i < N)
	C[i] = A[i] + B[i];
	}

	// Initialize a vector
	static void
	initVec(int *vec, int n)
	{
	for (int i = 0; i < n; i++)
	vec[i] = i;
	}

	void CUPTIAPI
	getTimestampCallback(void *userdata, CUpti_CallbackDomain domain,
	CUpti_CallbackId cbid, const CUpti_CallbackData *cbInfo)
	{
	static int memTransCount = 0;
	uint64_t startTimestamp;
	uint64_t endTimestamp;
	RuntimeApiTrace_t traceData = (RuntimeApiTrace_t)userdata;

	// Data is collected only for the following API
	if ((cbid == CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020) \|\|
	(cbid == CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000) \|\|
	(cbid == CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSynchronize_v3020) \|\|
	(cbid == CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020)) {

	// Set pointer depending on API
	if ((cbid == CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020) \|\|
	(cbid == CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000))
	{
	traceData = traceData + KERNEL;
	}
	else if (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSynchronize_v3020)
	traceData = traceData + THREAD_SYNC;
	else if (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020)
	traceData = traceData + MEMCPY_H2D1 + memTransCount;

	if (cbInfo->callbackSite == CUPTI_API_ENTER) {
	// for a kernel launch report the kernel name, otherwise use the API
	// function name.
	if (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020 \|\|
	cbid == CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000)
	{
	traceData->functionName = cbInfo->symbolName;
	}
	else {
	traceData->functionName = cbInfo->functionName;
	}

	// Store parameters passed to cudaMemcpy
	if (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020) {
	traceData->memcpy_bytes = ((cudaMemcpy_v3020_params *)(cbInfo->functionParams))->count;
	traceData->memcpy_kind = ((cudaMemcpy_v3020_params *)(cbInfo->functionParams))->kind;
	}

	// Collect timestamp for API start
	CUPTI_CALL(cuptiGetTimestamp(&startTimestamp));

	traceData->startTimestamp = startTimestamp;
	}

	if (cbInfo->callbackSite == CUPTI_API_EXIT) {
	// Collect timestamp for API exit
	CUPTI_CALL(cuptiGetTimestamp(&endTimestamp));

	traceData->endTimestamp = endTimestamp;

	// Advance to the next memory transfer operation
	if (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020) {
	memTransCount++;
	}
	}
	}
	}

	static const char *
	memcpyKindStr(enum cudaMemcpyKind kind)
	{
	switch (kind) {
	case cudaMemcpyHostToDevice:
	return "HostToDevice";
	case cudaMemcpyDeviceToHost:
	return "DeviceToHost";
	default:
	break;
	}

	return "<unknown>";
	}

	static void
	displayTimestamps(RuntimeApiTrace_t *trace)
	{
	// Calculate timestamp of kernel based on timestamp from
	// cudaDeviceSynchronize() call
	trace[KERNEL].endTimestamp = trace[THREAD_SYNC].endTimestamp;

	printf("startTimeStamp/Duration reported in nano-seconds\n\n");
	printf("Name\t\tStart Time\t\tDuration\tBytes\tKind\n");
	printf("%s\t%llu\t%llu\t\t%llu\t%s\n", trace[MEMCPY_H2D1].functionName,
	(unsigned long long)trace[MEMCPY_H2D1].startTimestamp,
	(unsigned long long)trace[MEMCPY_H2D1].endTimestamp - trace[MEMCPY_H2D1].startTimestamp,
	(unsigned long long)trace[MEMCPY_H2D1].memcpy_bytes,
	memcpyKindStr(trace[MEMCPY_H2D1].memcpy_kind));
	printf("%s\t%llu\t%llu\t\t%llu\t%s\n", trace[MEMCPY_H2D2].functionName,
	(unsigned long long)trace[MEMCPY_H2D2].startTimestamp,
	(unsigned long long)trace[MEMCPY_H2D2].endTimestamp - trace[MEMCPY_H2D2].startTimestamp,
	(unsigned long long)trace[MEMCPY_H2D2].memcpy_bytes,
	memcpyKindStr(trace[MEMCPY_H2D2].memcpy_kind));
	printf("%s\t%llu\t%llu\t\tNA\tNA\n", trace[KERNEL].functionName,
	(unsigned long long)trace[KERNEL].startTimestamp,
	(unsigned long long)trace[KERNEL].endTimestamp - trace[KERNEL].startTimestamp);
	printf("%s\t%llu\t%llu\t\t%llu\t%s\n", trace[MEMCPY_D2H].functionName,
	(unsigned long long)trace[MEMCPY_D2H].startTimestamp,
	(unsigned long long)trace[MEMCPY_D2H].endTimestamp - trace[MEMCPY_D2H].startTimestamp,
	(unsigned long long)trace[MEMCPY_D2H].memcpy_bytes,
	memcpyKindStr(trace[MEMCPY_D2H].memcpy_kind));
	}

	static void
	cleanUp(int h_A, int h_B, int h_C, int d_A, int d_B, int d_C)
	{
	if (d_A)
	RUNTIME_API_CALL(cudaFree(d_A));
	if (d_B)
	RUNTIME_API_CALL(cudaFree(d_B));
	if (d_C)
	RUNTIME_API_CALL(cudaFree(d_C));

	// Free host memory
	if (h_A)
	free(h_A);
	if (h_B)
	free(h_B);
	if (h_C)
	free(h_C);
	}

	int
	main()
	{
	CUcontext context = 0;
	CUdevice device = 0;
	int N = 50000;
	size_t size = N * sizeof(int);
	int threadsPerBlock = 0;
	int blocksPerGrid = 0;
	int sum, i;
	int h_A, h_B, *h_C;
	int d_A, d_B, *d_C;

	CUpti_SubscriberHandle subscriber;
	RuntimeApiTrace_t trace[LAUNCH_LAST];

	// subscribe to CUPTI callbacks
	CUPTI_CALL(cuptiSubscribe(&subscriber, (CUpti_CallbackFunc)getTimestampCallback , &trace));

	DRIVER_API_CALL(cuInit(0));

	DRIVER_API_CALL(cuCtxCreate(&context, 0, device));

	// Enable all callbacks for CUDA Runtime APIs.
	// Callback will be invoked at the entry and exit points of each of the CUDA Runtime API
	CUPTI_CALL(cuptiEnableDomain(1, subscriber, CUPTI_CB_DOMAIN_RUNTIME_API));

	// Allocate input vectors h_A and h_B in host memory
	h_A = (int*)malloc(size);
	h_B = (int*)malloc(size);
	h_C = (int*)malloc(size);
	if (!h_A \|\| !h_B \|\| !h_C) {
	printf("Error: out of memory\n");
	exit(EXIT_FAILURE);
	}

	// Initialize input vectors
	initVec(h_A, N);
	initVec(h_B, N);
	memset(h_C, 0, size);

	// Allocate vectors in device memory
	RUNTIME_API_CALL(cudaMalloc((void**)&d_A, size));
	RUNTIME_API_CALL(cudaMalloc((void**)&d_B, size));
	RUNTIME_API_CALL(cudaMalloc((void**)&d_C, size));

	// Copy vectors from host memory to device memory
	RUNTIME_API_CALL(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
	RUNTIME_API_CALL(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice));

	// Invoke kernel
	threadsPerBlock = 256;
	blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

	VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
	RUNTIME_API_CALL(cudaDeviceSynchronize());

	// Copy result from device memory to host memory
	// h_C contains the result in host memory
	RUNTIME_API_CALL(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost));

	// Verify result
	for (i = 0; i < N; ++i) {
	sum = h_A[i] + h_B[i];
	if (h_C[i] != sum) {
	printf("kernel execution FAILED\n");
	goto Error;
	}
	}

	// display timestamps collected in the callback
	displayTimestamps(trace);

	CUPTI_CALL(cuptiUnsubscribe(subscriber));

	cleanUp(h_A, h_B, h_C, d_A, d_B, d_C);
	RUNTIME_API_CALL(cudaDeviceSynchronize());
	exit(EXIT_SUCCESS);

	Error:
	cleanUp(h_A, h_B, h_C, d_A, d_B, d_C);
	RUNTIME_API_CALL(cudaDeviceSynchronize());
	exit(EXIT_FAILURE);
	}