MNghia's picture
Add files using upload-large-folder tool
6be3106 verified
/*
* Copyright 2010-2021 NVIDIA Corporation. All rights reserved
*
* Sample app to demonstrate use of CUPTI library to obtain timestamps
* using callbacks for CUDA runtime APIs
*
*/
#include <stdio.h>
#include <cuda.h>
#include <cupti.h>
#include <stdlib.h>
#ifndef EXIT_WAIVED
#define EXIT_WAIVED 2
#endif
#define DRIVER_API_CALL(apiFuncCall) \
do { \
CUresult _status = apiFuncCall; \
if (_status != CUDA_SUCCESS) { \
const char* errstr; \
cuGetErrorString(_status, &errstr); \
fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \
__FILE__, __LINE__, #apiFuncCall, errstr); \
exit(EXIT_FAILURE); \
} \
} while (0)
#define RUNTIME_API_CALL(apiFuncCall) \
do { \
cudaError_t _status = apiFuncCall; \
if (_status != cudaSuccess) { \
fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \
__FILE__, __LINE__, #apiFuncCall, cudaGetErrorString(_status));\
exit(EXIT_FAILURE); \
} \
} while (0)
#define CUPTI_CALL(call) \
do { \
CUptiResult _status = call; \
if (_status != CUPTI_SUCCESS) { \
const char* errstr; \
cuptiGetResultString(_status, &errstr); \
fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \
__FILE__, __LINE__, #call, errstr); \
exit(EXIT_FAILURE); \
} \
} while (0)
// Structure to hold data collected by callback
typedef struct RuntimeApiTrace_st {
const char *functionName;
uint64_t startTimestamp;
uint64_t endTimestamp;
size_t memcpy_bytes;
enum cudaMemcpyKind memcpy_kind;
} RuntimeApiTrace_t;
enum launchOrder{ MEMCPY_H2D1, MEMCPY_H2D2, MEMCPY_D2H, KERNEL, THREAD_SYNC, LAUNCH_LAST};
// Vector addition kernel
__global__ void
VecAdd(const int* A, const int* B, int* C, int N)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < N)
C[i] = A[i] + B[i];
}
// Initialize a vector
static void
initVec(int *vec, int n)
{
for (int i = 0; i < n; i++)
vec[i] = i;
}
void CUPTIAPI
getTimestampCallback(void *userdata, CUpti_CallbackDomain domain,
CUpti_CallbackId cbid, const CUpti_CallbackData *cbInfo)
{
static int memTransCount = 0;
uint64_t startTimestamp;
uint64_t endTimestamp;
RuntimeApiTrace_t *traceData = (RuntimeApiTrace_t*)userdata;
// Data is collected only for the following API
if ((cbid == CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020) ||
(cbid == CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000) ||
(cbid == CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSynchronize_v3020) ||
(cbid == CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020)) {
// Set pointer depending on API
if ((cbid == CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020) ||
(cbid == CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000))
{
traceData = traceData + KERNEL;
}
else if (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSynchronize_v3020)
traceData = traceData + THREAD_SYNC;
else if (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020)
traceData = traceData + MEMCPY_H2D1 + memTransCount;
if (cbInfo->callbackSite == CUPTI_API_ENTER) {
// for a kernel launch report the kernel name, otherwise use the API
// function name.
if (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020 ||
cbid == CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000)
{
traceData->functionName = cbInfo->symbolName;
}
else {
traceData->functionName = cbInfo->functionName;
}
// Store parameters passed to cudaMemcpy
if (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020) {
traceData->memcpy_bytes = ((cudaMemcpy_v3020_params *)(cbInfo->functionParams))->count;
traceData->memcpy_kind = ((cudaMemcpy_v3020_params *)(cbInfo->functionParams))->kind;
}
// Collect timestamp for API start
CUPTI_CALL(cuptiGetTimestamp(&startTimestamp));
traceData->startTimestamp = startTimestamp;
}
if (cbInfo->callbackSite == CUPTI_API_EXIT) {
// Collect timestamp for API exit
CUPTI_CALL(cuptiGetTimestamp(&endTimestamp));
traceData->endTimestamp = endTimestamp;
// Advance to the next memory transfer operation
if (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020) {
memTransCount++;
}
}
}
}
static const char *
memcpyKindStr(enum cudaMemcpyKind kind)
{
switch (kind) {
case cudaMemcpyHostToDevice:
return "HostToDevice";
case cudaMemcpyDeviceToHost:
return "DeviceToHost";
default:
break;
}
return "<unknown>";
}
static void
displayTimestamps(RuntimeApiTrace_t *trace)
{
// Calculate timestamp of kernel based on timestamp from
// cudaDeviceSynchronize() call
trace[KERNEL].endTimestamp = trace[THREAD_SYNC].endTimestamp;
printf("startTimeStamp/Duration reported in nano-seconds\n\n");
printf("Name\t\tStart Time\t\tDuration\tBytes\tKind\n");
printf("%s\t%llu\t%llu\t\t%llu\t%s\n", trace[MEMCPY_H2D1].functionName,
(unsigned long long)trace[MEMCPY_H2D1].startTimestamp,
(unsigned long long)trace[MEMCPY_H2D1].endTimestamp - trace[MEMCPY_H2D1].startTimestamp,
(unsigned long long)trace[MEMCPY_H2D1].memcpy_bytes,
memcpyKindStr(trace[MEMCPY_H2D1].memcpy_kind));
printf("%s\t%llu\t%llu\t\t%llu\t%s\n", trace[MEMCPY_H2D2].functionName,
(unsigned long long)trace[MEMCPY_H2D2].startTimestamp,
(unsigned long long)trace[MEMCPY_H2D2].endTimestamp - trace[MEMCPY_H2D2].startTimestamp,
(unsigned long long)trace[MEMCPY_H2D2].memcpy_bytes,
memcpyKindStr(trace[MEMCPY_H2D2].memcpy_kind));
printf("%s\t%llu\t%llu\t\tNA\tNA\n", trace[KERNEL].functionName,
(unsigned long long)trace[KERNEL].startTimestamp,
(unsigned long long)trace[KERNEL].endTimestamp - trace[KERNEL].startTimestamp);
printf("%s\t%llu\t%llu\t\t%llu\t%s\n", trace[MEMCPY_D2H].functionName,
(unsigned long long)trace[MEMCPY_D2H].startTimestamp,
(unsigned long long)trace[MEMCPY_D2H].endTimestamp - trace[MEMCPY_D2H].startTimestamp,
(unsigned long long)trace[MEMCPY_D2H].memcpy_bytes,
memcpyKindStr(trace[MEMCPY_D2H].memcpy_kind));
}
static void
cleanUp(int *h_A, int *h_B, int *h_C, int *d_A, int *d_B, int *d_C)
{
if (d_A)
RUNTIME_API_CALL(cudaFree(d_A));
if (d_B)
RUNTIME_API_CALL(cudaFree(d_B));
if (d_C)
RUNTIME_API_CALL(cudaFree(d_C));
// Free host memory
if (h_A)
free(h_A);
if (h_B)
free(h_B);
if (h_C)
free(h_C);
}
int
main()
{
CUcontext context = 0;
CUdevice device = 0;
int N = 50000;
size_t size = N * sizeof(int);
int threadsPerBlock = 0;
int blocksPerGrid = 0;
int sum, i;
int *h_A, *h_B, *h_C;
int *d_A, *d_B, *d_C;
CUpti_SubscriberHandle subscriber;
RuntimeApiTrace_t trace[LAUNCH_LAST];
// subscribe to CUPTI callbacks
CUPTI_CALL(cuptiSubscribe(&subscriber, (CUpti_CallbackFunc)getTimestampCallback , &trace));
DRIVER_API_CALL(cuInit(0));
DRIVER_API_CALL(cuCtxCreate(&context, 0, device));
// Enable all callbacks for CUDA Runtime APIs.
// Callback will be invoked at the entry and exit points of each of the CUDA Runtime API
CUPTI_CALL(cuptiEnableDomain(1, subscriber, CUPTI_CB_DOMAIN_RUNTIME_API));
// Allocate input vectors h_A and h_B in host memory
h_A = (int*)malloc(size);
h_B = (int*)malloc(size);
h_C = (int*)malloc(size);
if (!h_A || !h_B || !h_C) {
printf("Error: out of memory\n");
exit(EXIT_FAILURE);
}
// Initialize input vectors
initVec(h_A, N);
initVec(h_B, N);
memset(h_C, 0, size);
// Allocate vectors in device memory
RUNTIME_API_CALL(cudaMalloc((void**)&d_A, size));
RUNTIME_API_CALL(cudaMalloc((void**)&d_B, size));
RUNTIME_API_CALL(cudaMalloc((void**)&d_C, size));
// Copy vectors from host memory to device memory
RUNTIME_API_CALL(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
RUNTIME_API_CALL(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice));
// Invoke kernel
threadsPerBlock = 256;
blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
RUNTIME_API_CALL(cudaDeviceSynchronize());
// Copy result from device memory to host memory
// h_C contains the result in host memory
RUNTIME_API_CALL(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost));
// Verify result
for (i = 0; i < N; ++i) {
sum = h_A[i] + h_B[i];
if (h_C[i] != sum) {
printf("kernel execution FAILED\n");
goto Error;
}
}
// display timestamps collected in the callback
displayTimestamps(trace);
CUPTI_CALL(cuptiUnsubscribe(subscriber));
cleanUp(h_A, h_B, h_C, d_A, d_B, d_C);
RUNTIME_API_CALL(cudaDeviceSynchronize());
exit(EXIT_SUCCESS);
Error:
cleanUp(h_A, h_B, h_C, d_A, d_B, d_C);
RUNTIME_API_CALL(cudaDeviceSynchronize());
exit(EXIT_FAILURE);
}