/*
 * Copyright 2020 NVIDIA Corporation. All rights reserved
 *
 * Sample CUPTI app to print the trace of CUDA graphs and correlate
 * the graph node launch to the node creation API using CUPTI callbacks.
 */
#include <cuda.h>
#include <cuda_runtime.h>
#include <cupti.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

#define DRIVER_API_CALL(apiFuncCall)                                           \
do {                                                                           \
    CUresult _status = apiFuncCall;                                            \
    if (_status != CUDA_SUCCESS) {                                             \
        const char* errstr;                                                    \
        cuGetErrorString(_status, &errstr);                                    \
        fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n",   \
                __FILE__, __LINE__, #apiFuncCall, errstr);                     \
        exit(EXIT_FAILURE);                                                    \
    }                                                                          \
} while (0)

#define RUNTIME_API_CALL(apiFuncCall)                                          \
do {                                                                           \
    cudaError_t _status = apiFuncCall;                                         \
    if (_status != cudaSuccess) {                                              \
        fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n",   \
                __FILE__, __LINE__, #apiFuncCall, cudaGetErrorString(_status));\
        exit(EXIT_FAILURE);                                                    \
    }                                                                          \
} while (0)

#define CUPTI_CALL(call)                                                    \
  do {                                                                      \
    CUptiResult _status = call;                                             \
    if (_status != CUPTI_SUCCESS) {                                         \
      const char *errstr;                                                   \
      cuptiGetResultString(_status, &errstr);                               \
      fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n",  \
              __FILE__, __LINE__, #call, errstr);                           \
      exit(EXIT_FAILURE);                                                   \
    }                                                                       \
  } while (0)

#define COMPUTE_N 50000

extern void initTrace(void);
extern void finiTrace(void);

// Kernels
__global__ void
VecAdd(const int* A, const int* B, int* C, int N)
{
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < N)
    C[i] = A[i] + B[i];
}

__global__ void
VecSub(const int* A, const int* B, int* C, int N)
{
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < N)
    C[i] = A[i] - B[i];
}

static void
do_pass(cudaStream_t stream)
{
  int *h_A, *h_B, *h_C;
  int *d_A, *d_B, *d_C;
  size_t size = COMPUTE_N * sizeof(int);
  int threadsPerBlock = 256;
  int blocksPerGrid = 0;
  cudaKernelNodeParams kernelParams;
  cudaMemcpy3DParms memcpyParams = {0};
  cudaGraph_t graph;
  cudaGraphExec_t graphExec;
  cudaGraphNode_t nodes[5];

  // Allocate input vectors h_A and h_B in host memory
  // don't bother to initialize
  h_A = (int*)malloc(size);
  h_B = (int*)malloc(size);
  h_C = (int*)malloc(size);
  if (!h_A || !h_B || !h_C) {
    printf("Error: out of memory\n");
    exit(EXIT_FAILURE);
  }

  // Allocate vectors in device memory
  RUNTIME_API_CALL(cudaMalloc((void**)&d_A, size));
  RUNTIME_API_CALL(cudaMalloc((void**)&d_B, size));
  RUNTIME_API_CALL(cudaMalloc((void**)&d_C, size));

  RUNTIME_API_CALL(cudaGraphCreate(&graph, 0));

  // Init memcpy params
  memcpyParams.kind = cudaMemcpyHostToDevice;
  memcpyParams.srcPtr.ptr = h_A;
  memcpyParams.dstPtr.ptr = d_A;
  memcpyParams.extent.width = size;
  memcpyParams.extent.height = 1;
  memcpyParams.extent.depth = 1;
  RUNTIME_API_CALL(cudaGraphAddMemcpyNode(&nodes[0], graph, NULL, 0, &memcpyParams));

  memcpyParams.srcPtr.ptr = h_B;
  memcpyParams.dstPtr.ptr = d_B;
  RUNTIME_API_CALL(cudaGraphAddMemcpyNode(&nodes[1], graph, NULL, 0, &memcpyParams));

  // Init kernel params
  int num = COMPUTE_N;
  void* kernelArgs[] = {(void *)&d_A, (void *)&d_B, (void *)&d_C, (void *)&num};
  blocksPerGrid = (COMPUTE_N + threadsPerBlock - 1) / threadsPerBlock;
  kernelParams.func = (void *)VecAdd;
  kernelParams.gridDim = dim3(blocksPerGrid, 1, 1);
  kernelParams.blockDim = dim3(threadsPerBlock, 1, 1);
  kernelParams.sharedMemBytes = 0;
  kernelParams.kernelParams = (void **)kernelArgs;
  kernelParams.extra = NULL;

  RUNTIME_API_CALL(cudaGraphAddKernelNode(&nodes[2], graph, &nodes[0], 2, &kernelParams));

  kernelParams.func = (void *)VecSub;
  RUNTIME_API_CALL(cudaGraphAddKernelNode(&nodes[3], graph, &nodes[2], 1, &kernelParams));

  memcpyParams.kind = cudaMemcpyDeviceToHost;
  memcpyParams.srcPtr.ptr = d_C;
  memcpyParams.dstPtr.ptr = h_C;
  memcpyParams.extent.width = size;
  memcpyParams.extent.height = 1;
  memcpyParams.extent.depth = 1;
  RUNTIME_API_CALL(cudaGraphAddMemcpyNode(&nodes[4], graph, &nodes[3], 1, &memcpyParams));

  RUNTIME_API_CALL(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0));

  RUNTIME_API_CALL(cudaGraphLaunch(graphExec, stream));
  RUNTIME_API_CALL(cudaStreamSynchronize(stream));

  free(h_A);
  free(h_B);
  free(h_C);
  RUNTIME_API_CALL(cudaFree(d_A));
  RUNTIME_API_CALL(cudaFree(d_B));
  RUNTIME_API_CALL(cudaFree(d_C));
}

int
main(int argc, char *argv[])
{
  CUdevice device;
  char deviceName[256];

  // initialize the activity trace
  initTrace();

  DRIVER_API_CALL(cuInit(0));

  DRIVER_API_CALL(cuDeviceGet(&device, 0));
  DRIVER_API_CALL(cuDeviceGetName(deviceName, 256, device));
  printf("Device Name: %s\n", deviceName);

  RUNTIME_API_CALL(cudaSetDevice(0));

  // do pass with user stream
  cudaStream_t stream0;
  RUNTIME_API_CALL(cudaStreamCreate(&stream0));
  do_pass(stream0);

  RUNTIME_API_CALL(cudaDeviceSynchronize());

  // Flush CUPTI buffers before resetting the device.
  // This can also be called in the cudaDeviceReset callback.
  CUPTI_CALL(cuptiActivityFlushAll(0));
  RUNTIME_API_CALL(cudaDeviceReset());

  finiTrace();
  exit(EXIT_SUCCESS);
}