// // Copyright 2020 NVIDIA Corporation. All rights reserved // #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef EXIT_WAIVED #define EXIT_WAIVED 2 #endif #define NVPW_API_CALL(apiFuncCall) \ do { \ NVPA_Status _status = apiFuncCall; \ if (_status != NVPA_STATUS_SUCCESS) { \ fprintf(stderr, "%s:%d: error: function %s failed with error %d.\n", \ __FILE__, __LINE__, #apiFuncCall, _status); \ exit(EXIT_FAILURE); \ } \ } while (0) #define CUPTI_API_CALL(apiFuncCall) \ do { \ CUptiResult _status = apiFuncCall; \ if (_status != CUPTI_SUCCESS) { \ const char *errstr; \ cuptiGetResultString(_status, &errstr); \ fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \ __FILE__, __LINE__, #apiFuncCall, errstr); \ exit(EXIT_FAILURE); \ } \ } while (0) #define DRIVER_API_CALL(apiFuncCall) \ do { \ CUresult _status = apiFuncCall; \ if (_status != CUDA_SUCCESS) { \ fprintf(stderr, "%s:%d: error: function %s failed with error %d.\n", \ __FILE__, __LINE__, #apiFuncCall, _status); \ exit(EXIT_FAILURE); \ } \ } while (0) #define RUNTIME_API_CALL(apiFuncCall) \ do { \ cudaError_t _status = apiFuncCall; \ if (_status != cudaSuccess) { \ fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \ __FILE__, __LINE__, #apiFuncCall, cudaGetErrorString(_status));\ exit(EXIT_FAILURE); \ } \ } while (0) #define METRIC_NAME "sm__ctas_launched.sum" struct ProfilingData_t { int numRanges = 2; bool bProfiling = false; std::string chipName; std::vector metricNames; std::string CounterDataFileName = "SimpleCupti.counterdata"; std::string CounterDataSBFileName = "SimpleCupti.counterdataSB"; CUpti_ProfilerRange profilerRange = CUPTI_AutoRange; CUpti_ProfilerReplayMode profilerReplayMode = CUPTI_UserReplay; bool allPassesSubmitted = true; std::vector counterDataImagePrefix; std::vector configImage; std::vector counterDataImage; std::vector counterDataScratchBuffer; }; __global__ void VecAdd(const int* A, const int* B, int* C, int N) { int i = blockDim.x * blockIdx.x + threadIdx.x; if (i < N) C[i] = A[i] + B[i]; } void enableProfiling(ProfilingData_t* pProfilingData) { CUpti_Profiler_EnableProfiling_Params enableProfilingParams = { CUpti_Profiler_EnableProfiling_Params_STRUCT_SIZE }; if (pProfilingData->profilerReplayMode == CUPTI_KernelReplay) { CUPTI_API_CALL(cuptiProfilerEnableProfiling(&enableProfilingParams)); } else if (pProfilingData->profilerReplayMode == CUPTI_UserReplay) { CUpti_Profiler_BeginPass_Params beginPassParams = { CUpti_Profiler_BeginPass_Params_STRUCT_SIZE }; CUPTI_API_CALL(cuptiProfilerBeginPass(&beginPassParams)); CUPTI_API_CALL(cuptiProfilerEnableProfiling(&enableProfilingParams)); } } void disableProfiling(ProfilingData_t* pProfilingData) { CUpti_Profiler_DisableProfiling_Params disableProfilingParams = { CUpti_Profiler_DisableProfiling_Params_STRUCT_SIZE }; CUPTI_API_CALL(cuptiProfilerDisableProfiling(&disableProfilingParams)); if (pProfilingData->profilerReplayMode == CUPTI_UserReplay) { CUpti_Profiler_EndPass_Params endPassParams = { CUpti_Profiler_EndPass_Params_STRUCT_SIZE }; CUPTI_API_CALL(cuptiProfilerEndPass(&endPassParams)); pProfilingData->allPassesSubmitted = (endPassParams.allPassesSubmitted == 1) ? true : false; } else if (pProfilingData->profilerReplayMode == CUPTI_KernelReplay) { pProfilingData->allPassesSubmitted = true; } if (pProfilingData->allPassesSubmitted) { CUpti_Profiler_FlushCounterData_Params flushCounterDataParams = { CUpti_Profiler_FlushCounterData_Params_STRUCT_SIZE }; CUPTI_API_CALL(cuptiProfilerFlushCounterData(&flushCounterDataParams)); } } void beginSession(ProfilingData_t* pProfilingData) { CUpti_Profiler_BeginSession_Params beginSessionParams = { CUpti_Profiler_BeginSession_Params_STRUCT_SIZE }; beginSessionParams.ctx = NULL; beginSessionParams.counterDataImageSize = pProfilingData->counterDataImage.size(); beginSessionParams.pCounterDataImage = &pProfilingData->counterDataImage[0]; beginSessionParams.counterDataScratchBufferSize = pProfilingData->counterDataScratchBuffer.size(); beginSessionParams.pCounterDataScratchBuffer = &pProfilingData->counterDataScratchBuffer[0]; beginSessionParams.range = pProfilingData->profilerRange; beginSessionParams.replayMode = pProfilingData->profilerReplayMode; beginSessionParams.maxRangesPerPass = pProfilingData->numRanges; beginSessionParams.maxLaunchesPerPass = pProfilingData->numRanges; CUPTI_API_CALL(cuptiProfilerBeginSession(&beginSessionParams)); } void setConfig(ProfilingData_t* pProfilingData) { CUpti_Profiler_SetConfig_Params setConfigParams = { CUpti_Profiler_SetConfig_Params_STRUCT_SIZE }; setConfigParams.pConfig = &pProfilingData->configImage[0]; setConfigParams.configSize = pProfilingData->configImage.size(); setConfigParams.passIndex = 0; CUPTI_API_CALL(cuptiProfilerSetConfig(&setConfigParams)); } void createCounterDataImage(int numRanges, std::vector& counterDataImagePrefix, std::vector& counterDataScratchBuffer, std::vector& counterDataImage ) { CUpti_Profiler_CounterDataImageOptions counterDataImageOptions; counterDataImageOptions.pCounterDataPrefix = &counterDataImagePrefix[0]; counterDataImageOptions.counterDataPrefixSize = counterDataImagePrefix.size(); counterDataImageOptions.maxNumRanges = numRanges; counterDataImageOptions.maxNumRangeTreeNodes = numRanges; counterDataImageOptions.maxRangeNameLength = 64; CUpti_Profiler_CounterDataImage_CalculateSize_Params calculateSizeParams = { CUpti_Profiler_CounterDataImage_CalculateSize_Params_STRUCT_SIZE }; calculateSizeParams.pOptions = &counterDataImageOptions; calculateSizeParams.sizeofCounterDataImageOptions = CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE; CUPTI_API_CALL(cuptiProfilerCounterDataImageCalculateSize(&calculateSizeParams)); CUpti_Profiler_CounterDataImage_Initialize_Params initializeParams = { CUpti_Profiler_CounterDataImage_Initialize_Params_STRUCT_SIZE }; initializeParams.sizeofCounterDataImageOptions = CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE; initializeParams.pOptions = &counterDataImageOptions; initializeParams.counterDataImageSize = calculateSizeParams.counterDataImageSize; counterDataImage.resize(calculateSizeParams.counterDataImageSize); initializeParams.pCounterDataImage = &counterDataImage[0]; CUPTI_API_CALL(cuptiProfilerCounterDataImageInitialize(&initializeParams)); CUpti_Profiler_CounterDataImage_CalculateScratchBufferSize_Params scratchBufferSizeParams = { CUpti_Profiler_CounterDataImage_CalculateScratchBufferSize_Params_STRUCT_SIZE }; scratchBufferSizeParams.counterDataImageSize = calculateSizeParams.counterDataImageSize; scratchBufferSizeParams.pCounterDataImage = initializeParams.pCounterDataImage; CUPTI_API_CALL(cuptiProfilerCounterDataImageCalculateScratchBufferSize(&scratchBufferSizeParams)); counterDataScratchBuffer.resize(scratchBufferSizeParams.counterDataScratchBufferSize); CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params initScratchBufferParams = { CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params_STRUCT_SIZE }; initScratchBufferParams.counterDataImageSize = calculateSizeParams.counterDataImageSize; initScratchBufferParams.pCounterDataImage = initializeParams.pCounterDataImage; initScratchBufferParams.counterDataScratchBufferSize = scratchBufferSizeParams.counterDataScratchBufferSize; initScratchBufferParams.pCounterDataScratchBuffer = &counterDataScratchBuffer[0]; CUPTI_API_CALL(cuptiProfilerCounterDataImageInitializeScratchBuffer(&initScratchBufferParams)); } void setupProfiling(ProfilingData_t* pProfilingData) { /* Generate configuration for metrics, this can also be done offline*/ NVPW_InitializeHost_Params initializeHostParams = { NVPW_InitializeHost_Params_STRUCT_SIZE }; NVPW_API_CALL(NVPW_InitializeHost(&initializeHostParams)); if (pProfilingData->metricNames.size()) { if (!NV::Metric::Config::GetConfigImage(pProfilingData->chipName, pProfilingData->metricNames, pProfilingData->configImage)) { std::cout << "Failed to create configImage" << std::endl; exit(EXIT_FAILURE); } if (!NV::Metric::Config::GetCounterDataPrefixImage(pProfilingData->chipName, pProfilingData->metricNames, pProfilingData->counterDataImagePrefix)) { std::cout << "Failed to create counterDataImagePrefix" << std::endl; exit(EXIT_FAILURE); } } else { std::cout << "No metrics provided to profile" << std::endl; exit(EXIT_FAILURE); } createCounterDataImage(pProfilingData->numRanges, pProfilingData->counterDataImagePrefix, pProfilingData->counterDataScratchBuffer, pProfilingData->counterDataImage); beginSession(pProfilingData); setConfig(pProfilingData); } void stopProfiling(ProfilingData_t* pProfilingData) { CUpti_Profiler_UnsetConfig_Params unsetConfigParams = { CUpti_Profiler_UnsetConfig_Params_STRUCT_SIZE }; CUpti_Profiler_EndSession_Params endSessionParams = { CUpti_Profiler_EndSession_Params_STRUCT_SIZE }; CUpti_Profiler_DeInitialize_Params profilerDeInitializeParams = {CUpti_Profiler_DeInitialize_Params_STRUCT_SIZE}; CUPTI_API_CALL(cuptiProfilerUnsetConfig(&unsetConfigParams)); CUPTI_API_CALL(cuptiProfilerEndSession(&endSessionParams)); CUPTI_API_CALL(cuptiProfilerDeInitialize(&profilerDeInitializeParams)); // Dump counterDataImage and counterDataScratchBuffer in file. WriteBinaryFile(pProfilingData->CounterDataFileName.c_str(), pProfilingData->counterDataImage); WriteBinaryFile(pProfilingData->CounterDataSBFileName.c_str(), pProfilingData->counterDataScratchBuffer); } void callbackHandler(void* userdata, CUpti_CallbackDomain domain, CUpti_CallbackId cbid, void* cbdata) { ProfilingData_t* profilingData = (ProfilingData_t*)(userdata); const CUpti_CallbackData* cbInfo = (CUpti_CallbackData*)cbdata; switch (domain) { case CUPTI_CB_DOMAIN_DRIVER_API: switch (cbid) { case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel: { if (cbInfo->callbackSite == CUPTI_API_ENTER) { enableProfiling(profilingData); } else { disableProfiling(profilingData); } } break; default: break; } break; case CUPTI_CB_DOMAIN_RESOURCE: switch (cbid) { case CUPTI_CBID_RESOURCE_CONTEXT_CREATED: { setupProfiling(profilingData); profilingData->bProfiling = true; } break; default: break; } break; default: break; } } void initVec(int* vec, int n) { for (int i = 0; i < n; i++) vec[i] = i; } void cleanUp(int* h_A, int* h_B, int* h_C, int* d_A, int* d_B, int* d_C) { if (d_A) RUNTIME_API_CALL(cudaFree(d_A)); if (d_B) RUNTIME_API_CALL(cudaFree(d_B)); if (d_C) RUNTIME_API_CALL(cudaFree(d_C)); // Free host memory if (h_A) free(h_A); if (h_B) free(h_B); if (h_C) free(h_C); } void VectorAdd() { int N = 50000; size_t size = N * sizeof(int); int threadsPerBlock = 0; int blocksPerGrid = 0; int* h_A, * h_B, * h_C; int* d_A, * d_B, * d_C; int i, sum; // Allocate input vectors h_A and h_B in host memory h_A = (int*)malloc(size); h_B = (int*)malloc(size); h_C = (int*)malloc(size); // Initialize input vectors initVec(h_A, N); initVec(h_B, N); memset(h_C, 0, size); // Allocate vectors in device memory RUNTIME_API_CALL(cudaMalloc((void**)&d_A, size)); RUNTIME_API_CALL(cudaMalloc((void**)&d_B, size)); RUNTIME_API_CALL(cudaMalloc((void**)&d_C, size)); // Copy vectors from host memory to device memory RUNTIME_API_CALL(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice)); RUNTIME_API_CALL(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice)); // Invoke kernel threadsPerBlock = 256; blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; printf("Launching kernel: blocks %d, thread/block %d\n", blocksPerGrid, threadsPerBlock); VecAdd << > > (d_A, d_B, d_C, N); // Copy result from device memory to host memory // h_C contains the result in host memory RUNTIME_API_CALL(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost)); // Verify result for (i = 0; i < N; ++i) { sum = h_A[i] + h_B[i]; if (h_C[i] != sum) { fprintf(stderr, "error: result verification failed\n"); exit(EXIT_FAILURE); } } cleanUp(h_A, h_B, h_C, d_A, d_B, d_C); } int main(int argc, char* argv[]) { CUdevice cuDevice = 0; int deviceCount, deviceNum = 0; int computeCapabilityMajor = 0, computeCapabilityMinor = 0; DRIVER_API_CALL(cuInit(0)); DRIVER_API_CALL(cuDeviceGetCount(&deviceCount)); if (deviceCount == 0) { printf("There is no device supporting CUDA.\n"); exit(EXIT_WAIVED); } DRIVER_API_CALL(cuDeviceGet(&cuDevice, deviceNum)); DRIVER_API_CALL(cuDeviceGetAttribute(&computeCapabilityMajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice)); DRIVER_API_CALL(cuDeviceGetAttribute(&computeCapabilityMinor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice)); printf("Compute Capability of Device: %d.%d\n", computeCapabilityMajor, computeCapabilityMinor); // Initialize profiler API support and test device compatibility CUpti_Profiler_Initialize_Params profilerInitializeParams = {CUpti_Profiler_Initialize_Params_STRUCT_SIZE}; CUPTI_API_CALL(cuptiProfilerInitialize(&profilerInitializeParams)); CUpti_Profiler_DeviceSupported_Params params = { CUpti_Profiler_DeviceSupported_Params_STRUCT_SIZE }; params.cuDevice = deviceNum; CUPTI_API_CALL(cuptiProfilerDeviceSupported(¶ms)); if (params.isSupported != CUPTI_PROFILER_CONFIGURATION_SUPPORTED) { ::std::cerr << "Unable to profile on device " << deviceNum << ::std::endl; if (params.architecture == CUPTI_PROFILER_CONFIGURATION_UNSUPPORTED) { ::std::cerr << "\tdevice architecture is not supported" << ::std::endl; } if (params.sli == CUPTI_PROFILER_CONFIGURATION_UNSUPPORTED) { ::std::cerr << "\tdevice sli configuration is not supported" << ::std::endl; } if (params.vGpu == CUPTI_PROFILER_CONFIGURATION_UNSUPPORTED) { ::std::cerr << "\tdevice vgpu configuration is not supported" << ::std::endl; } else if (params.vGpu == CUPTI_PROFILER_CONFIGURATION_DISABLED) { ::std::cerr << "\tdevice vgpu configuration disabled profiling support" << ::std::endl; } if (params.confidentialCompute == CUPTI_PROFILER_CONFIGURATION_UNSUPPORTED) { ::std::cerr << "\tdevice confidential compute configuration is not supported" << ::std::endl; } if (params.cmp == CUPTI_PROFILER_CONFIGURATION_UNSUPPORTED) { ::std::cerr << "\tNVIDIA Crypto Mining Processors (CMP) are not supported" << ::std::endl; } exit(EXIT_WAIVED); } ProfilingData_t* profilingData = new ProfilingData_t(); for (int i = 1; i < argc; ++i) { char* arg = argv[i]; if (strcmp(arg, "--help") == 0 || strcmp(arg, "-h") == 0) { printf("Usage: %s -d [device_num] -m [metric_names comma separated] -n [num of ranges] -r [kernel or user] -o [counterdata filename]\n", argv[0]); exit(EXIT_SUCCESS); } if (strcmp(arg, "--device") == 0 || strcmp(arg, "-d") == 0) { deviceNum = atoi(argv[i + 1]); printf("CUDA Device Number: %d\n", deviceNum); i++; } else if (strcmp(arg, "--metrics") == 0 || strcmp(arg, "-m") == 0) { char* metricName = strtok(argv[i + 1], ","); while (metricName != NULL) { profilingData->metricNames.push_back(metricName); metricName = strtok(NULL, ","); } i++; } else if (strcmp(arg, "--numRanges") == 0 || strcmp(arg, "-n") == 0) { int numRanges = atoi(argv[i + 1]); profilingData->numRanges = numRanges; i++; } else if (strcmp(arg, "--replayMode") == 0 || strcmp(arg, "-r") == 0) { std::string replayMode(argv[i + 1]); if (replayMode == "kernel") profilingData->profilerReplayMode = CUPTI_KernelReplay; else if (replayMode == "user") profilingData->profilerReplayMode = CUPTI_UserReplay; else { printf("Invalid --replayMode argument supported replayMode type 'kernel' or 'user'\n"); exit(EXIT_FAILURE); } i++; } else if (strcmp(arg, "--outputCounterData") == 0 || strcmp(arg, "-o") == 0) { std::string outputCounterData(argv[i + 1]); profilingData->CounterDataFileName = outputCounterData; profilingData->CounterDataSBFileName = outputCounterData + "SB"; i++; } else { printf("Error!! Invalid Arguments\n"); printf("Usage: %s -d [device_num] -m [metric_names comma separated] -n [num of ranges] -r [kernel or user] -o [counterdata filename]\n", argv[0]); exit(EXIT_FAILURE); } } if (profilingData->metricNames.empty()) profilingData->metricNames.push_back(METRIC_NAME); CUpti_Device_GetChipName_Params getChipNameParams = { CUpti_Device_GetChipName_Params_STRUCT_SIZE }; getChipNameParams.deviceIndex = deviceNum; CUPTI_API_CALL(cuptiDeviceGetChipName(&getChipNameParams)); profilingData->chipName = getChipNameParams.pChipName; CUpti_SubscriberHandle subscriber; CUPTI_API_CALL(cuptiSubscribe(&subscriber, (CUpti_CallbackFunc)callbackHandler, profilingData)); CUPTI_API_CALL(cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_RESOURCE, CUPTI_CBID_RESOURCE_CONTEXT_CREATED)); CUPTI_API_CALL(cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_DRIVER_API, CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel)); do { VectorAdd(); } while (!profilingData->allPassesSubmitted); if (profilingData->bProfiling) { stopProfiling(profilingData); profilingData->bProfiling = false; /* Evaluation of metrics collected in counterDataImage, this can also be done offline*/ NV::Metric::Eval::PrintMetricValues(profilingData->chipName, profilingData->counterDataImage, profilingData->metricNames); } delete profilingData; exit(EXIT_SUCCESS); }