|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include "cuda.h" |
|
|
#include "cuda_runtime_api.h" |
|
|
#include "cupti_profiler_target.h" |
|
|
#include "cupti_target.h" |
|
|
#include "driver_types.h" |
|
|
#include "nvperf_host.h" |
|
|
|
|
|
|
|
|
#include <Eval.h> |
|
|
using ::NV::Metric::Eval::PrintMetricValues; |
|
|
|
|
|
#include <Metric.h> |
|
|
using ::NV::Metric::Config::GetConfigImage; |
|
|
using ::NV::Metric::Config::GetCounterDataPrefixImage; |
|
|
|
|
|
#include <Utils.h> |
|
|
using ::NV::Metric::Utils::GetNVPWResultString; |
|
|
|
|
|
|
|
|
#include <stdlib.h> |
|
|
#include <chrono> |
|
|
#include <cstdint> |
|
|
#include <iostream> |
|
|
using ::std::cerr; |
|
|
using ::std::cout; |
|
|
using ::std::endl; |
|
|
|
|
|
#include <string> |
|
|
using ::std::string; |
|
|
|
|
|
#include <thread> |
|
|
using ::std::thread; |
|
|
|
|
|
#include <vector> |
|
|
using ::std::vector; |
|
|
|
|
|
#ifndef EXIT_WAIVED |
|
|
#define EXIT_WAIVED 2 |
|
|
#endif |
|
|
|
|
|
|
|
|
#define CUPTI_API_CALL(apiFuncCall) \ |
|
|
do { \ |
|
|
CUptiResult _status = apiFuncCall; \ |
|
|
if (_status != CUPTI_SUCCESS) { \ |
|
|
const char *errstr; \ |
|
|
cuptiGetResultString(_status, &errstr); \ |
|
|
fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \ |
|
|
__FILE__, __LINE__, #apiFuncCall, errstr); \ |
|
|
exit(EXIT_FAILURE); \ |
|
|
} \ |
|
|
} while (0) |
|
|
|
|
|
#define RUNTIME_API_CALL(apiFuncCall) \ |
|
|
do { \ |
|
|
cudaError_t _status = apiFuncCall; \ |
|
|
if (_status != cudaSuccess) { \ |
|
|
fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \ |
|
|
__FILE__, __LINE__, #apiFuncCall, cudaGetErrorString(_status));\ |
|
|
exit(EXIT_FAILURE); \ |
|
|
} \ |
|
|
} while (0) |
|
|
|
|
|
#define MEMORY_ALLOCATION_CALL(var) \ |
|
|
do { \ |
|
|
if (var == NULL) { \ |
|
|
fprintf(stderr, "%s:%d: Error: Memory Allocation Failed \n", \ |
|
|
__FILE__, __LINE__); \ |
|
|
exit(EXIT_FAILURE); \ |
|
|
} \ |
|
|
} while (0) |
|
|
|
|
|
#define DRIVER_API_CALL(apiFuncCall) \ |
|
|
do { \ |
|
|
CUresult _status = apiFuncCall; \ |
|
|
if (_status != CUDA_SUCCESS) { \ |
|
|
fprintf(stderr, "%s:%d: error: function %s failed with error %d.\n", \ |
|
|
__FILE__, __LINE__, #apiFuncCall, _status); \ |
|
|
exit(EXIT_FAILURE); \ |
|
|
} \ |
|
|
} while (0) |
|
|
|
|
|
#define NVPW_API_CALL(apiFuncCall) \ |
|
|
do { \ |
|
|
NVPA_Status _status = apiFuncCall; \ |
|
|
if (_status != NVPA_STATUS_SUCCESS) { \ |
|
|
fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \ |
|
|
__FILE__, __LINE__, #apiFuncCall, NV::Metric::Utils::GetNVPWResultString(_status)); \ |
|
|
exit(EXIT_FAILURE); \ |
|
|
} \ |
|
|
} while (0) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
typedef struct |
|
|
{ |
|
|
char const * chipName; |
|
|
int device; |
|
|
int maxLaunchesPerPass; |
|
|
int maxNumRanges; |
|
|
int maxRangeNameLength; |
|
|
int maxRangesPerPass; |
|
|
int minNestingLevels; |
|
|
int numNestingLevels; |
|
|
CUpti_ProfilerRange rangeMode; |
|
|
CUpti_ProfilerReplayMode replayMode; |
|
|
CUcontext context; |
|
|
} profilingConfig; |
|
|
|
|
|
|
|
|
typedef struct |
|
|
{ |
|
|
int deviceID; |
|
|
profilingConfig config; |
|
|
vector<uint8_t> counterDataImage; |
|
|
vector<uint8_t> counterDataPrefixImage; |
|
|
vector<uint8_t> counterDataScratchBufferImage; |
|
|
vector<uint8_t> configImage; |
|
|
vector<cudaStream_t> streams; |
|
|
vector<double *> d_x; |
|
|
vector<double *> d_y; |
|
|
} perDeviceData; |
|
|
|
|
|
bool explicitlyInitialized = false; |
|
|
|
|
|
|
|
|
void explicitInitialization() |
|
|
{ |
|
|
if (explicitlyInitialized == false) |
|
|
{ |
|
|
|
|
|
CUpti_Profiler_Initialize_Params profilerInitializeParams = { CUpti_Profiler_Initialize_Params_STRUCT_SIZE }; |
|
|
CUPTI_API_CALL(cuptiProfilerInitialize(&profilerInitializeParams)); |
|
|
|
|
|
|
|
|
NVPW_InitializeHost_Params initializeHostParams = { NVPW_InitializeHost_Params_STRUCT_SIZE }; |
|
|
NVPW_API_CALL(NVPW_InitializeHost(&initializeHostParams)); |
|
|
|
|
|
explicitlyInitialized = true; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void targetInitProfiling(perDeviceData &deviceData, |
|
|
vector<string> const &metricNames) |
|
|
{ |
|
|
|
|
|
explicitInitialization(); |
|
|
|
|
|
|
|
|
CUpti_Profiler_GetCounterAvailability_Params getCounterAvailabilityParams = { CUpti_Profiler_GetCounterAvailability_Params_STRUCT_SIZE }; |
|
|
getCounterAvailabilityParams.ctx = deviceData.config.context; |
|
|
CUPTI_API_CALL(cuptiProfilerGetCounterAvailability(&getCounterAvailabilityParams)); |
|
|
|
|
|
|
|
|
vector<uint8_t> counterAvailabilityImage; |
|
|
counterAvailabilityImage.resize(getCounterAvailabilityParams.counterAvailabilityImageSize); |
|
|
|
|
|
|
|
|
getCounterAvailabilityParams.pCounterAvailabilityImage = counterAvailabilityImage.data(); |
|
|
CUPTI_API_CALL(cuptiProfilerGetCounterAvailability(&getCounterAvailabilityParams)); |
|
|
|
|
|
|
|
|
CUpti_Device_GetChipName_Params getChipNameParams = { CUpti_Device_GetChipName_Params_STRUCT_SIZE }; |
|
|
getChipNameParams.deviceIndex = deviceData.config.device; |
|
|
CUPTI_API_CALL(cuptiDeviceGetChipName(&getChipNameParams)); |
|
|
deviceData.config.chipName = strdup(getChipNameParams.pChipName); |
|
|
|
|
|
|
|
|
if (!NV::Metric::Config::GetConfigImage(deviceData.config.chipName, metricNames, deviceData.configImage, counterAvailabilityImage.data())) |
|
|
{ |
|
|
cerr << "Failed to create configImage" << endl; |
|
|
exit(EXIT_FAILURE); |
|
|
} |
|
|
|
|
|
|
|
|
if (!NV::Metric::Config::GetCounterDataPrefixImage(deviceData.config.chipName, metricNames, deviceData.counterDataPrefixImage, counterAvailabilityImage.data())) |
|
|
{ |
|
|
cerr << "Failed to create counterDataPrefixImage" << endl; |
|
|
exit(EXIT_FAILURE); |
|
|
} |
|
|
|
|
|
|
|
|
CUpti_Profiler_CounterDataImageOptions counterDataImageOptions; |
|
|
counterDataImageOptions.pCounterDataPrefix = deviceData.counterDataPrefixImage.data(); |
|
|
counterDataImageOptions.counterDataPrefixSize = deviceData.counterDataPrefixImage.size(); |
|
|
counterDataImageOptions.maxNumRanges = deviceData.config.maxNumRanges; |
|
|
counterDataImageOptions.maxNumRangeTreeNodes = deviceData.config.maxNumRanges; |
|
|
counterDataImageOptions.maxRangeNameLength = deviceData.config.maxRangeNameLength; |
|
|
|
|
|
|
|
|
CUpti_Profiler_CounterDataImage_CalculateSize_Params calculateSizeParams = { CUpti_Profiler_CounterDataImage_CalculateSize_Params_STRUCT_SIZE }; |
|
|
calculateSizeParams.pOptions = &counterDataImageOptions; |
|
|
calculateSizeParams.sizeofCounterDataImageOptions = CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE; |
|
|
CUPTI_API_CALL(cuptiProfilerCounterDataImageCalculateSize(&calculateSizeParams)); |
|
|
|
|
|
deviceData.counterDataImage.resize(calculateSizeParams.counterDataImageSize); |
|
|
|
|
|
|
|
|
CUpti_Profiler_CounterDataImage_Initialize_Params initializeParams = { CUpti_Profiler_CounterDataImage_Initialize_Params_STRUCT_SIZE }; |
|
|
initializeParams.pOptions = &counterDataImageOptions; |
|
|
initializeParams.sizeofCounterDataImageOptions = CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE; |
|
|
initializeParams.counterDataImageSize = deviceData.counterDataImage.size(); |
|
|
initializeParams.pCounterDataImage = deviceData.counterDataImage.data(); |
|
|
CUPTI_API_CALL(cuptiProfilerCounterDataImageInitialize(&initializeParams)); |
|
|
|
|
|
|
|
|
CUpti_Profiler_CounterDataImage_CalculateScratchBufferSize_Params scratchBufferSizeParams = { CUpti_Profiler_CounterDataImage_CalculateScratchBufferSize_Params_STRUCT_SIZE }; |
|
|
scratchBufferSizeParams.counterDataImageSize = deviceData.counterDataImage.size(); |
|
|
scratchBufferSizeParams.pCounterDataImage = deviceData.counterDataImage.data(); |
|
|
CUPTI_API_CALL(cuptiProfilerCounterDataImageCalculateScratchBufferSize(&scratchBufferSizeParams)); |
|
|
|
|
|
deviceData.counterDataScratchBufferImage.resize(scratchBufferSizeParams.counterDataScratchBufferSize); |
|
|
|
|
|
|
|
|
CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params initScratchBufferParams = { CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params_STRUCT_SIZE }; |
|
|
initScratchBufferParams.counterDataImageSize = deviceData.counterDataImage.size(); |
|
|
initScratchBufferParams.pCounterDataImage = deviceData.counterDataImage.data(); |
|
|
initScratchBufferParams.counterDataScratchBufferSize = deviceData.counterDataScratchBufferImage.size();; |
|
|
initScratchBufferParams.pCounterDataScratchBuffer = deviceData.counterDataScratchBufferImage.data(); |
|
|
CUPTI_API_CALL(cuptiProfilerCounterDataImageInitializeScratchBuffer(&initScratchBufferParams)); |
|
|
} |
|
|
|
|
|
void startSession(profilingConfig &config, |
|
|
vector<uint8_t> &counterDataImage, |
|
|
vector<uint8_t> &counterDataScratchBuffer, |
|
|
vector<uint8_t> &configImage) |
|
|
{ |
|
|
|
|
|
explicitInitialization(); |
|
|
|
|
|
|
|
|
CUpti_Profiler_BeginSession_Params beginSessionParams = { CUpti_Profiler_BeginSession_Params_STRUCT_SIZE }; |
|
|
beginSessionParams.counterDataImageSize = counterDataImage.size(); |
|
|
beginSessionParams.pCounterDataImage = counterDataImage.data(); |
|
|
beginSessionParams.counterDataScratchBufferSize = counterDataScratchBuffer.size(); |
|
|
beginSessionParams.pCounterDataScratchBuffer = counterDataScratchBuffer.data(); |
|
|
beginSessionParams.ctx = config.context; |
|
|
beginSessionParams.maxLaunchesPerPass = config.maxLaunchesPerPass; |
|
|
beginSessionParams.maxRangesPerPass = config.maxRangesPerPass; |
|
|
beginSessionParams.pPriv = NULL; |
|
|
beginSessionParams.range = config.rangeMode; |
|
|
beginSessionParams.replayMode = config.replayMode; |
|
|
CUPTI_API_CALL(cuptiProfilerBeginSession(&beginSessionParams)); |
|
|
|
|
|
CUpti_Profiler_SetConfig_Params setConfigParams = { CUpti_Profiler_SetConfig_Params_STRUCT_SIZE }; |
|
|
setConfigParams.pConfig = configImage.data(); |
|
|
setConfigParams.configSize = configImage.size(); |
|
|
setConfigParams.passIndex = 0; |
|
|
setConfigParams.minNestingLevel = config.minNestingLevels; |
|
|
setConfigParams.numNestingLevels = config.numNestingLevels; |
|
|
setConfigParams.targetNestingLevel = config.minNestingLevels; |
|
|
CUPTI_API_CALL(cuptiProfilerSetConfig(&setConfigParams)); |
|
|
} |
|
|
|
|
|
|
|
|
void beginPass(profilingConfig const &config) |
|
|
{ |
|
|
CUpti_Profiler_BeginPass_Params beginPassParams = { CUpti_Profiler_BeginPass_Params_STRUCT_SIZE }; |
|
|
beginPassParams.ctx = config.context; |
|
|
CUPTI_API_CALL(cuptiProfilerBeginPass(&beginPassParams)); |
|
|
} |
|
|
|
|
|
void enableProfiling(profilingConfig const &config) |
|
|
{ |
|
|
CUpti_Profiler_EnableProfiling_Params enableProfilingParams = { CUpti_Profiler_EnableProfiling_Params_STRUCT_SIZE }; |
|
|
enableProfilingParams.ctx = config.context; |
|
|
CUPTI_API_CALL(cuptiProfilerEnableProfiling(&enableProfilingParams)); |
|
|
} |
|
|
|
|
|
void pushRange(profilingConfig const &config, char const * rangeName) |
|
|
{ |
|
|
CUpti_Profiler_PushRange_Params pushRangeParams = { CUpti_Profiler_PushRange_Params_STRUCT_SIZE }; |
|
|
pushRangeParams.ctx = config.context; |
|
|
pushRangeParams.pRangeName = rangeName; |
|
|
pushRangeParams.rangeNameLength = strlen(rangeName); |
|
|
CUPTI_API_CALL(cuptiProfilerPushRange(&pushRangeParams)); |
|
|
} |
|
|
|
|
|
void popRange(profilingConfig const &config) |
|
|
{ |
|
|
CUpti_Profiler_PopRange_Params popRangeParams = { CUpti_Profiler_PopRange_Params_STRUCT_SIZE }; |
|
|
popRangeParams.ctx = config.context; |
|
|
CUPTI_API_CALL(cuptiProfilerPopRange(&popRangeParams)); |
|
|
} |
|
|
|
|
|
void disableProfiling(profilingConfig &config) |
|
|
{ |
|
|
CUpti_Profiler_DisableProfiling_Params disableProfilingParams = { CUpti_Profiler_DisableProfiling_Params_STRUCT_SIZE }; |
|
|
disableProfilingParams.ctx = config.context; |
|
|
CUPTI_API_CALL(cuptiProfilerDisableProfiling(&disableProfilingParams)); |
|
|
} |
|
|
|
|
|
bool endPass(profilingConfig & config) |
|
|
{ |
|
|
CUpti_Profiler_EndPass_Params endPassParams = { CUpti_Profiler_EndPass_Params_STRUCT_SIZE }; |
|
|
endPassParams.ctx = config.context; |
|
|
CUPTI_API_CALL(cuptiProfilerEndPass(&endPassParams)); |
|
|
return endPassParams.allPassesSubmitted; |
|
|
} |
|
|
|
|
|
void endSession(profilingConfig &config) |
|
|
{ |
|
|
CUpti_Profiler_UnsetConfig_Params unsetConfigParams = { CUpti_Profiler_UnsetConfig_Params_STRUCT_SIZE }; |
|
|
unsetConfigParams.ctx = config.context; |
|
|
CUPTI_API_CALL(cuptiProfilerUnsetConfig(&unsetConfigParams)); |
|
|
|
|
|
CUpti_Profiler_EndSession_Params endSessionParams = { CUpti_Profiler_EndSession_Params_STRUCT_SIZE }; |
|
|
endSessionParams.ctx = config.context; |
|
|
CUPTI_API_CALL(cuptiProfilerEndSession(&endSessionParams)); |
|
|
} |
|
|
|
|
|
void flushData(profilingConfig &config) |
|
|
{ |
|
|
CUpti_Profiler_FlushCounterData_Params flushCounterDataParams = { CUpti_Profiler_FlushCounterData_Params_STRUCT_SIZE }; |
|
|
flushCounterDataParams.ctx = config.context; |
|
|
CUPTI_API_CALL(cuptiProfilerFlushCounterData(&flushCounterDataParams)); |
|
|
if (flushCounterDataParams.numRangesDropped != 0 || flushCounterDataParams.numTraceBytesDropped) |
|
|
{ |
|
|
cerr << "WARNING: " << flushCounterDataParams.numTraceBytesDropped << " trace bytes dropped due to full TraceBuffer" << endl; |
|
|
cerr << "WARNING: " << flushCounterDataParams.numRangesDropped << " ranges dropped in pass" << endl; |
|
|
} |
|
|
} |
|
|
|
|
|
#define DAXPY_REPEAT 32768 |
|
|
|
|
|
|
|
|
__global__ void daxpyKernel(int elements, double a, double * x, double * y) |
|
|
{ |
|
|
for (int i = threadIdx.x; i < elements; i += blockDim.x) |
|
|
// Artificially increase kernel runtime to emphasize concurrency |
|
|
for (int j = 0; j < DAXPY_REPEAT; j++) |
|
|
y[i] = a * x[i] + y[i]; |
|
|
} |
|
|
|
|
|
|
|
|
double a = 2.5; |
|
|
|
|
|
|
|
|
|
|
|
int threadsPerBlock = 32; |
|
|
int threadBlocks = 1; |
|
|
|
|
|
|
|
|
int const numKernels = 4; |
|
|
int const numStreams = numKernels; |
|
|
vector<size_t> elements(numKernels); |
|
|
|
|
|
|
|
|
|
|
|
int const blockSize = 4 * 1024; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void profileKernels(perDeviceData &d, char const * const rangeName, bool serial) |
|
|
{ |
|
|
|
|
|
RUNTIME_API_CALL(cudaSetDevice(d.deviceID)); |
|
|
DRIVER_API_CALL(cuCtxSetCurrent(d.config.context)); |
|
|
|
|
|
|
|
|
startSession(d.config, d.counterDataImage, d.counterDataScratchBufferImage, d.configImage); |
|
|
int numPasses = 0; |
|
|
bool lastPass = false; |
|
|
|
|
|
|
|
|
do |
|
|
{ |
|
|
beginPass(d.config); |
|
|
numPasses++; |
|
|
enableProfiling(d.config); |
|
|
|
|
|
|
|
|
|
|
|
pushRange(d.config, rangeName); |
|
|
for (unsigned int stream = 0; stream < d.streams.size(); stream++) |
|
|
{ |
|
|
cudaStream_t streamId = (serial ? 0 : d.streams[stream]); |
|
|
daxpyKernel <<<threadBlocks, threadsPerBlock, 0, streamId>>> (elements[stream], a, d.d_x[stream], d.d_y[stream]); |
|
|
} |
|
|
|
|
|
|
|
|
if (serial == false) |
|
|
{ |
|
|
for (unsigned int stream = 0; stream < d.streams.size(); stream++) |
|
|
{ |
|
|
RUNTIME_API_CALL(cudaStreamSynchronize(d.streams[stream])); |
|
|
} |
|
|
} |
|
|
else |
|
|
{ |
|
|
RUNTIME_API_CALL(cudaStreamSynchronize(0)); |
|
|
} |
|
|
popRange(d.config); |
|
|
|
|
|
disableProfiling(d.config); |
|
|
lastPass = endPass(d.config); |
|
|
} while (lastPass == false); |
|
|
|
|
|
|
|
|
flushData(d.config); |
|
|
endSession(d.config); |
|
|
} |
|
|
|
|
|
|
|
|
int main(int argc, char * argv[]) |
|
|
{ |
|
|
|
|
|
vector<string> metricNames; |
|
|
metricNames.push_back("sm__cycles_active.sum"); |
|
|
metricNames.push_back("sm__cycles_elapsed.max"); |
|
|
|
|
|
metricNames.push_back("smsp__sass_thread_inst_executed_op_dfma_pred_on.sum"); |
|
|
|
|
|
int numDevices; |
|
|
RUNTIME_API_CALL(cudaGetDeviceCount(&numDevices)); |
|
|
|
|
|
|
|
|
vector<int> device_ids; |
|
|
|
|
|
|
|
|
explicitInitialization(); |
|
|
|
|
|
|
|
|
for (int i = 0; i < numDevices; i++) |
|
|
{ |
|
|
// Get device compatibility |
|
|
CUpti_Profiler_DeviceSupported_Params params = { CUpti_Profiler_DeviceSupported_Params_STRUCT_SIZE }; |
|
|
params.cuDevice = i; |
|
|
CUPTI_API_CALL(cuptiProfilerDeviceSupported(¶ms)); |
|
|
if (params.isSupported == CUPTI_PROFILER_CONFIGURATION_SUPPORTED) |
|
|
{ |
|
|
|
|
|
device_ids.push_back(i); |
|
|
} |
|
|
else |
|
|
{ |
|
|
cerr << "Unable to profile on device " << i << ":" << endl; |
|
|
|
|
|
if (params.architecture == CUPTI_PROFILER_CONFIGURATION_UNSUPPORTED) |
|
|
{ |
|
|
cerr << "\tDevice architecture is not supported" << endl; |
|
|
} |
|
|
|
|
|
if (params.sli == CUPTI_PROFILER_CONFIGURATION_UNSUPPORTED) |
|
|
{ |
|
|
cerr << "\tDevice SLI configuration is not supported" << endl; |
|
|
} |
|
|
|
|
|
if (params.vGpu == CUPTI_PROFILER_CONFIGURATION_UNSUPPORTED) |
|
|
{ |
|
|
cerr << "\tDevice VGPU configuration is not supported" << endl; |
|
|
} |
|
|
else if (params.vGpu == CUPTI_PROFILER_CONFIGURATION_DISABLED) |
|
|
{ |
|
|
cerr << "\tDevice VGPU configuration disabled profiling support" << endl; |
|
|
} |
|
|
|
|
|
if (params.confidentialCompute == CUPTI_PROFILER_CONFIGURATION_UNSUPPORTED) |
|
|
{ |
|
|
cerr << "\tDevice Confidential Compute configuration is not supported" << endl; |
|
|
} |
|
|
|
|
|
if (params.cmp == CUPTI_PROFILER_CONFIGURATION_UNSUPPORTED) |
|
|
{ |
|
|
cerr << "\tNVIDIA Crypto Mining Processors (CMP) are not supported" << endl; |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
numDevices = device_ids.size(); |
|
|
cout << "Found " << numDevices << " compatible devices" << endl; |
|
|
|
|
|
|
|
|
if (numDevices == 0) |
|
|
{ |
|
|
cerr << "No devices detected compatible with CUPTI Profiling" << endl; |
|
|
exit(EXIT_WAIVED); |
|
|
} |
|
|
|
|
|
|
|
|
vector<double> h_x(blockSize * numKernels); |
|
|
vector<double> h_y(blockSize * numKernels); |
|
|
for (size_t i = 0; i < blockSize * numKernels; i++) |
|
|
{ |
|
|
h_x[i] = 1.5 * i; |
|
|
h_y[i] = 2.0 * (i - 3000); |
|
|
} |
|
|
|
|
|
|
|
|
vector<cudaStream_t> defaultStreams(numStreams); |
|
|
for (int stream = 0; stream < numStreams; stream++) |
|
|
{ |
|
|
defaultStreams[stream] = 0; |
|
|
} |
|
|
|
|
|
|
|
|
for (int stream = 0; stream < numStreams; stream++) |
|
|
{ |
|
|
elements[stream] = blockSize * (stream + 1); |
|
|
} |
|
|
|
|
|
|
|
|
vector<perDeviceData> deviceData(numDevices); |
|
|
|
|
|
for (int device = 0; device < numDevices; device++) |
|
|
{ |
|
|
int device_id = device_ids[device]; |
|
|
RUNTIME_API_CALL(cudaSetDevice(device_id)); |
|
|
cout << "Configuring device " << device_id << endl; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
profilingConfig config; |
|
|
config.device = device_id; |
|
|
config.maxLaunchesPerPass = 1; |
|
|
|
|
|
|
|
|
if (device == 0) |
|
|
{ |
|
|
config.maxNumRanges = 3; |
|
|
} |
|
|
else |
|
|
{ |
|
|
config.maxNumRanges = 1; |
|
|
} |
|
|
|
|
|
config.maxRangeNameLength = 64; |
|
|
config.maxRangesPerPass = 1; |
|
|
config.minNestingLevels = 1; |
|
|
config.numNestingLevels = 1; |
|
|
config.rangeMode = CUPTI_UserRange; |
|
|
config.replayMode = CUPTI_UserReplay; |
|
|
DRIVER_API_CALL(cuCtxCreate(&(config.context), 0, device)); |
|
|
deviceData[device].config = config; |
|
|
|
|
|
|
|
|
targetInitProfiling(deviceData[device], metricNames); |
|
|
|
|
|
|
|
|
deviceData[device].streams.resize(numStreams); |
|
|
deviceData[device].d_x.resize(numStreams); |
|
|
deviceData[device].d_y.resize(numStreams); |
|
|
for (int stream = 0; stream < numStreams; stream++) |
|
|
{ |
|
|
RUNTIME_API_CALL(cudaStreamCreate(&(deviceData[device].streams[stream]))); |
|
|
|
|
|
|
|
|
size_t size = elements[stream] * sizeof(double); |
|
|
|
|
|
RUNTIME_API_CALL(cudaMalloc(&(deviceData[device].d_x[stream]), size)); |
|
|
MEMORY_ALLOCATION_CALL(deviceData[device].d_x[stream]); |
|
|
RUNTIME_API_CALL(cudaMemcpy(deviceData[device].d_x[stream], h_x.data(), size, cudaMemcpyHostToDevice)); |
|
|
|
|
|
RUNTIME_API_CALL(cudaMalloc(&(deviceData[device].d_y[stream]), size)); |
|
|
MEMORY_ALLOCATION_CALL(deviceData[device].d_y[stream]); |
|
|
RUNTIME_API_CALL(cudaMemcpy(deviceData[device].d_y[stream], h_x.data(), size, cudaMemcpyHostToDevice)); |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
auto begin_time = ::std::chrono::high_resolution_clock::now(); |
|
|
|
|
|
|
|
|
profileKernels(deviceData[0], "single_device_serial", true); |
|
|
|
|
|
auto end_time = ::std::chrono::high_resolution_clock::now(); |
|
|
auto elapsed_serial_ms = ::std::chrono::duration_cast<::std::chrono::milliseconds>(end_time - begin_time); |
|
|
int numBlocks = 0; |
|
|
for (int i = 1; i <= numKernels; i++) |
|
|
{ |
|
|
numBlocks += i; |
|
|
} |
|
|
cout << "It took " << elapsed_serial_ms.count() << "ms on the host to profile " << numKernels << " kernels in serial." << endl; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
begin_time = ::std::chrono::high_resolution_clock::now(); |
|
|
|
|
|
|
|
|
profileKernels(deviceData[0], "single_device_async", false); |
|
|
|
|
|
end_time = ::std::chrono::high_resolution_clock::now(); |
|
|
auto elapsed_single_device_ms = ::std::chrono::duration_cast<::std::chrono::milliseconds>(end_time - begin_time); |
|
|
cout << "It took " << elapsed_single_device_ms.count() << "ms on the host to profile " << numKernels << " kernels on a single device on separate streams." << endl; |
|
|
cout << "--> If the separate stream wallclock time is less than the serial version, the streams were profiling concurrently." << endl; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (numDevices == 1) |
|
|
{ |
|
|
cout << "Only one compatible device found; skipping the multi-threaded test." << endl; |
|
|
} |
|
|
else |
|
|
{ |
|
|
cout << "Running on " << numDevices << " devices, one thread per device." << endl; |
|
|
|
|
|
|
|
|
vector<::std::thread> threads; |
|
|
begin_time = ::std::chrono::high_resolution_clock::now(); |
|
|
|
|
|
|
|
|
for (int thread = 0; thread < numDevices; thread++) |
|
|
{ |
|
|
threads.push_back(::std::thread(profileKernels, ::std::ref(deviceData[thread]), "multi_device_async", false)); |
|
|
} |
|
|
|
|
|
|
|
|
for (auto &t: threads) |
|
|
{ |
|
|
t.join(); |
|
|
} |
|
|
|
|
|
|
|
|
end_time = ::std::chrono::high_resolution_clock::now(); |
|
|
auto elapsed_multiple_device_ms = ::std::chrono::duration_cast<::std::chrono::milliseconds>(end_time - begin_time); |
|
|
cout << "It took " << elapsed_multiple_device_ms.count() << "ms on the host to profile the same " << numKernels << " kernels on each of the " << numDevices << " devices in parallel" << endl; |
|
|
cout << "--> Wallclock ratio of parallel device launch to single device launch is " << elapsed_multiple_device_ms.count() / static_cast<double>(elapsed_single_device_ms.count()) << endl; |
|
|
cout << "--> If the ratio is close to 1, that means there was little overhead to profile in parallel on multiple devices compared to profiling on a single device." << endl; |
|
|
cout << "--> If the devices have different performance, the ratio may not be close to one, and this should be limited by the slowest device." << endl; |
|
|
} |
|
|
|
|
|
|
|
|
for (int i = 0; i < numDevices; i++) |
|
|
{ |
|
|
for (int j = 0; j < numKernels; j++) |
|
|
{ |
|
|
RUNTIME_API_CALL(cudaFree(deviceData[i].d_x[j])); |
|
|
RUNTIME_API_CALL(cudaFree(deviceData[i].d_y[j])); |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
cout << endl << "Metrics for device #0:" << endl; |
|
|
cout << "Look at the sm__cycles_elapsed.max values for each test." << endl; |
|
|
cout << "This value represents the time spent on device to run the kernels in each case, and should be longest for the serial range, and roughly equal for the single and multi device concurrent ranges." << endl; |
|
|
PrintMetricValues(deviceData[0].config.chipName, deviceData[0].counterDataImage, metricNames); |
|
|
|
|
|
|
|
|
if (numDevices > 1) |
|
|
{ |
|
|
cout << endl << "Metrics for the remaining devices only display the multi device async case and should all be similar to the first device's values if the device has similar performance characteristics." << endl; |
|
|
cout << "If devices have different performance characteristics, the runtime cycles calculation may vary by device." << endl; |
|
|
} |
|
|
for (int i = 1; i < numDevices; i++) |
|
|
{ |
|
|
cout << endl << "Metrics for device #" << i << ":" << endl; |
|
|
PrintMetricValues(deviceData[i].config.chipName, deviceData[i].counterDataImage, metricNames); |
|
|
} |
|
|
|
|
|
exit(EXIT_SUCCESS); |
|
|
} |
|
|
|