|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>

#include <atomic>
#include <chrono>
#include <iostream>
#include <map>
#include <mutex>
#include <queue>
#include <string>
#include <thread>
#include <unordered_set>
#include <vector>
|
|
|
|
|
// Fallback definition of the EXIT_WAIVED exit status (2), provided only when
// no earlier header or build flag has defined it.
// NOTE(review): not referenced anywhere else in the visible source.
#ifndef EXIT_WAIVED |
|
|
#define EXIT_WAIVED 2 |
|
|
#endif |
|
|
|
|
|
#ifdef _WIN32 |
|
|
#include <windows.h> |
|
|
#include "detours.h" |
|
|
#else |
|
|
#include <unistd.h> |
|
|
#include <pthread.h> |
|
|
#endif |
|
|
|
|
|
#include <cupti_pcsampling_util.h> |
|
|
#include <cupti_pcsampling.h> |
|
|
#include "cupti.h" |
|
|
#include "cuda.h" |
|
|
|
|
|
using namespace CUPTI::PcSamplingUtil; |
|
|
|
|
|
// Evaluates the CUPTI API expression `call` exactly once. If the result is not
// CUPTI_SUCCESS, prints the source location, the stringified call and the
// CUPTI error string to stderr, then terminates the process with EXIT_FAILURE.
//
// Wrapped in do { } while (0) so that `CUPTI_CALL(x);` is a single statement
// and stays correct inside an unbraced if/else.
#define CUPTI_CALL(call)                                                       \
do {                                                                           \
    CUptiResult _status = (call);                                              \
    if (_status != CUPTI_SUCCESS)                                              \
    {                                                                          \
        /* Stays NULL if cuptiGetResultString() itself fails. */               \
        const char* errstr = NULL;                                             \
        cuptiGetResultString(_status, &errstr);                                \
        fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n",   \
                __FILE__,                                                      \
                __LINE__,                                                      \
                #call,                                                         \
                errstr ? errstr : "<unknown>");                                \
        exit(EXIT_FAILURE);                                                    \
    }                                                                          \
} while (0)
|
|
|
|
|
// Checks the result of an allocation: if `var` compares equal to NULL, prints
// the source location to stderr and terminates the process with EXIT_FAILURE.
// The argument is parenthesized so that any pointer-valued expression
// (e.g. a conditional expression) can be passed safely.
#define MEMORY_ALLOCATION_CALL(var)                                            \
do {                                                                           \
    if ((var) == NULL) {                                                       \
        fprintf(stderr, "%s:%d: Error: Memory Allocation Failed \n",           \
                __FILE__, __LINE__);                                           \
        exit(EXIT_FAILURE);                                                    \
    }                                                                          \
} while (0)
|
|
|
|
|
// Pause, in milliseconds, that StorePcSampDataInFileThread() sleeps between
// two polls of the pending-buffer queue.
#define THREAD_SLEEP_TIME 100 |
|
|
|
|
|
typedef struct contextInfo |
|
|
{ |
|
|
uint32_t contextUid; |
|
|
CUpti_PCSamplingData pcSamplingData; |
|
|
std::vector<CUpti_PCSamplingConfigurationInfo> pcSamplingConfigurationInfo; |
|
|
PcSamplingStallReasons pcSamplingStallReasons; |
|
|
} ContextInfo; |
|
|
|
|
|
|
|
|
|
|
|
size_t stallReasonsCount = 0; |
|
|
|
|
|
bool g_collectedStallReasonsCount = false; |
|
|
std::mutex g_stallReasonsCountMutex; |
|
|
|
|
|
|
|
|
std::vector<CUpti_PCSamplingData> g_circularBuffer; |
|
|
std::unordered_set<char*> functions; |
|
|
int g_put = 0; |
|
|
int g_get = 0; |
|
|
std::vector<bool> g_bufferEmptyTrackerArray; |
|
|
std::mutex g_circularBufferMutex; |
|
|
bool g_buffersGetUtilisedFasterThanStore = false; |
|
|
bool g_allocatedCircularBuffers = false; |
|
|
|
|
|
|
|
|
std::map<CUcontext, ContextInfo*> g_contextInfoMap; |
|
|
std::mutex g_contextInfoMutex; |
|
|
std::vector<ContextInfo*> g_contextInfoToFreeInEndVector; |
|
|
|
|
|
|
|
|
std::string g_fileName = "pcsampling.dat"; |
|
|
std::thread g_storeDataInFileThreadHandle; |
|
|
std::queue<std::pair<CUpti_PCSamplingData*, ContextInfo*>> g_pcSampDataQueue; |
|
|
bool g_waitAtJoin = false; |
|
|
std::mutex g_pcSampDataQueueMutex; |
|
|
bool g_createdWorkerThread = false; |
|
|
std::mutex g_workerThreadMutex; |
|
|
|
|
|
|
|
|
bool g_initializedInjection = false; |
|
|
std::mutex g_initializeInjectionMutex; |
|
|
|
|
|
|
|
|
CUpti_PCSamplingCollectionMode g_pcSamplingCollectionMode = CUPTI_PC_SAMPLING_COLLECTION_MODE_CONTINUOUS; |
|
|
uint32_t g_samplingPeriod = 0; |
|
|
size_t g_scratchBufSize = 0; |
|
|
size_t g_hwBufSize = 0; |
|
|
size_t g_pcConfigBufRecordCount = 5000; |
|
|
size_t g_circularbufCount = 10; |
|
|
size_t g_circularbufSize = 500; |
|
|
bool g_disableFileDump = false; |
|
|
bool g_verbose = false; |
|
|
|
|
|
bool g_running = false; |
|
|
|
|
|
static void ReadInputParams() |
|
|
{ |
|
|
char* injectionParam = getenv("INJECTION_PARAM"); |
|
|
|
|
|
if (injectionParam == NULL) |
|
|
{ |
|
|
g_circularBuffer.resize(g_circularbufCount); |
|
|
g_bufferEmptyTrackerArray.resize(g_circularbufCount, false); |
|
|
return; |
|
|
} |
|
|
|
|
|
char *token = strtok(injectionParam, " "); |
|
|
|
|
|
while (token != NULL) |
|
|
{ |
|
|
if(!strcmp(token, "--collection-mode")) |
|
|
{ |
|
|
token = strtok(NULL," "); |
|
|
g_pcSamplingCollectionMode = (CUpti_PCSamplingCollectionMode)atoi(token); |
|
|
} |
|
|
else if(!strcmp(token, "--sampling-period")) |
|
|
{ |
|
|
token = strtok(NULL," "); |
|
|
g_samplingPeriod = (uint32_t)atoi(token); |
|
|
} |
|
|
else if(!strcmp(token, "--scratch-buf-size")) |
|
|
{ |
|
|
token = strtok(NULL," "); |
|
|
g_scratchBufSize = (size_t)atoi(token); |
|
|
} |
|
|
else if(!strcmp(token, "--hw-buf-size")) |
|
|
{ |
|
|
token = strtok(NULL," "); |
|
|
g_hwBufSize = (size_t)atoi(token); |
|
|
} |
|
|
else if(!strcmp(token, "--pc-config-buf-record-count")) |
|
|
{ |
|
|
token = strtok(NULL," "); |
|
|
g_pcConfigBufRecordCount = (size_t)atoi(token); |
|
|
} |
|
|
else if(!strcmp(token, "--pc-circular-buf-record-count")) |
|
|
{ |
|
|
token = strtok(NULL," "); |
|
|
g_circularbufSize = (size_t)atoi(token); |
|
|
} |
|
|
else if(!strcmp(token, "--circular-buf-count")) |
|
|
{ |
|
|
token = strtok(NULL," "); |
|
|
g_circularbufCount = (size_t)atoi(token); |
|
|
} |
|
|
else if(!strcmp(token, "--file-name")) |
|
|
{ |
|
|
token = strtok(NULL," "); |
|
|
std::string file(token); |
|
|
g_fileName = file; |
|
|
} |
|
|
else if(!strcmp(token, "--disable-file-dump")) |
|
|
{ |
|
|
g_disableFileDump = true; |
|
|
} |
|
|
else if(!strcmp(token, "--verbose")) |
|
|
{ |
|
|
g_verbose = true; |
|
|
} |
|
|
token = strtok(NULL," "); |
|
|
} |
|
|
g_circularBuffer.resize(g_circularbufCount); |
|
|
g_bufferEmptyTrackerArray.resize(g_circularbufCount, false); |
|
|
} |
|
|
|
|
|
// Reserves the next circular buffer slot, asks CUPTI to fill it with PC
// sampling data for the context named in `pcSamplingGetDataParams`, and, unless
// file dumping is disabled, queues the filled buffer for the file-writer
// thread.
//
// pcSamplingGetDataParams  Caller-prepared parameters (size and ctx set); its
//                          pcSamplingData field is overwritten here.
// contextInfo              Bookkeeping record queued together with the buffer.
//
// When file dumping is disabled the slot is neither marked busy nor advanced,
// so the same slot is reused on every call.
static void GetPcSamplingDataFromCupti(CUpti_PCSamplingGetDataParams &pcSamplingGetDataParams, ContextInfo *contextInfo)
{
    CUpti_PCSamplingData *pPcSamplingData = NULL;

    {
        std::unique_lock<std::mutex> bufferLock(g_circularBufferMutex);

        // Wait until the file writer has released the slot. The mutex is
        // dropped while waiting so that this thread does not spin with the
        // lock held and starve every other user of g_circularBufferMutex.
        while (g_bufferEmptyTrackerArray[g_put])
        {
            g_buffersGetUtilisedFasterThanStore = true;
            bufferLock.unlock();
            std::this_thread::yield();
            bufferLock.lock();
        }

        pPcSamplingData = &g_circularBuffer[g_put];
        pcSamplingGetDataParams.pcSamplingData = (void *)pPcSamplingData;

        if (!g_disableFileDump)
        {
            g_bufferEmptyTrackerArray[g_put] = true;
            g_put = (g_put + 1) % g_circularbufCount;
        }
    }

    CUPTI_CALL(cuptiPCSamplingGetData(&pcSamplingGetDataParams));

    if (!g_disableFileDump)
    {
        std::lock_guard<std::mutex> queueLock(g_pcSampDataQueueMutex);
        g_pcSampDataQueue.push(std::make_pair(pPcSamplingData, contextInfo));
    }
}
|
|
|
|
|
static void StorePcSampDataInFile() |
|
|
{ |
|
|
CUptiUtilResult utilResult; |
|
|
ContextInfo *contextInfo; |
|
|
CUpti_PCSamplingData *pcSamplingData; |
|
|
|
|
|
g_pcSampDataQueueMutex.lock(); |
|
|
pcSamplingData = g_pcSampDataQueue.front().first; |
|
|
contextInfo = g_pcSampDataQueue.front().second; |
|
|
g_pcSampDataQueue.pop(); |
|
|
g_pcSampDataQueueMutex.unlock(); |
|
|
|
|
|
std::string file = std::to_string((long int)contextInfo->contextUid) + "_" + g_fileName; |
|
|
|
|
|
CUptiUtil_PutPcSampDataParams pPutPcSampDataParams = {}; |
|
|
pPutPcSampDataParams.size = CUptiUtil_PutPcSampDataParamsSize; |
|
|
pPutPcSampDataParams.bufferType = PC_SAMPLING_BUFFER_PC_TO_COUNTER_DATA; |
|
|
pPutPcSampDataParams.pSamplingData = (void*)pcSamplingData; |
|
|
pPutPcSampDataParams.numAttributes = contextInfo->pcSamplingConfigurationInfo.size(); |
|
|
pPutPcSampDataParams.pPCSamplingConfigurationInfo = contextInfo->pcSamplingConfigurationInfo.data(); |
|
|
pPutPcSampDataParams.pPcSamplingStallReasons = &contextInfo->pcSamplingStallReasons; |
|
|
pPutPcSampDataParams.fileName = file.c_str(); |
|
|
|
|
|
utilResult = CuptiUtilPutPcSampData(&pPutPcSampDataParams); |
|
|
if (utilResult != CUPTI_UTIL_SUCCESS) |
|
|
{ |
|
|
std::cout << "error in StorePcSampDataInFile(), failed with error : " << utilResult << std::endl; |
|
|
exit (EXIT_FAILURE); |
|
|
} |
|
|
for (size_t i = 0; i < pcSamplingData->totalNumPcs; i++) |
|
|
{ |
|
|
functions.insert(pcSamplingData->pPcData[i].functionName); |
|
|
} |
|
|
g_bufferEmptyTrackerArray[g_get] = false; |
|
|
g_get = (g_get + 1) % g_circularbufCount; |
|
|
} |
|
|
|
|
|
static void StorePcSampDataInFileThread() |
|
|
{ |
|
|
while(1) |
|
|
{ |
|
|
if (g_waitAtJoin) |
|
|
{ |
|
|
while (!g_pcSampDataQueue.empty()) |
|
|
{ |
|
|
StorePcSampDataInFile(); |
|
|
} |
|
|
break; |
|
|
} |
|
|
else |
|
|
{ |
|
|
while(!g_pcSampDataQueue.empty()) |
|
|
{ |
|
|
StorePcSampDataInFile(); |
|
|
} |
|
|
std::this_thread::sleep_for(std::chrono::milliseconds(THREAD_SLEEP_TIME)); |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
static void PreallocateBuffersForRecords() |
|
|
{ |
|
|
for (size_t buffers=0; buffers<g_circularbufCount; buffers++) |
|
|
{ |
|
|
g_circularBuffer[buffers].size = sizeof(CUpti_PCSamplingData); |
|
|
g_circularBuffer[buffers].collectNumPcs = g_circularbufSize; |
|
|
g_circularBuffer[buffers].pPcData = (CUpti_PCSamplingPCData *)malloc(g_circularBuffer[buffers].collectNumPcs * sizeof(CUpti_PCSamplingPCData)); |
|
|
MEMORY_ALLOCATION_CALL(g_circularBuffer[buffers].pPcData); |
|
|
for (size_t i = 0; i < g_circularBuffer[buffers].collectNumPcs; i++) |
|
|
{ |
|
|
g_circularBuffer[buffers].pPcData[i].stallReason = (CUpti_PCSamplingStallReason *)malloc(stallReasonsCount * sizeof(CUpti_PCSamplingStallReason)); |
|
|
MEMORY_ALLOCATION_CALL(g_circularBuffer[buffers].pPcData[i].stallReason); |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
static void FreePreallocatedMemory() |
|
|
{ |
|
|
for (size_t buffers=0; buffers<g_circularbufCount; buffers++) |
|
|
{ |
|
|
for (size_t i = 0; i < g_circularBuffer[buffers].collectNumPcs; i++) |
|
|
{ |
|
|
free(g_circularBuffer[buffers].pPcData[i].stallReason); |
|
|
} |
|
|
|
|
|
free(g_circularBuffer[buffers].pPcData); |
|
|
} |
|
|
|
|
|
for(auto& itr: g_contextInfoMap) |
|
|
{ |
|
|
|
|
|
for (uint32_t i = 0; i < g_pcConfigBufRecordCount; i++) |
|
|
{ |
|
|
free(itr.second->pcSamplingData.pPcData[i].stallReason); |
|
|
} |
|
|
free(itr.second->pcSamplingData.pPcData); |
|
|
|
|
|
for (size_t i = 0; i < itr.second->pcSamplingStallReasons.numStallReasons; i++) |
|
|
{ |
|
|
free(itr.second->pcSamplingStallReasons.stallReasons[i]); |
|
|
} |
|
|
free(itr.second->pcSamplingStallReasons.stallReasons); |
|
|
free(itr.second->pcSamplingStallReasons.stallReasonIndex); |
|
|
|
|
|
free(itr.second); |
|
|
} |
|
|
|
|
|
for(auto& itr: g_contextInfoToFreeInEndVector) |
|
|
{ |
|
|
|
|
|
for (uint32_t i = 0; i < g_pcConfigBufRecordCount; i++) |
|
|
{ |
|
|
free(itr->pcSamplingData.pPcData[i].stallReason); |
|
|
} |
|
|
free(itr->pcSamplingData.pPcData); |
|
|
|
|
|
for (size_t i = 0; i < itr->pcSamplingStallReasons.numStallReasons; i++) |
|
|
{ |
|
|
free(itr->pcSamplingStallReasons.stallReasons[i]); |
|
|
} |
|
|
free(itr->pcSamplingStallReasons.stallReasons); |
|
|
free(itr->pcSamplingStallReasons.stallReasonIndex); |
|
|
|
|
|
free(itr); |
|
|
} |
|
|
|
|
|
for(auto it = functions.begin(); it != functions.end(); ++it) |
|
|
{ |
|
|
free(*it); |
|
|
} |
|
|
functions.clear(); |
|
|
} |
|
|
|
|
|
// Configures PC sampling for `cuCtx`, whose record must already be present in
// g_contextInfoMap (the process is terminated otherwise):
//   1. queries the context's stall reasons (count, names, indices);
//   2. allocates the context's configuration buffer of
//      g_pcConfigBufRecordCount records;
//   3. sets collection mode, stall reasons and the sampling data buffer, plus
//      the sampling period / scratch buffer size / hardware buffer size when
//      a non-zero value was requested;
//   4. reads the effective configuration back and stores it, together with
//      the output data format and stall reason attributes, in the record;
//   5. starts the file-writer thread on first use unless file dumping is
//      disabled;
//   6. prints the configuration when g_verbose is set.
// Any failing CUPTI call or allocation terminates the process.
void ConfigureActivity(CUcontext cuCtx)
{
    // Other threads insert into and erase from the map under
    // g_contextInfoMutex, so the lookup has to take it as well. The record
    // itself is only freed at shutdown, so the pointer stays valid afterwards.
    ContextInfo *contextInfo = NULL;
    {
        std::lock_guard<std::mutex> mapLock(g_contextInfoMutex);
        std::map<CUcontext, ContextInfo*>::iterator found = g_contextInfoMap.find(cuCtx);
        if (found != g_contextInfoMap.end())
        {
            contextInfo = found->second;
        }
    }
    if (contextInfo == NULL)
    {
        std::cout << "Error : No ctx found" << std::endl;
        exit (EXIT_FAILURE);
    }

    // ---- 1. Stall reasons ---------------------------------------------------
    size_t numStallReasons = 0;
    CUpti_PCSamplingGetNumStallReasonsParams numStallReasonsParams = {};
    numStallReasonsParams.size = CUpti_PCSamplingGetNumStallReasonsParamsSize;
    numStallReasonsParams.ctx = cuCtx;
    numStallReasonsParams.numStallReasons = &numStallReasons;

    {
        std::lock_guard<std::mutex> countLock(g_stallReasonsCountMutex);
        CUPTI_CALL(cuptiPCSamplingGetNumStallReasons(&numStallReasonsParams));

        // Only the first configured context determines the global count that
        // sizes the circular buffers.
        if (!g_collectedStallReasonsCount)
        {
            stallReasonsCount = numStallReasons;
            g_collectedStallReasonsCount = true;
        }
    }

    char **pStallReasons = (char **)malloc(numStallReasons * sizeof(char*));
    MEMORY_ALLOCATION_CALL(pStallReasons);
    for (size_t i = 0; i < numStallReasons; i++)
    {
        pStallReasons[i] = (char *)malloc(CUPTI_STALL_REASON_STRING_SIZE * sizeof(char));
        MEMORY_ALLOCATION_CALL(pStallReasons[i]);
    }
    uint32_t *pStallReasonIndex = (uint32_t *)malloc(numStallReasons * sizeof(uint32_t));
    MEMORY_ALLOCATION_CALL(pStallReasonIndex);

    CUpti_PCSamplingGetStallReasonsParams stallReasonsParams = {};
    stallReasonsParams.size = CUpti_PCSamplingGetStallReasonsParamsSize;
    stallReasonsParams.ctx = cuCtx;
    stallReasonsParams.numStallReasons = numStallReasons;
    stallReasonsParams.stallReasonIndex = pStallReasonIndex;
    stallReasonsParams.stallReasons = pStallReasons;
    CUPTI_CALL(cuptiPCSamplingGetStallReasons(&stallReasonsParams));

    // ---- 2. Configuration buffer --------------------------------------------
    CUpti_PCSamplingData &configBuffer = contextInfo->pcSamplingData;
    configBuffer.size = sizeof(CUpti_PCSamplingData);
    configBuffer.collectNumPcs = g_pcConfigBufRecordCount;
    configBuffer.pPcData = (CUpti_PCSamplingPCData *)malloc(g_pcConfigBufRecordCount * sizeof(CUpti_PCSamplingPCData));
    MEMORY_ALLOCATION_CALL(configBuffer.pPcData);
    for (size_t i = 0; i < g_pcConfigBufRecordCount; i++)
    {
        configBuffer.pPcData[i].stallReason = (CUpti_PCSamplingStallReason *)malloc(numStallReasons * sizeof(CUpti_PCSamplingStallReason));
        MEMORY_ALLOCATION_CALL(configBuffer.pPcData[i].stallReason);
    }

    // ---- 3. Attributes to set -----------------------------------------------
    CUpti_PCSamplingConfigurationInfo sampPeriod = {};
    CUpti_PCSamplingConfigurationInfo stallReason = {};
    CUpti_PCSamplingConfigurationInfo scratchBufferSize = {};
    CUpti_PCSamplingConfigurationInfo hwBufferSize = {};
    CUpti_PCSamplingConfigurationInfo collectionMode = {};
    CUpti_PCSamplingConfigurationInfo enableStartStop = {};
    CUpti_PCSamplingConfigurationInfo outputDataFormat = {};
    CUpti_PCSamplingConfigurationInfo samplingDataBuffer = {};

    std::vector<CUpti_PCSamplingConfigurationInfo> pcSamplingConfigurationInfo;

    stallReason.attributeType = CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_STALL_REASON;
    stallReason.attributeData.stallReasonData.stallReasonCount = numStallReasons;
    stallReason.attributeData.stallReasonData.pStallReasonIndex = pStallReasonIndex;

    samplingDataBuffer.attributeType = CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SAMPLING_DATA_BUFFER;
    samplingDataBuffer.attributeData.samplingDataBufferData.samplingDataBuffer = (void *)&contextInfo->pcSamplingData;

    // The next three attributes are only set when a non-zero value was
    // requested; their type is filled in regardless because they are also
    // used to read the effective values back below.
    sampPeriod.attributeType = CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SAMPLING_PERIOD;
    if (g_samplingPeriod)
    {
        sampPeriod.attributeData.samplingPeriodData.samplingPeriod = g_samplingPeriod;
        pcSamplingConfigurationInfo.push_back(sampPeriod);
    }

    scratchBufferSize.attributeType = CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SCRATCH_BUFFER_SIZE;
    if (g_scratchBufSize)
    {
        scratchBufferSize.attributeData.scratchBufferSizeData.scratchBufferSize = g_scratchBufSize;
        pcSamplingConfigurationInfo.push_back(scratchBufferSize);
    }

    hwBufferSize.attributeType = CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_HARDWARE_BUFFER_SIZE;
    if (g_hwBufSize)
    {
        hwBufferSize.attributeData.hardwareBufferSizeData.hardwareBufferSize = g_hwBufSize;
        pcSamplingConfigurationInfo.push_back(hwBufferSize);
    }

    collectionMode.attributeType = CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_COLLECTION_MODE;
    collectionMode.attributeData.collectionModeData.collectionMode = g_pcSamplingCollectionMode;
    pcSamplingConfigurationInfo.push_back(collectionMode);

    pcSamplingConfigurationInfo.push_back(stallReason);
    pcSamplingConfigurationInfo.push_back(samplingDataBuffer);

    CUpti_PCSamplingConfigurationInfoParams pcSamplingConfigurationInfoParams = {};
    pcSamplingConfigurationInfoParams.size = CUpti_PCSamplingConfigurationInfoParamsSize;
    pcSamplingConfigurationInfoParams.pPriv = NULL;
    pcSamplingConfigurationInfoParams.ctx = cuCtx;
    pcSamplingConfigurationInfoParams.numAttributes = pcSamplingConfigurationInfo.size();
    pcSamplingConfigurationInfoParams.pPCSamplingConfigurationInfo = pcSamplingConfigurationInfo.data();

    CUPTI_CALL(cuptiPCSamplingSetConfigurationAttribute(&pcSamplingConfigurationInfoParams));

    // The context record takes ownership of the stall reason tables.
    contextInfo->pcSamplingStallReasons.numStallReasons = numStallReasons;
    contextInfo->pcSamplingStallReasons.stallReasons = pStallReasons;
    contextInfo->pcSamplingStallReasons.stallReasonIndex = pStallReasonIndex;

    // ---- 4. Read back the effective configuration ---------------------------
    enableStartStop.attributeType = CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_ENABLE_START_STOP_CONTROL;
    outputDataFormat.attributeType = CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_OUTPUT_DATA_FORMAT;
    outputDataFormat.attributeData.outputDataFormatData.outputDataFormat = CUPTI_PC_SAMPLING_OUTPUT_DATA_FORMAT_PARSED;

    // The order matters: the verbose report below indexes this list.
    std::vector<CUpti_PCSamplingConfigurationInfo> pcSamplingRetrieveConfigurationInfo;
    pcSamplingRetrieveConfigurationInfo.push_back(collectionMode);
    pcSamplingRetrieveConfigurationInfo.push_back(sampPeriod);
    pcSamplingRetrieveConfigurationInfo.push_back(scratchBufferSize);
    pcSamplingRetrieveConfigurationInfo.push_back(hwBufferSize);
    pcSamplingRetrieveConfigurationInfo.push_back(enableStartStop);

    CUpti_PCSamplingConfigurationInfoParams getPcSamplingConfigurationInfoParams = {};
    getPcSamplingConfigurationInfoParams.size = CUpti_PCSamplingConfigurationInfoParamsSize;
    getPcSamplingConfigurationInfoParams.pPriv = NULL;
    getPcSamplingConfigurationInfoParams.ctx = cuCtx;
    getPcSamplingConfigurationInfoParams.numAttributes = pcSamplingRetrieveConfigurationInfo.size();
    getPcSamplingConfigurationInfoParams.pPCSamplingConfigurationInfo = pcSamplingRetrieveConfigurationInfo.data();

    CUPTI_CALL(cuptiPCSamplingGetConfigurationAttribute(&getPcSamplingConfigurationInfoParams));

    for (size_t i = 0; i < getPcSamplingConfigurationInfoParams.numAttributes; i++)
    {
        contextInfo->pcSamplingConfigurationInfo.push_back(getPcSamplingConfigurationInfoParams.pPCSamplingConfigurationInfo[i]);
    }
    contextInfo->pcSamplingConfigurationInfo.push_back(outputDataFormat);
    contextInfo->pcSamplingConfigurationInfo.push_back(stallReason);

    // ---- 5. File-writer thread (created once) -------------------------------
    {
        std::lock_guard<std::mutex> workerLock(g_workerThreadMutex);
        if (!g_disableFileDump && !g_createdWorkerThread)
        {
            g_storeDataInFileThreadHandle = std::thread(StorePcSampDataInFileThread);
            g_createdWorkerThread = true;
        }
    }

    // ---- 6. Verbose report --------------------------------------------------
    if (g_verbose)
    {
        const CUpti_PCSamplingConfigurationInfo *retrieved = getPcSamplingConfigurationInfoParams.pPCSamplingConfigurationInfo;

        std::cout << std::endl;
        std::cout << "============ Configuration Details : ============" << std::endl;
        std::cout << "requested stall reason count : " << numStallReasons << std::endl;
        std::cout << "collection mode : " << retrieved[0].attributeData.collectionModeData.collectionMode << std::endl;
        std::cout << "sampling period : " << retrieved[1].attributeData.samplingPeriodData.samplingPeriod << std::endl;
        std::cout << "scratch buffer size (Bytes) : " << retrieved[2].attributeData.scratchBufferSizeData.scratchBufferSize << std::endl;
        std::cout << "hardware buffer size (Bytes) : " << retrieved[3].attributeData.hardwareBufferSizeData.hardwareBufferSize << std::endl;
        std::cout << "start stop control : " << retrieved[4].attributeData.enableStartStopControlData.enableStartStopControl << std::endl;
        std::cout << "configuration buffer size : " << g_pcConfigBufRecordCount << std::endl;
        std::cout << "circular buffer count : " << g_circularbufCount << std::endl;
        std::cout << "circular buffer record count : " << g_circularbufSize << std::endl;
        std::cout << "File name : <context id>_" << g_fileName << std::endl;
        std::cout << "=================================================" << std::endl;
        std::cout << std::endl;
    }
}
|
|
|
|
|
void AtExitHandler() |
|
|
{ |
|
|
|
|
|
CUPTI_CALL(cuptiGetLastError()); |
|
|
if (g_running) |
|
|
{ |
|
|
g_running = false; |
|
|
|
|
|
|
|
|
for(auto& itr: g_contextInfoMap) |
|
|
{ |
|
|
CUpti_PCSamplingGetDataParams pcSamplingGetDataParams = {}; |
|
|
pcSamplingGetDataParams.size = CUpti_PCSamplingGetDataParamsSize; |
|
|
pcSamplingGetDataParams.ctx = itr.first; |
|
|
|
|
|
while (itr.second->pcSamplingData.remainingNumPcs > 0 || itr.second->pcSamplingData.totalNumPcs > 0) |
|
|
{ |
|
|
GetPcSamplingDataFromCupti(pcSamplingGetDataParams, itr.second); |
|
|
} |
|
|
|
|
|
CUpti_PCSamplingDisableParams pcSamplingDisableParams = {}; |
|
|
pcSamplingDisableParams.size = CUpti_PCSamplingDisableParamsSize; |
|
|
pcSamplingDisableParams.ctx = itr.first; |
|
|
CUPTI_CALL(cuptiPCSamplingDisable(&pcSamplingDisableParams)); |
|
|
|
|
|
if (!g_disableFileDump && itr.second->pcSamplingData.totalNumPcs > 0) |
|
|
{ |
|
|
size_t remainingNumPcs = itr.second->pcSamplingData.remainingNumPcs; |
|
|
if (remainingNumPcs) |
|
|
{ |
|
|
std::cout << "WARNING : " << remainingNumPcs |
|
|
<< " records are discarded during cuptiPCSamplingDisable() since these can't be accommodated " |
|
|
<< "in the PC sampling buffer provided during the PC sampling configuration. Bigger buffer can mitigate this issue." << std::endl; |
|
|
} |
|
|
|
|
|
g_pcSampDataQueueMutex.lock(); |
|
|
|
|
|
|
|
|
g_pcSampDataQueue.push(std::make_pair(&itr.second->pcSamplingData, itr.second)); |
|
|
g_pcSampDataQueueMutex.unlock(); |
|
|
} |
|
|
} |
|
|
|
|
|
if (g_buffersGetUtilisedFasterThanStore) |
|
|
{ |
|
|
std::cout << "WARNING : Buffers get used faster than get stored in file. " |
|
|
<< "Suggestion is either increase size of buffer or increase number of buffers" << std::endl; |
|
|
} |
|
|
|
|
|
g_waitAtJoin = true; |
|
|
|
|
|
if (g_storeDataInFileThreadHandle.joinable()) |
|
|
{ |
|
|
g_storeDataInFileThreadHandle.join(); |
|
|
} |
|
|
|
|
|
FreePreallocatedMemory(); |
|
|
} |
|
|
|
|
|
} |
|
|
|
|
|
#ifdef _WIN32
// Signature of ntdll's RtlExitUserProcess, as resolved by
// registerAtExitHandler().
using rtlExitUserProcess_t = void (WINAPI*)(uint32_t exitCode);

// Address used to reach the real RtlExitUserProcess; filled in by
// registerAtExitHandler().
rtlExitUserProcess_t Real_RtlExitUserProcess = NULL;

// Replacement installed over RtlExitUserProcess: runs the injection's exit
// handler first, then forwards to the real routine with the same exit code.
void WINAPI Detour_RtlExitUserProcess(uint32_t exitCode)
{
    AtExitHandler();

    Real_RtlExitUserProcess(exitCode);
}
#endif
|
|
|
|
|
// Arranges for AtExitHandler() to run when the process exits.
//
// On Windows it tries to detour ntdll's RtlExitUserProcess to
// Detour_RtlExitUserProcess; if any step of that fails it falls back to
// atexit(). On other platforms it always uses atexit().
void registerAtExitHandler(void) {
#ifdef _WIN32
    bool detourInstalled = false;

    HMODULE ntDll = GetModuleHandle(TEXT("ntdll.dll"));
    FARPROC proc = ntDll ? GetProcAddress(ntDll, "RtlExitUserProcess") : NULL;

    if (proc) {
        Real_RtlExitUserProcess = (rtlExitUserProcess_t)proc;

        if (DetourTransactionBegin() == ERROR_SUCCESS) {
            bool attached = false;

            if (DetourUpdateThread(GetCurrentThread()) == ERROR_SUCCESS) {
                DetourSetIgnoreTooSmall(TRUE);
                attached = (DetourAttach((void**)&Real_RtlExitUserProcess, (void*)Detour_RtlExitUserProcess) == ERROR_SUCCESS);
            }

            if (attached) {
                detourInstalled = (DetourTransactionCommit() == ERROR_SUCCESS);
            } else {
                // A transaction that was begun must be closed again; without
                // this a failed update/attach would leave it pending.
                DetourTransactionAbort();
            }
        }
    }

    if (!detourInstalled) {
        atexit(&AtExitHandler);
    }
#else
    atexit(&AtExitHandler);
#endif
}
|
|
|
|
|
void CallbackHandler(void* userdata, CUpti_CallbackDomain domain, CUpti_CallbackId cbid, void* cbdata) |
|
|
{ |
|
|
switch (domain) |
|
|
{ |
|
|
case CUPTI_CB_DOMAIN_DRIVER_API: |
|
|
{ |
|
|
const CUpti_CallbackData* cbInfo = (CUpti_CallbackData*)cbdata; |
|
|
|
|
|
switch (cbid) |
|
|
{ |
|
|
case CUPTI_DRIVER_TRACE_CBID_cuLaunch: |
|
|
case CUPTI_DRIVER_TRACE_CBID_cuLaunchGrid: |
|
|
case CUPTI_DRIVER_TRACE_CBID_cuLaunchGridAsync: |
|
|
case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel: |
|
|
case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel_ptsz: |
|
|
case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel: |
|
|
case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel_ptsz: |
|
|
case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice: |
|
|
{ |
|
|
if (cbInfo->callbackSite == CUPTI_API_EXIT) |
|
|
{ |
|
|
std::map<CUcontext, ContextInfo*>::iterator contextStateMapItr = g_contextInfoMap.find(cbInfo->context); |
|
|
if (contextStateMapItr == g_contextInfoMap.end()) |
|
|
{ |
|
|
std::cout << "Error : Context not found in map" << std::endl; |
|
|
exit(EXIT_FAILURE); |
|
|
} |
|
|
if (!contextStateMapItr->second->contextUid) |
|
|
{ |
|
|
contextStateMapItr->second->contextUid = cbInfo->contextUid; |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
if (g_pcSamplingCollectionMode == CUPTI_PC_SAMPLING_COLLECTION_MODE_KERNEL_SERIALIZED) |
|
|
{ |
|
|
|
|
|
CUpti_PCSamplingGetDataParams pcSamplingGetDataParams = {}; |
|
|
pcSamplingGetDataParams.size = CUpti_PCSamplingGetDataParamsSize; |
|
|
pcSamplingGetDataParams.ctx = cbInfo->context; |
|
|
|
|
|
|
|
|
while (contextStateMapItr->second->pcSamplingData.totalNumPcs > 0) |
|
|
{ |
|
|
GetPcSamplingDataFromCupti(pcSamplingGetDataParams, contextStateMapItr->second); |
|
|
} |
|
|
|
|
|
while (contextStateMapItr->second->pcSamplingData.remainingNumPcs > 0) |
|
|
{ |
|
|
GetPcSamplingDataFromCupti(pcSamplingGetDataParams, contextStateMapItr->second); |
|
|
} |
|
|
} |
|
|
else if(contextStateMapItr->second->pcSamplingData.remainingNumPcs >= g_circularbufSize) |
|
|
{ |
|
|
CUpti_PCSamplingGetDataParams pcSamplingGetDataParams = {}; |
|
|
pcSamplingGetDataParams.size = CUpti_PCSamplingGetDataParamsSize; |
|
|
pcSamplingGetDataParams.ctx = cbInfo->context; |
|
|
|
|
|
GetPcSamplingDataFromCupti(pcSamplingGetDataParams, contextStateMapItr->second); |
|
|
} |
|
|
} |
|
|
} |
|
|
break; |
|
|
} |
|
|
} |
|
|
break; |
|
|
case CUPTI_CB_DOMAIN_RESOURCE: |
|
|
{ |
|
|
const CUpti_ResourceData* resourceData = (CUpti_ResourceData*)cbdata; |
|
|
g_running = true; |
|
|
|
|
|
switch(cbid) |
|
|
{ |
|
|
case CUPTI_CBID_RESOURCE_CONTEXT_CREATED: |
|
|
{ |
|
|
{ |
|
|
if (g_verbose) |
|
|
{ |
|
|
std::cout << "Injection - Context created" << std::endl; |
|
|
} |
|
|
|
|
|
|
|
|
ContextInfo *contextInfo = (ContextInfo *)calloc(1, sizeof(ContextInfo)); |
|
|
MEMORY_ALLOCATION_CALL(contextInfo); |
|
|
g_contextInfoMutex.lock(); |
|
|
g_contextInfoMap.insert(std::make_pair(resourceData->context, contextInfo)); |
|
|
g_contextInfoMutex.unlock(); |
|
|
|
|
|
CUpti_PCSamplingEnableParams pcSamplingEnableParams = {}; |
|
|
pcSamplingEnableParams.size = CUpti_PCSamplingEnableParamsSize; |
|
|
pcSamplingEnableParams.ctx = resourceData->context; |
|
|
CUPTI_CALL(cuptiPCSamplingEnable(&pcSamplingEnableParams)); |
|
|
|
|
|
ConfigureActivity(resourceData->context); |
|
|
|
|
|
g_circularBufferMutex.lock(); |
|
|
if (!g_allocatedCircularBuffers) |
|
|
{ |
|
|
PreallocateBuffersForRecords(); |
|
|
g_allocatedCircularBuffers = true; |
|
|
} |
|
|
g_circularBufferMutex.unlock(); |
|
|
} |
|
|
} |
|
|
break; |
|
|
case CUPTI_CBID_RESOURCE_CONTEXT_DESTROY_STARTING: |
|
|
{ |
|
|
if (g_verbose) |
|
|
{ |
|
|
std::cout << "Injection - Context destroy starting" << std::endl; |
|
|
} |
|
|
std::map<CUcontext, ContextInfo*>::iterator itr; |
|
|
g_contextInfoMutex.lock(); |
|
|
itr = g_contextInfoMap.find(resourceData->context); |
|
|
if (itr == g_contextInfoMap.end()) |
|
|
{ |
|
|
std::cout << "Warning : This context not found in map of context which enabled PC sampling." << std::endl; |
|
|
} |
|
|
g_contextInfoMutex.unlock(); |
|
|
|
|
|
CUpti_PCSamplingGetDataParams pcSamplingGetDataParams = {}; |
|
|
pcSamplingGetDataParams.size = CUpti_PCSamplingGetDataParamsSize; |
|
|
pcSamplingGetDataParams.ctx = itr->first; |
|
|
|
|
|
while (itr->second->pcSamplingData.remainingNumPcs > 0 || itr->second->pcSamplingData.totalNumPcs > 0) |
|
|
{ |
|
|
GetPcSamplingDataFromCupti(pcSamplingGetDataParams, itr->second); |
|
|
} |
|
|
|
|
|
CUpti_PCSamplingDisableParams pcSamplingDisableParams = {}; |
|
|
pcSamplingDisableParams.size = CUpti_PCSamplingDisableParamsSize; |
|
|
pcSamplingDisableParams.ctx = resourceData->context; |
|
|
CUPTI_CALL(cuptiPCSamplingDisable(&pcSamplingDisableParams)); |
|
|
|
|
|
|
|
|
|
|
|
if (!g_disableFileDump && itr->second->pcSamplingData.totalNumPcs > 0) |
|
|
{ |
|
|
g_pcSampDataQueueMutex.lock(); |
|
|
g_pcSampDataQueue.push(std::make_pair(&itr->second->pcSamplingData, itr->second)); |
|
|
g_pcSampDataQueueMutex.unlock(); |
|
|
} |
|
|
|
|
|
g_contextInfoMutex.lock(); |
|
|
g_contextInfoToFreeInEndVector.push_back(itr->second); |
|
|
g_contextInfoMap.erase(itr); |
|
|
g_contextInfoMutex.unlock(); |
|
|
} |
|
|
break; |
|
|
case CUPTI_CBID_RESOURCE_MODULE_LOADED: |
|
|
{ |
|
|
g_contextInfoMutex.lock(); |
|
|
std::map<CUcontext, ContextInfo*>::iterator contextStateMapItr = g_contextInfoMap.find(resourceData->context); |
|
|
if (contextStateMapItr == g_contextInfoMap.end()) |
|
|
{ |
|
|
std::cout << "Error : Context not found in map" << std::endl; |
|
|
exit(EXIT_FAILURE); |
|
|
} |
|
|
g_contextInfoMutex.unlock(); |
|
|
|
|
|
|
|
|
|
|
|
CUpti_PCSamplingGetDataParams pcSamplingGetDataParams = {}; |
|
|
pcSamplingGetDataParams.size = CUpti_PCSamplingGetDataParamsSize; |
|
|
pcSamplingGetDataParams.ctx = resourceData->context; |
|
|
|
|
|
|
|
|
while (contextStateMapItr->second->pcSamplingData.totalNumPcs > 0) |
|
|
{ |
|
|
GetPcSamplingDataFromCupti(pcSamplingGetDataParams, contextStateMapItr->second); |
|
|
} |
|
|
|
|
|
while (contextStateMapItr->second->pcSamplingData.remainingNumPcs > 0) |
|
|
{ |
|
|
GetPcSamplingDataFromCupti(pcSamplingGetDataParams, contextStateMapItr->second); |
|
|
} |
|
|
} |
|
|
break; |
|
|
} |
|
|
} |
|
|
break; |
|
|
default : |
|
|
break; |
|
|
} |
|
|
} |
|
|
|
|
|
#ifdef _WIN32 |
|
|
extern "C" __declspec(dllexport) int InitializeInjection(void) |
|
|
#else |
|
|
extern "C" int InitializeInjection(void) |
|
|
#endif |
|
|
{ |
|
|
g_initializeInjectionMutex.lock(); |
|
|
if (!g_initializedInjection) |
|
|
{ |
|
|
std::cout << "... Initialize injection ..." << std::endl; |
|
|
|
|
|
ReadInputParams(); |
|
|
|
|
|
CUpti_SubscriberHandle subscriber; |
|
|
CUPTI_CALL(cuptiSubscribe(&subscriber, (CUpti_CallbackFunc)&CallbackHandler, NULL)); |
|
|
|
|
|
|
|
|
CUPTI_CALL(cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_DRIVER_API, CUPTI_DRIVER_TRACE_CBID_cuLaunch)); |
|
|
CUPTI_CALL(cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_DRIVER_API, CUPTI_DRIVER_TRACE_CBID_cuLaunchGrid)); |
|
|
CUPTI_CALL(cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_DRIVER_API, CUPTI_DRIVER_TRACE_CBID_cuLaunchGridAsync)); |
|
|
CUPTI_CALL(cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_DRIVER_API, CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel)); |
|
|
CUPTI_CALL(cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_DRIVER_API, CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel_ptsz)); |
|
|
CUPTI_CALL(cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_DRIVER_API, CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel)); |
|
|
CUPTI_CALL(cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_DRIVER_API, CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel_ptsz)); |
|
|
CUPTI_CALL(cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_DRIVER_API, CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice)); |
|
|
|
|
|
CUPTI_CALL(cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_RESOURCE, CUPTI_CBID_RESOURCE_MODULE_LOADED)); |
|
|
CUPTI_CALL(cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_RESOURCE, CUPTI_CBID_RESOURCE_CONTEXT_CREATED)); |
|
|
CUPTI_CALL(cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_RESOURCE, CUPTI_CBID_RESOURCE_CONTEXT_DESTROY_STARTING)); |
|
|
|
|
|
g_initializedInjection = true; |
|
|
} |
|
|
|
|
|
registerAtExitHandler(); |
|
|
g_initializeInjectionMutex.unlock(); |
|
|
|
|
|
return 1; |
|
|
} |
|
|
|