| |
| |
| |
| |
| |
| |
| |
|
|
| #include <stdio.h> |
| #include <cuda.h> |
| #include <cupti.h> |
| #include <stdlib.h> |
|
|
| #define METRIC_NAME "ipc" |
|
|
| #ifndef EXIT_WAIVED |
| #define EXIT_WAIVED 2 |
| #endif |
|
|
| #define DRIVER_API_CALL(apiFuncCall) \ |
| do { \ |
| CUresult _status = apiFuncCall |
| if (_status != CUDA_SUCCESS) { \ |
| fprintf(stderr, "%s:%d: error: function %s failed with error %d.\n", \ |
| __FILE__, __LINE__, #apiFuncCall, _status); \ |
| exit(EXIT_FAILURE) |
| } \ |
| } while (0) |
|
|
| #define RUNTIME_API_CALL(apiFuncCall) \ |
| do { \ |
| cudaError_t _status = apiFuncCall |
| if (_status != cudaSuccess) { \ |
| fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \ |
| __FILE__, __LINE__, #apiFuncCall, cudaGetErrorString(_status));\ |
| exit(EXIT_FAILURE) |
| } \ |
| } while (0) |
|
|
| #define CUPTI_CALL(call) \ |
| do { \ |
| CUptiResult _status = call |
| if (_status != CUPTI_SUCCESS) { \ |
| const char *errstr |
| cuptiGetResultString(_status, &errstr) |
| fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \ |
| __FILE__, __LINE__, #call, errstr); \ |
| exit(EXIT_FAILURE) |
| } \ |
| } while (0) |
|
|
| #define ALIGN_SIZE (8) |
| #define ALIGN_BUFFER(buffer, align) \ |
| (((uintptr_t) (buffer) & ((align)-1)) ? ((buffer) + (align) - ((uintptr_t) (buffer) & ((align)-1))) : (buffer)) |
|
|
| |
| typedef struct MetricData_st { |
| |
| CUdevice device |
| |
| CUpti_EventGroupSet *eventGroups |
| |
| |
| uint32_t eventIdx |
| |
| uint32_t numEvents |
| |
| CUpti_EventID *eventIdArray |
| |
| uint64_t *eventValueArray |
| } MetricData_t |
|
|
| static uint64_t kernelDuration |
|
|
| |
| __global__ void VecAdd(const int* A, const int* B, int* C, int N) |
| { |
| int i = blockDim.x * blockIdx.x + threadIdx.x |
| if (i < N) |
| C[i] = A[i] + B[i] |
| } |
|
|
| static void |
| initVec(int *vec, int n) |
| { |
| for (int i=0 |
| vec[i] = i |
| } |
|
|
| void CUPTIAPI |
| getMetricValueCallback(void *userdata, CUpti_CallbackDomain domain, |
| CUpti_CallbackId cbid, const CUpti_CallbackData *cbInfo) |
| { |
| MetricData_t *metricData = (MetricData_t*)userdata |
| unsigned int i, j, k |
|
|
| |
| |
| if ((cbid != CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020) && |
| (cbid != CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000)) |
| { |
| printf("%s:%d: unexpected cbid %d\n", __FILE__, __LINE__, cbid) |
| exit(EXIT_FAILURE) |
| } |
|
|
| |
| |
| if (cbInfo->callbackSite == CUPTI_API_ENTER) { |
| cudaDeviceSynchronize() |
|
|
| CUPTI_CALL(cuptiSetEventCollectionMode(cbInfo->context, |
| CUPTI_EVENT_COLLECTION_MODE_KERNEL)) |
|
|
| for (i = 0 |
| uint32_t all = 1 |
| CUPTI_CALL(cuptiEventGroupSetAttribute(metricData->eventGroups->eventGroups[i], |
| CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES, |
| sizeof(all), &all)) |
| CUPTI_CALL(cuptiEventGroupEnable(metricData->eventGroups->eventGroups[i])) |
| } |
| } |
|
|
| |
| if (cbInfo->callbackSite == CUPTI_API_EXIT) { |
| cudaDeviceSynchronize() |
|
|
| |
| |
| for (i = 0 |
| CUpti_EventGroup group = metricData->eventGroups->eventGroups[i] |
| CUpti_EventDomainID groupDomain |
| uint32_t numEvents, numInstances, numTotalInstances |
| CUpti_EventID *eventIds |
| size_t groupDomainSize = sizeof(groupDomain) |
| size_t numEventsSize = sizeof(numEvents) |
| size_t numInstancesSize = sizeof(numInstances) |
| size_t numTotalInstancesSize = sizeof(numTotalInstances) |
| uint64_t *values, normalized, *sum |
| size_t valuesSize, eventIdsSize |
| size_t numCountersRead = 0 |
|
|
| CUPTI_CALL(cuptiEventGroupGetAttribute(group, |
| CUPTI_EVENT_GROUP_ATTR_EVENT_DOMAIN_ID, |
| &groupDomainSize, &groupDomain)) |
| CUPTI_CALL(cuptiDeviceGetEventDomainAttribute(metricData->device, groupDomain, |
| CUPTI_EVENT_DOMAIN_ATTR_TOTAL_INSTANCE_COUNT, |
| &numTotalInstancesSize, &numTotalInstances)) |
| CUPTI_CALL(cuptiEventGroupGetAttribute(group, |
| CUPTI_EVENT_GROUP_ATTR_INSTANCE_COUNT, |
| &numInstancesSize, &numInstances)) |
| CUPTI_CALL(cuptiEventGroupGetAttribute(group, |
| CUPTI_EVENT_GROUP_ATTR_NUM_EVENTS, |
| &numEventsSize, &numEvents)) |
| eventIdsSize = numEvents * sizeof(CUpti_EventID) |
| eventIds = (CUpti_EventID *)malloc(eventIdsSize) |
| CUPTI_CALL(cuptiEventGroupGetAttribute(group, |
| CUPTI_EVENT_GROUP_ATTR_EVENTS, |
| &eventIdsSize, eventIds)) |
|
|
| valuesSize = sizeof(uint64_t) * numInstances * numEvents |
| values = (uint64_t *)malloc(valuesSize) |
|
|
| CUPTI_CALL(cuptiEventGroupReadAllEvents(group, |
| CUPTI_EVENT_READ_FLAG_NONE, |
| &valuesSize, |
| values, |
| &eventIdsSize, |
| eventIds, |
| &numCountersRead)) |
|
|
| if (metricData->eventIdx >= metricData->numEvents) { |
| fprintf(stderr, "error: too many events collected, metric expects only %d\n", |
| (int)metricData->numEvents) |
| exit(EXIT_FAILURE) |
| } |
|
|
| sum = (uint64_t *)calloc(sizeof(uint64_t), numEvents) |
| |
| for (k = 0 |
| for (j = 0 |
| sum[j] += values[(k * numEvents) + j] |
| } |
| } |
|
|
| for (j = 0 |
| |
| |
| normalized = (sum[j] * numTotalInstances) / numInstances |
|
|
| metricData->eventIdArray[metricData->eventIdx] = eventIds[j] |
| metricData->eventValueArray[metricData->eventIdx] = normalized |
| metricData->eventIdx++ |
|
|
| |
| { |
| char eventName[128] |
| size_t eventNameSize = sizeof(eventName) - 1 |
| CUPTI_CALL(cuptiEventGetAttribute(eventIds[j], CUPTI_EVENT_ATTR_NAME, |
| &eventNameSize, eventName)) |
| eventName[127] = '\0' |
| printf("\t%s = %llu (", eventName, (unsigned long long)sum[j]) |
| if (numInstances > 1) { |
| for (k = 0 |
| if (k != 0) |
| printf(", ") |
| printf("%llu", (unsigned long long)values[(k * numEvents) + j]) |
| } |
| } |
|
|
| printf(")\n") |
| printf("\t%s (normalized) (%llu * %u) / %u = %llu\n", |
| eventName, (unsigned long long)sum[j], |
| numTotalInstances, numInstances, |
| (unsigned long long)normalized) |
| } |
| } |
|
|
| free(values) |
| free(sum) |
| } |
|
|
| for (i = 0 |
| CUPTI_CALL(cuptiEventGroupDisable(metricData->eventGroups->eventGroups[i])) |
| } |
| } |
|
|
| static void |
| cleanUp(int *h_A, int *h_B, int *h_C, int *d_A, int *d_B, int *d_C) |
| { |
| if (d_A) |
| cudaFree(d_A) |
| if (d_B) |
| cudaFree(d_B) |
| if (d_C) |
| cudaFree(d_C) |
|
|
| |
| if (h_A) |
| free(h_A) |
| if (h_B) |
| free(h_B) |
| if (h_C) |
| free(h_C) |
| } |
|
|
| static void |
| runPass() |
| { |
| int N = 50000 |
| size_t size = N * sizeof(int) |
| int threadsPerBlock = 0 |
| int blocksPerGrid = 0 |
| int *h_A, *h_B, *h_C |
| int *d_A, *d_B, *d_C |
| int i, sum |
|
|
| |
| h_A = (int*)malloc(size) |
| h_B = (int*)malloc(size) |
| h_C = (int*)malloc(size) |
|
|
| |
| initVec(h_A, N) |
| initVec(h_B, N) |
| memset(h_C, 0, size) |
|
|
| |
| cudaMalloc((void**)&d_A, size) |
| cudaMalloc((void**)&d_B, size) |
| cudaMalloc((void**)&d_C, size) |
|
|
| |
| cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice) |
| cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice) |
|
|
| |
| threadsPerBlock = 256 |
| blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock |
| printf("Launching kernel: blocks %d, thread/block %d\n", |
| blocksPerGrid, threadsPerBlock) |
|
|
| VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N) |
|
|
| |
| |
| cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost) |
|
|
| |
| for (i = 0 |
| sum = h_A[i] + h_B[i] |
| if (h_C[i] != sum) { |
| fprintf(stderr, "error: result verification failed\n") |
| exit(EXIT_FAILURE) |
| } |
| } |
|
|
| cleanUp(h_A, h_B, h_C, d_A, d_B, d_C) |
| } |
|
|
| static void CUPTIAPI |
| bufferRequested(uint8_t **buffer, size_t *size, size_t *maxNumRecords) |
| { |
| uint8_t *rawBuffer |
|
|
| *size = 16 * 1024 |
| rawBuffer = (uint8_t *)malloc(*size + ALIGN_SIZE) |
|
|
| *buffer = ALIGN_BUFFER(rawBuffer, ALIGN_SIZE) |
| *maxNumRecords = 0 |
|
|
| if (*buffer == NULL) { |
| printf("Error: out of memory\n") |
| exit(EXIT_FAILURE) |
| } |
| } |
|
|
| static void CUPTIAPI |
| bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, size_t size, size_t validSize) |
| { |
| CUpti_Activity *record = NULL |
| CUpti_ActivityKernel8 *kernel |
|
|
| |
| CUPTI_CALL(cuptiActivityGetNextRecord(buffer, validSize, &record)) |
|
|
| kernel = (CUpti_ActivityKernel8 *)record |
| if (kernel->kind != CUPTI_ACTIVITY_KIND_KERNEL) { |
| fprintf(stderr, "Error: expected kernel activity record, got %d\n", (int)kernel->kind) |
| exit(EXIT_FAILURE) |
| } |
|
|
| kernelDuration = kernel->end - kernel->start |
| free(buffer) |
| } |
|
|
| int |
| main(int argc, char *argv[]) |
| { |
| CUpti_SubscriberHandle subscriber |
| CUcontext context = 0 |
| CUdevice device = 0 |
| int deviceNum |
| int deviceCount |
| char deviceName[256] |
| const char *metricName |
| CUpti_MetricID metricId |
| CUpti_EventGroupSets *passData |
| MetricData_t metricData |
| unsigned int pass |
| CUpti_MetricValue metricValue |
|
|
| printf("Usage: %s [device_num] [metric_name]\n", argv[0]) |
|
|
| |
| CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL)) |
|
|
| DRIVER_API_CALL(cuInit(0)) |
| DRIVER_API_CALL(cuDeviceGetCount(&deviceCount)) |
| if (deviceCount == 0) { |
| printf("There is no device supporting CUDA.\n") |
| exit(EXIT_WAIVED) |
| } |
|
|
| if (argc > 1) |
| deviceNum = atoi(argv[1]) |
| else |
| deviceNum = 0 |
| printf("CUDA Device Number: %d\n", deviceNum) |
|
|
| DRIVER_API_CALL(cuDeviceGet(&device, deviceNum)) |
| DRIVER_API_CALL(cuDeviceGetName(deviceName, 256, device)) |
| printf("CUDA Device Name: %s\n", deviceName) |
|
|
| int major, minor |
| DRIVER_API_CALL(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device)) |
| DRIVER_API_CALL(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device)) |
|
|
| int deviceComputeCapability = 10 * major + minor |
| printf("Compute Capability of Device: %d.%d\n", major,minor) |
| if(deviceComputeCapability > 72) { |
| printf("Sample unsupported on Device with compute capability > 7.2\n") |
| exit(EXIT_WAIVED) |
| } |
|
|
| DRIVER_API_CALL(cuCtxCreate(&context, 0, device)) |
|
|
| |
| if (argc > 2) |
| metricName = argv[2] |
| else { |
| metricName = METRIC_NAME |
| } |
|
|
| |
| |
| |
| |
| { |
| CUPTI_CALL(cuptiActivityRegisterCallbacks(bufferRequested, bufferCompleted)) |
| runPass() |
| cudaDeviceSynchronize() |
| CUPTI_CALL(cuptiActivityFlushAll(0)) |
| } |
|
|
| |
| CUPTI_CALL(cuptiSubscribe(&subscriber, (CUpti_CallbackFunc)getMetricValueCallback, &metricData)) |
| CUPTI_CALL(cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_RUNTIME_API, |
| CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020)) |
| CUPTI_CALL(cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_RUNTIME_API, |
| CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000)) |
|
|
| |
| CUPTI_CALL(cuptiMetricGetIdFromName(device, metricName, &metricId)) |
| CUPTI_CALL(cuptiMetricGetNumEvents(metricId, &metricData.numEvents)) |
| metricData.device = device |
| metricData.eventIdArray = (CUpti_EventID *)malloc(metricData.numEvents * sizeof(CUpti_EventID)) |
| metricData.eventValueArray = (uint64_t *)malloc(metricData.numEvents * sizeof(uint64_t)) |
| metricData.eventIdx = 0 |
|
|
| |
| |
| CUPTI_CALL(cuptiMetricCreateEventGroupSets(context, sizeof(metricId), &metricId, &passData)) |
| for (pass = 0 |
| printf("Pass %u\n", pass) |
| metricData.eventGroups = passData->sets + pass |
| runPass() |
| } |
|
|
| if (metricData.eventIdx != metricData.numEvents) { |
| fprintf(stderr, "error: expected %u metric events, got %u\n", |
| metricData.numEvents, metricData.eventIdx) |
| exit(EXIT_FAILURE) |
| } |
|
|
| |
| CUPTI_CALL(cuptiMetricGetValue(device, metricId, |
| metricData.numEvents * sizeof(CUpti_EventID), |
| metricData.eventIdArray, |
| metricData.numEvents * sizeof(uint64_t), |
| metricData.eventValueArray, |
| kernelDuration, &metricValue)) |
|
|
| |
| { |
| CUpti_MetricValueKind valueKind |
| size_t valueKindSize = sizeof(valueKind) |
| CUPTI_CALL(cuptiMetricGetAttribute(metricId, CUPTI_METRIC_ATTR_VALUE_KIND, |
| &valueKindSize, &valueKind)) |
| switch (valueKind) { |
| case CUPTI_METRIC_VALUE_KIND_DOUBLE: |
| printf("Metric %s = %f\n", metricName, metricValue.metricValueDouble) |
| break |
| case CUPTI_METRIC_VALUE_KIND_UINT64: |
| printf("Metric %s = %llu\n", metricName, |
| (unsigned long long)metricValue.metricValueUint64) |
| break |
| case CUPTI_METRIC_VALUE_KIND_INT64: |
| printf("Metric %s = %lld\n", metricName, |
| (long long)metricValue.metricValueInt64) |
| break |
| case CUPTI_METRIC_VALUE_KIND_PERCENT: |
| printf("Metric %s = %f%%\n", metricName, metricValue.metricValuePercent) |
| break |
| case CUPTI_METRIC_VALUE_KIND_THROUGHPUT: |
| printf("Metric %s = %llu bytes/sec\n", metricName, |
| (unsigned long long)metricValue.metricValueThroughput) |
| break |
| case CUPTI_METRIC_VALUE_KIND_UTILIZATION_LEVEL: |
| printf("Metric %s = utilization level %u\n", metricName, |
| (unsigned int)metricValue.metricValueUtilizationLevel) |
| break |
| default: |
| fprintf(stderr, "error: unknown value kind\n") |
| exit(EXIT_FAILURE) |
| } |
| } |
|
|
| CUPTI_CALL(cuptiUnsubscribe(subscriber)) |
| exit(EXIT_SUCCESS) |
| } |
|
|