| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| #include "cupti_external_correlation.h" |
|
|
// Element-wise integer vector addition: C[i] = A[i] + B[i] for 0 <= i < N.
//
// Each thread handles at most one element, selected by its flat 1D global
// index (blockIdx.x * blockDim.x + threadIdx.x). Threads whose index falls
// at or beyond N do nothing, so the grid may over-cover the data.
__global__ void VecAdd(const int* A, const int* B, int* C, int N) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Tail guard: the last block may contain threads past the end of the data.
    if (idx >= N) {
        return;
    }

    C[idx] = A[idx] + B[idx];
}
|
|
// Fills the first n entries of vec with their own index (vec[k] = k).
// Writes nothing when n <= 0.
static void initVec(int *vec, int n) {
    int idx = 0;
    while (idx < n) {
        vec[idx] = idx;
        ++idx;
    }
}
|
|
// Runs a 50000-element integer vector addition on device 0 while tagging
// each phase of the work with a CUPTI external correlation id.
//
// Three phases are bracketed by cuptiActivityPushExternalCorrelationId /
// cuptiActivityPopExternalCorrelationId pairs (kind UNKNOWN):
//   1. INITIALIZATION_EXTERNAL_ID - context creation, device allocation and
//      host-to-device copies.
//   2. EXECUTION_EXTERNAL_ID      - kernel launch, synchronization and the
//      device-to-host copy of the result.
//   3. CLEANUP_EXTERNAL_ID        - device memory release.
//
// On host allocation failure the function prints an error, releases any host
// buffers that were obtained, and returns before touching CUPTI or the GPU.
// Failures of driver, runtime or CUPTI calls are handled by the
// DRIVER_API_CALL / RUNTIME_API_CALL / CUPTI_CALL macros (defined outside
// this block).
void vectorAdd() {
    CUcontext context = 0;
    CUdevice device = 0;
    const int N = 50000;
    const size_t size = N * sizeof(int);
    int threadsPerBlock = 0;
    int blocksPerGrid = 0;
    int *h_A = 0, *h_B = 0, *h_C = 0;
    int *d_A = 0, *d_B = 0, *d_C = 0;
    uint64_t id = 0;  // receives the id returned by each pop

    DRIVER_API_CALL(cuDeviceGet(&device, 0));

    // Allocate the two input vectors and the output vector in host memory.
    h_A = (int*) malloc(size);
    h_B = (int*) malloc(size);
    h_C = (int*) malloc(size);

    if (!h_A || !h_B || !h_C) {
        printf("Error: Out of memory\n");
        // Release whichever buffers were obtained; free(NULL) is a no-op,
        // so no per-pointer check is needed.
        free(h_A);
        free(h_B);
        free(h_C);
        return;
    }

    // Initialize the inputs (element k holds k) and clear the output.
    initVec(h_A, N);
    initVec(h_B, N);
    memset(h_C, 0, size);

    // ---- Phase 1: initialization ------------------------------------------
    CUPTI_CALL(cuptiActivityPushExternalCorrelationId(
        CUPTI_EXTERNAL_CORRELATION_KIND_UNKNOWN,
        static_cast<uint64_t>(INITIALIZATION_EXTERNAL_ID)));

    DRIVER_API_CALL(cuCtxCreate(&context, 0, device));

    // Allocate the vectors in device memory.
    RUNTIME_API_CALL(cudaMalloc((void**) &d_A, size));
    RUNTIME_API_CALL(cudaMalloc((void**) &d_B, size));
    RUNTIME_API_CALL(cudaMalloc((void**) &d_C, size));

    // Copy the inputs from host memory to device memory.
    RUNTIME_API_CALL(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
    RUNTIME_API_CALL(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice));

    CUPTI_CALL(cuptiActivityPopExternalCorrelationId(
        CUPTI_EXTERNAL_CORRELATION_KIND_UNKNOWN, &id));

    // ---- Phase 2: execution -----------------------------------------------
    CUPTI_CALL(cuptiActivityPushExternalCorrelationId(
        CUPTI_EXTERNAL_CORRELATION_KIND_UNKNOWN,
        static_cast<uint64_t>(EXECUTION_EXTERNAL_ID)));

    // One thread per element; ceil-divide so the tail of the data is covered.
    threadsPerBlock = 256;
    blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
    // A kernel launch returns no status itself; launch-configuration errors
    // are only reported through cudaGetLastError().
    RUNTIME_API_CALL(cudaGetLastError());
    DRIVER_API_CALL(cuCtxSynchronize());

    // Copy the result from device memory to host memory.
    RUNTIME_API_CALL(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost));

    CUPTI_CALL(cuptiActivityPopExternalCorrelationId(
        CUPTI_EXTERNAL_CORRELATION_KIND_UNKNOWN, &id));

    // ---- Phase 3: cleanup -------------------------------------------------
    CUPTI_CALL(cuptiActivityPushExternalCorrelationId(
        CUPTI_EXTERNAL_CORRELATION_KIND_UNKNOWN,
        static_cast<uint64_t>(CLEANUP_EXTERNAL_ID)));

    RUNTIME_API_CALL(cudaFree(d_A));
    RUNTIME_API_CALL(cudaFree(d_B));
    RUNTIME_API_CALL(cudaFree(d_C));

    CUPTI_CALL(cuptiActivityPopExternalCorrelationId(
        CUPTI_EXTERNAL_CORRELATION_KIND_UNKNOWN, &id));

    // Host buffers are released outside any external-correlation range.
    free(h_A);
    free(h_B);
    free(h_C);

    DRIVER_API_CALL(cuCtxSynchronize());
    DRIVER_API_CALL(cuCtxDestroy(context));
}
|
|
// Program entry point. Command-line arguments are accepted but not used.
//
// Sequence: initTrace() -> cuInit(0) -> vectorAdd() -> finiTrace(), then the
// process ends with a success status.
// NOTE(review): initTrace/finiTrace are defined outside this block;
// presumably they set up and tear down CUPTI activity tracing - confirm.
int main(int argc, char *argv[]) {
    initTrace();

    // The driver API must be initialized before vectorAdd() issues cu* calls.
    DRIVER_API_CALL(cuInit(0));

    vectorAdd();

    finiTrace();

    return EXIT_SUCCESS;
}