|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include "cuda.h" |
|
|
#include "cuda_runtime_api.h" |
|
|
#include "driver_types.h" |
|
|
|
|
|
|
|
|
#include <chrono> |
|
|
#include <cstdint> |
|
|
#include <iostream> |
|
|
using ::std::cerr; |
|
|
using ::std::cout; |
|
|
using ::std::endl; |
|
|
|
|
|
#include <string> |
|
|
using ::std::string; |
|
|
|
|
|
#include <thread> |
|
|
using ::std::thread; |
|
|
|
|
|
#include <vector> |
|
|
using ::std::vector; |
|
|
|
|
|
#include <stdlib.h> |
|
|
|
|
|
// Exit code used when the test cannot run in this environment
// (e.g. no CUDA devices are present); distinct from EXIT_FAILURE.
#ifndef EXIT_WAIVED
#define EXIT_WAIVED 2
#endif
|
|
|
|
|
|
|
|
// Check a CUDA Runtime API call: on any status other than cudaSuccess,
// print the source location, the failing expression, and the human-readable
// error string, then terminate the process.
#define RUNTIME_API_CALL(apiFuncCall)                                         \
    do {                                                                      \
        cudaError_t _status = apiFuncCall;                                    \
        if (_status != cudaSuccess) {                                         \
            fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \
                __FILE__, __LINE__, #apiFuncCall, cudaGetErrorString(_status));\
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)
|
|
|
|
|
// Abort if an allocation left `var` as a NULL pointer. In this file it is a
// belt-and-braces check after cudaMalloc, whose error status is already
// verified separately via RUNTIME_API_CALL.
#define MEMORY_ALLOCATION_CALL(var)                                           \
    do {                                                                      \
        if (var == NULL) {                                                    \
            fprintf(stderr, "%s:%d: Error: Memory Allocation Failed \n",      \
                __FILE__, __LINE__);                                          \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)
|
|
|
|
|
// Check a CUDA Driver API call: on any status other than CUDA_SUCCESS,
// print the source location, the failing expression, and the numeric
// CUresult code, then terminate the process.
#define DRIVER_API_CALL(apiFuncCall)                                          \
    do {                                                                      \
        CUresult _status = apiFuncCall;                                       \
        if (_status != CUDA_SUCCESS) {                                        \
            fprintf(stderr, "%s:%d: error: function %s failed with error %d.\n", \
                __FILE__, __LINE__, #apiFuncCall, _status);                   \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)
|
|
|
|
|
|
|
|
// Per-GPU state: the driver context plus one stream and one pair of device
// buffers per kernel launch. streams[s], d_x[s], d_y[s] together describe
// the s-th launch on this device.
typedef struct {
    int deviceID;                  // CUDA device ordinal this state belongs to
    CUcontext context;             // driver context created on this device
    vector<cudaStream_t> streams;  // one stream per kernel launch
    vector<double *> d_x;          // device input vectors, one per stream
    vector<double *> d_y;          // device in/out accumulators, one per stream
} perDeviceData;
|
|
|
|
|
// Number of times each element is recomputed; inflates kernel runtime so
// that launch overhead and stream overlap are observable from the host.
#define DAXPY_REPEAT 32768

// DAXPY (y = a*x + y), repeated DAXPY_REPEAT times per element to make the
// kernel long-running.
//
// Uses a grid-stride loop so the result is correct for any
// <<<blocks, threads>>> configuration: the original form started at
// threadIdx.x and strode by blockDim.x only, so with gridDim.x > 1 every
// block would redo — and race on — the entire range. Behavior is identical
// for the single-block launches this file uses.
__global__ void daxpyKernel(int elements, double a, double * x, double * y)
{
    int stride = gridDim.x * blockDim.x;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < elements; i += stride)
    {
        for (int j = 0; j < DAXPY_REPEAT; j++)
        {
            y[i] = a * x[i] + y[i];
        }
    }
}
|
|
|
|
|
|
|
|
// Scalar multiplier for the DAXPY computation (y = a*x + y).
double a = 2.5;

// Deliberately tiny launch configuration — a single warp in a single block —
// so each kernel runs long enough for launch/overlap timing differences to
// be visible on the host.
int threadsPerBlock = 32;

int threadBlocks = 1;

// One kernel launch per stream on each device.
int const numKernels = 4;

int const numStreams = numKernels;

// Per-stream element count; filled in main() as blockSize * (stream + 1).
vector<size_t> elements(numKernels);

// Base workload unit: stream s processes blockSize * (s + 1) elements.
int const blockSize = 4 * 1024;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Launch one daxpyKernel per entry of d.streams on device d.deviceID and
// block until all of them have completed.
//
// d         - per-device state (context, streams, device buffers).
// rangeName - label intended for an external profiler range annotation;
//             currently unused in this build (kept for interface
//             compatibility with callers).
// serial    - true:  launch every kernel on the legacy default stream (0),
//                    so they execute back to back;
//             false: launch each kernel on its own stream so they may
//                    execute concurrently.
//
// Reads the file-scope globals threadBlocks, threadsPerBlock, elements, a.
void launchKernels(perDeviceData &d, char const * const rangeName, bool serial)
{
    (void)rangeName;  // silence unused-parameter warning; see note above

    // Bind both the runtime device and the driver context to this thread:
    // this function also runs on freshly spawned worker threads, which
    // start with no current context.
    RUNTIME_API_CALL(cudaSetDevice(d.deviceID));
    DRIVER_API_CALL(cuCtxSetCurrent(d.context));

    for (unsigned int stream = 0; stream < d.streams.size(); stream++)
    {
        cudaStream_t streamId = (serial ? 0 : d.streams[stream]);
        daxpyKernel <<<threadBlocks, threadsPerBlock, 0, streamId>>> (elements[stream], a, d.d_x[stream], d.d_y[stream]);
        // Kernel launches return no status directly; query it explicitly.
        RUNTIME_API_CALL(cudaGetLastError());
    }

    // Wait for completion on whichever stream(s) were actually used.
    if (serial == false)
    {
        for (unsigned int stream = 0; stream < d.streams.size(); stream++)
        {
            RUNTIME_API_CALL(cudaStreamSynchronize(d.streams[stream]));
        }
    }
    else
    {
        RUNTIME_API_CALL(cudaStreamSynchronize(0));
    }
}
|
|
|
|
|
|
|
|
// Measures host-side kernel launch/completion time in three configurations:
//   1. serial launches on the default stream of a single device,
//   2. concurrent launches on separate streams of a single device,
//   3. concurrent launches across all devices, one host thread per device.
// Returns EXIT_SUCCESS, or exits with EXIT_WAIVED when no device is found.
int main(int argc, char * argv[])
{
    int numDevices;
    RUNTIME_API_CALL(cudaGetDeviceCount(&numDevices));

    vector<int> device_ids;
    for (int i = 0; i < numDevices; i++)
    {
        device_ids.push_back(i);
    }

    numDevices = device_ids.size();
    cout << "Found " << numDevices << " devices" << endl;

    if (numDevices == 0)
    {
        cerr << "No devices detected" << endl;
        exit(EXIT_WAIVED);
    }

    // Host source data, sized for the largest (last) stream's workload.
    vector<double> h_x(blockSize * numKernels);
    vector<double> h_y(blockSize * numKernels);
    for (size_t i = 0; i < h_x.size(); i++)
    {
        h_x[i] = 1.5 * i;
        // Cast before subtracting: i is unsigned, so (i - 3000) would wrap
        // around for i < 3000 and yield huge positive values.
        h_y[i] = 2.0 * (static_cast<double>(i) - 3000.0);
    }

    // Give each stream a different amount of work.
    for (int stream = 0; stream < numStreams; stream++)
    {
        elements[stream] = blockSize * (stream + 1);
    }

    vector<perDeviceData> deviceData(numDevices);

    for (int device = 0; device < numDevices; device++)
    {
        RUNTIME_API_CALL(cudaSetDevice(device_ids[device]));
        cout << "Configuring device " << device_ids[device] << endl;

        deviceData[device].deviceID = device_ids[device];

        // Each device gets its own driver context so worker threads can
        // bind to it independently via cuCtxSetCurrent() in launchKernels().
        DRIVER_API_CALL(cuCtxCreate(&(deviceData[device].context), 0, device_ids[device]));

        deviceData[device].streams.resize(numStreams);
        deviceData[device].d_x.resize(numStreams);
        deviceData[device].d_y.resize(numStreams);
        for (int stream = 0; stream < numStreams; stream++)
        {
            RUNTIME_API_CALL(cudaStreamCreate(&(deviceData[device].streams[stream])));

            size_t size = elements[stream] * sizeof(double);

            RUNTIME_API_CALL(cudaMalloc(&(deviceData[device].d_x[stream]), size));
            MEMORY_ALLOCATION_CALL(deviceData[device].d_x[stream]);
            RUNTIME_API_CALL(cudaMemcpy(deviceData[device].d_x[stream], h_x.data(), size, cudaMemcpyHostToDevice));

            RUNTIME_API_CALL(cudaMalloc(&(deviceData[device].d_y[stream]), size));
            MEMORY_ALLOCATION_CALL(deviceData[device].d_y[stream]);
            // Initialize d_y from h_y. (The previous code copied h_x here,
            // so the accumulator started with x's values instead of y's.)
            RUNTIME_API_CALL(cudaMemcpy(deviceData[device].d_y[stream], h_y.data(), size, cudaMemcpyHostToDevice));
        }
    }

    // --- 1. Serial launches on the default stream of device 0 ---
    auto begin_time = ::std::chrono::high_resolution_clock::now();
    launchKernels(deviceData[0], "single_gpu_serial", true);
    auto end_time = ::std::chrono::high_resolution_clock::now();
    auto elapsed_serial_ms = ::std::chrono::duration_cast<::std::chrono::milliseconds>(end_time - begin_time);
    cout << "It took " << elapsed_serial_ms.count() << "ms on the host to launch " << numKernels << " kernels in serial" << endl;

    // --- 2. Concurrent launches on separate streams of device 0 ---
    begin_time = ::std::chrono::high_resolution_clock::now();
    launchKernels(deviceData[0], "single_gpu_async", false);
    end_time = ::std::chrono::high_resolution_clock::now();
    auto elapsed_single_device_ms = ::std::chrono::duration_cast<::std::chrono::milliseconds>(end_time - begin_time);
    cout << "It took " << elapsed_single_device_ms.count() << "ms on the host to launch " << numKernels << " kernels on a single device on separate streams" << endl;

    // --- 3. Concurrent launches on every device, one host thread each ---
    vector<::std::thread> threads;
    begin_time = ::std::chrono::high_resolution_clock::now();
    for (int device = 0; device < numDevices; device++)
    {
        threads.push_back(::std::thread(launchKernels, ::std::ref(deviceData[device]), "multi_gpu_async", false));
    }
    for (auto &t: threads)
    {
        t.join();
    }
    end_time = ::std::chrono::high_resolution_clock::now();
    auto elapsed_multiple_device_ms = ::std::chrono::duration_cast<::std::chrono::milliseconds>(end_time - begin_time);
    cout << "It took " << elapsed_multiple_device_ms.count() << "ms on the host to launch the same " << numKernels << " kernels on each of the " << numDevices << " devices in parallel" << endl;

    // Release device resources. Make each device's context current first so
    // the buffers and streams are freed in the context that created them,
    // then tear the context itself down.
    for (int i = 0; i < numDevices; i++)
    {
        DRIVER_API_CALL(cuCtxSetCurrent(deviceData[i].context));
        for (int j = 0; j < numKernels; j++)
        {
            RUNTIME_API_CALL(cudaFree(deviceData[i].d_x[j]));
            RUNTIME_API_CALL(cudaFree(deviceData[i].d_y[j]));
            RUNTIME_API_CALL(cudaStreamDestroy(deviceData[i].streams[j]));
        }
        DRIVER_API_CALL(cuCtxDestroy(deviceData[i].context));
    }

    // return (not exit) so host-side destructors run.
    return EXIT_SUCCESS;
}
|
|
|