MNghia's picture
Add files using upload-large-folder tool
563c80f verified
// Copyright 2021 NVIDIA Corporation. All rights reserved
//
// This is a sample CUDA application with several different kernel launch
// patterns - launching on the default stream, multple streams, and multiple
// threads on different devices, if more than one device is present.
//
// The injection sample shared library can be used on this sample application,
// demonstrating that the injection code handles multple streams and multiple
// threads.
// Standard CUDA
#include "cuda.h"
#include "cuda_runtime_api.h"
#include "driver_types.h"
// Standard STL headers
#include <chrono>
#include <cstdint>
#include <iostream>
using ::std::cerr;
using ::std::cout;
using ::std::endl;
#include <string>
using ::std::string;
#include <thread>
using ::std::thread;
#include <vector>
using ::std::vector;
#include <stdlib.h>
#ifndef EXIT_WAIVED
#define EXIT_WAIVED 2
#endif
// Helpful error handlers for standard CUDA runtime calls
#define RUNTIME_API_CALL(apiFuncCall) \
do { \
cudaError_t _status = apiFuncCall; \
if (_status != cudaSuccess) { \
fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \
__FILE__, __LINE__, #apiFuncCall, cudaGetErrorString(_status));\
exit(EXIT_FAILURE); \
} \
} while (0)
#define MEMORY_ALLOCATION_CALL(var) \
do { \
if (var == NULL) { \
fprintf(stderr, "%s:%d: Error: Memory Allocation Failed \n", \
__FILE__, __LINE__); \
exit(EXIT_FAILURE); \
} \
} while (0)
#define DRIVER_API_CALL(apiFuncCall) \
do { \
CUresult _status = apiFuncCall; \
if (_status != CUDA_SUCCESS) { \
fprintf(stderr, "%s:%d: error: function %s failed with error %d.\n", \
__FILE__, __LINE__, #apiFuncCall, _status); \
exit(EXIT_FAILURE); \
} \
} while (0)
// Per-device configuration, buffers, stream and device information, and device pointers
typedef struct {
int deviceID;
CUcontext context; //!< CUDA driver context, or NULL if default context has already been initialized
vector<cudaStream_t> streams; // Each device needs its own streams
vector<double *> d_x; // And device memory allocation
vector<double *> d_y; // ..
} perDeviceData;
#define DAXPY_REPEAT 32768
// Loop over array of elements performing daxpy multiple times
// To be launched with only one block (artificially increasing serial time to better demonstrate overlapping replay)
__global__ void daxpyKernel(int elements, double a, double * x, double * y)
{
for (int i = threadIdx.x; i < elements; i += blockDim.x)
// Artificially increase kernel runtime to emphasize concurrency
for (int j = 0; j < DAXPY_REPEAT; j++)
y[i] = a * x[i] + y[i]; // daxpy
}
// Initialize kernel values
double a = 2.5;
// Normally you would want multiple warps, but to emphasize concurrency with streams and multiple devices
// we run the kernels on a single warp.
int threadsPerBlock = 32;
int threadBlocks = 1;
// Configurable number of kernels (streams, when running concurrently)
int const numKernels = 4;
int const numStreams = numKernels;
vector<size_t> elements(numKernels);
// Each kernel call allocates and computes (call number) * (blockSize) elements
// For 4 calls, this is 4k elements * 2 arrays * (1 + 2 + 3 + 4 stream mul) * 8B/elem =~ 640KB
int const blockSize = 4 * 1024;
// Wrapper which will launch numKernel kernel calls on a single device
// The device streams vector is used to control which stream each call is made on
// If 'serial' is non-zero, the device streams are ignored and instead the default stream is used
void launchKernels(perDeviceData &d, char const * const rangeName, bool serial)
{
// Switch to desired device
RUNTIME_API_CALL(cudaSetDevice(d.deviceID));
DRIVER_API_CALL(cuCtxSetCurrent(d.context));
for (unsigned int stream = 0; stream < d.streams.size(); stream++)
{
cudaStream_t streamId = (serial ? 0 : d.streams[stream]);
daxpyKernel <<<threadBlocks, threadsPerBlock, 0, streamId>>> (elements[stream], a, d.d_x[stream], d.d_y[stream]);
RUNTIME_API_CALL(cudaGetLastError());
}
// After launching all work, synchronize all streams
if (serial == false)
{
for (unsigned int stream = 0; stream < d.streams.size(); stream++)
{
RUNTIME_API_CALL(cudaStreamSynchronize(d.streams[stream]));
}
}
else
{
RUNTIME_API_CALL(cudaStreamSynchronize(0));
}
}
int main(int argc, char * argv[])
{
int numDevices;
RUNTIME_API_CALL(cudaGetDeviceCount(&numDevices));
// Per-device information
vector<int> device_ids;
// Find all devices
for (int i = 0; i < numDevices; i++)
{
// Record device number
device_ids.push_back(i);
}
numDevices = device_ids.size();
cout << "Found " << numDevices << " devices" << endl;
// Ensure we found at least one device
if (numDevices == 0)
{
cerr << "No devices detected" << endl;
exit(EXIT_WAIVED);
}
// Initialize kernel input to some known numbers
vector<double> h_x(blockSize * numKernels);
vector<double> h_y(blockSize * numKernels);
for (size_t i = 0; i < blockSize * numKernels; i++)
{
h_x[i] = 1.5 * i;
h_y[i] = 2.0 * (i - 3000);
}
// Initialize a vector of 'default stream' values to demonstrate serialized kernels
vector<cudaStream_t> defaultStreams(numStreams);
for (int stream = 0; stream < numStreams; stream++)
{
defaultStreams[stream] = 0;
}
// Scale per-kernel work by stream number
for (int stream = 0; stream < numStreams; stream++)
{
elements[stream] = blockSize * (stream + 1);
}
// For each device, configure profiling, set up buffers, copy kernel data
vector<perDeviceData> deviceData(numDevices);
for (int device = 0; device < numDevices; device++)
{
RUNTIME_API_CALL(cudaSetDevice(device_ids[device]));
cout << "Configuring device " << device_ids[device] << endl;
// For simplicity's sake, in this sample, a single config struct is created per device
deviceData[device].deviceID = device_ids[device];// GPU device ID
DRIVER_API_CALL(cuCtxCreate(&(deviceData[device].context), 0, device_ids[device])); // Either set to a context, or may be NULL if a default context has been created
// Per-stream initialization & memory allocation - copy from constant host array to each device array
deviceData[device].streams.resize(numStreams);
deviceData[device].d_x.resize(numStreams);
deviceData[device].d_y.resize(numStreams);
for (int stream = 0; stream < numStreams; stream++)
{
RUNTIME_API_CALL(cudaStreamCreate(&(deviceData[device].streams[stream])));
// Each kernel does (stream #) * blockSize work on doubles
size_t size = elements[stream] * sizeof(double);
RUNTIME_API_CALL(cudaMalloc(&(deviceData[device].d_x[stream]), size));
MEMORY_ALLOCATION_CALL(deviceData[device].d_x[stream]); // Validate pointer
RUNTIME_API_CALL(cudaMemcpy(deviceData[device].d_x[stream], h_x.data(), size, cudaMemcpyHostToDevice));
RUNTIME_API_CALL(cudaMalloc(&(deviceData[device].d_y[stream]), size));
MEMORY_ALLOCATION_CALL(deviceData[device].d_y[stream]); // Validate pointer
RUNTIME_API_CALL(cudaMemcpy(deviceData[device].d_y[stream], h_x.data(), size, cudaMemcpyHostToDevice));
}
}
//
// First version - single device, kernel calls serialized on default stream
//
// Use wallclock time to measure performance
auto begin_time = ::std::chrono::high_resolution_clock::now();
// Run on first device and use default streams - will show runtime without any concurrency
launchKernels(deviceData[0], "single_gpu_serial", true);
auto end_time = ::std::chrono::high_resolution_clock::now();
auto elapsed_serial_ms = ::std::chrono::duration_cast<::std::chrono::milliseconds>(end_time - begin_time);
cout << "It took " << elapsed_serial_ms.count() << "ms on the host to launch " << numKernels << " kernels in serial" << endl;
//
// Second version - same kernel calls as before on the same device, but now using separate streams for concurrency
// (Should be limited by the longest running kernel)
//
begin_time = ::std::chrono::high_resolution_clock::now();
// Still only use first device, but this time use its allocated streams for parallelism
launchKernels(deviceData[0], "single_gpu_async", false);
end_time = ::std::chrono::high_resolution_clock::now();
auto elapsed_single_device_ms = ::std::chrono::duration_cast<::std::chrono::milliseconds>(end_time - begin_time);
cout << "It took " << elapsed_single_device_ms.count() << "ms on the host to launch " << numKernels << " kernels on a single device on separate streams" << endl;
//
// Third version - same as the second case, but duplicate the work across devices to show cross-device concurrency
// This is done using threads so no serialization is needed between devices
// (Should have roughly the same runtime as second case)
//
// Time creation of the same multiple streams * multiple devices
vector<::std::thread> threads;
begin_time = ::std::chrono::high_resolution_clock::now();
// Now launch parallel thread work, duplicated on one thread per gpu
for (int device = 0; device < numDevices; device++)
{
threads.push_back(::std::thread(launchKernels, ::std::ref(deviceData[device]), "multi_gpu_async", false));
}
// Wait for all threads to finish
for (auto &t: threads)
{
t.join();
}
// Record time used when launching on multiple devices
end_time = ::std::chrono::high_resolution_clock::now();
auto elapsed_multiple_device_ms = ::std::chrono::duration_cast<::std::chrono::milliseconds>(end_time - begin_time);
cout << "It took " << elapsed_multiple_device_ms.count() << "ms on the host to launch the same " << numKernels << " kernels on each of the " << numDevices << " devices in parallel" << endl;
// Free stream memory for each device
for (int i = 0; i < numDevices; i++)
{
for (int j = 0; j < numKernels; j++)
{
RUNTIME_API_CALL(cudaFree(deviceData[i].d_x[j]));
RUNTIME_API_CALL(cudaFree(deviceData[i].d_y[j]));
}
}
exit(EXIT_SUCCESS);
}