MNghia's picture
Add files using upload-large-folder tool
6be3106 verified
// Copyright 2021 NVIDIA Corporation. All rights reserved
//
// This sample demonstrates a very simple use case of the Checkpoint API -
// An array is saved to device, a checkpoint is saved capturing these initial
// values, the device memory is update with a new value, then restored to
// initial value using the previously saved checkpoint. By validating that
// the device values return the initial values, this demonstrates that the
// checkpoint API worked as expected.
#include <cuda.h>
#include <iostream>
#include <stdlib.h>
using namespace std;
#include <cupti_checkpoint.h>
using namespace NV::Cupti::Checkpoint;
#define CHECKPOINT_API_CALL(apiFuncCall) \
do { \
CUptiResult _status = apiFuncCall; \
if (_status != CUPTI_SUCCESS) { \
const char *errstr; \
cuptiGetResultString(_status, &errstr); \
fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \
__FILE__, __LINE__, #apiFuncCall, errstr); \
exit(EXIT_FAILURE); \
} \
} while (0)
#define RUNTIME_API_CALL(apiFuncCall) \
do { \
cudaError_t _status = apiFuncCall; \
if (_status != cudaSuccess) { \
fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \
__FILE__, __LINE__, #apiFuncCall, cudaGetErrorString(_status));\
exit(EXIT_FAILURE); \
} \
} while (0)
#define MEMORY_ALLOCATION_CALL(var) \
do { \
if (var == NULL) { \
fprintf(stderr, "%s:%d: Error: Memory Allocation Failed \n", \
__FILE__, __LINE__); \
exit(EXIT_FAILURE); \
} \
} while (0)
#define DRIVER_API_CALL(apiFuncCall) \
do { \
CUresult _status = apiFuncCall; \
if (_status != CUDA_SUCCESS) { \
fprintf(stderr, "%s:%d: error: function %s failed with error %d.\n", \
__FILE__, __LINE__, #apiFuncCall, _status); \
exit(EXIT_FAILURE); \
} \
} while (0)
// Basic example of a kernel which may overwrite its own input data
// This is not intended to show how to write a well-designed reduction,
// but to demonstrate that a kernel which modifies its input data can be
// replayed using the checkpoint API and get reproducible results.
//
// Sums n values, returning the total sum in data[0]
__global__ void reduce(float * data, size_t n)
{
float thd_sum = 0.0;
// Each thread sums its elements locally
for (int i = threadIdx.x; i < n; i+= blockDim.x)
{
thd_sum += data[i];
}
// And saves the per-thread sum back to the thread's first element
data[threadIdx.x] = thd_sum;
__syncthreads();
// Then, thread 0 reduces those per-thread sums to a single value in data[0]
if (threadIdx.x == 0)
{
float total_sum = 0.0;
size_t set_elems = (blockDim.x < n ? blockDim.x : n);
for (int i = 0; i < set_elems; i++)
{
total_sum += data[i];
}
data[0] = total_sum;
}
}
int main()
{
CUcontext ctx;
// Set up a context for device 0
RUNTIME_API_CALL(cudaSetDevice(0));
DRIVER_API_CALL(cuCtxCreate(&ctx, 0, 0));
// Allocate host and device arrays and initialize to known values
float * d_A;
size_t el_A = 1024 * 1024;
size_t sz_A = el_A * sizeof(float);
RUNTIME_API_CALL(cudaMalloc(&d_A, sz_A));
MEMORY_ALLOCATION_CALL(d_A);
float * h_A = (float *)malloc(sz_A);
MEMORY_ALLOCATION_CALL(h_A);
for (size_t i = 0; i < el_A; i++)
{
h_A[i] = 1.0;
}
RUNTIME_API_CALL(cudaMemcpy(d_A, h_A, sz_A, cudaMemcpyHostToDevice));
cout << "Initially, d_A[0] = " << h_A[0] << endl;
// Demonstrate a case where calling a kernel repeatedly may cause incorrect
// behavior due to internally modifying its input data
cout << "Without checkpoint:" << endl;
for (int repeat = 0; repeat < 3; repeat++)
{
reduce<<<1, 64>>>(d_A, el_A);
// Test return value - should change each iteration due to not resetting input array
float ret;
RUNTIME_API_CALL(cudaMemcpy(&ret, d_A, sizeof(float), cudaMemcpyDeviceToHost));
cout << "After " << (repeat + 1) << " iteration" << (repeat > 0 ? "s" : "") << ", d_A[0] = " << ret << endl;
}
// Re-initialize input array
RUNTIME_API_CALL(cudaMemcpy(d_A, h_A, sz_A, cudaMemcpyHostToDevice));
cout << "Reset device array - d_A[0] = " << h_A[0] << endl;
// Configure a checkpoint object
CUpti_Checkpoint cp = { CUpti_Checkpoint_STRUCT_SIZE };
cp.ctx = ctx;
cp.optimizations = 1;
float expected;
cout << "With checkpoint:" << endl;
for (int repeat = 0; repeat < 3; repeat++)
{
// Save or restore the checkpoint as needed
if (repeat == 0)
{
CHECKPOINT_API_CALL(cuptiCheckpointSave(&cp));
}
else
{
CHECKPOINT_API_CALL(cuptiCheckpointRestore(&cp));
}
// Call reduction kernel that modifies its own input
reduce<<<1, 64>>>(d_A, el_A);
// Check the output value (d_A[0])
float ret;
RUNTIME_API_CALL(cudaMemcpy(&ret, d_A, sizeof(float), cudaMemcpyDeviceToHost));
// The first call to the kernel produces the expected result - with checkpoint, every subsequent call should also return this
if (repeat == 0)
{
expected = ret;
}
cout << "After " << (repeat + 1) << " iteration" << (repeat > 0 ? "s" : "") << ", d_A[0] = " << ret << endl;
// Verify that this iteration's output value matches the expected value from the first iteration
if (ret != expected)
{
cerr << "Error - repeat " << repeat << " did not match expected value (" << ret << " != " << expected << "), did checkpoint not restore input data correctly?" << endl;
CHECKPOINT_API_CALL(cuptiCheckpointFree(&cp));
exit(EXIT_FAILURE);
}
}
// Clean up
CHECKPOINT_API_CALL(cuptiCheckpointFree(&cp));
exit(EXIT_SUCCESS);
}