Add files using upload-large-folder tool

6be3106 verified 6 months ago

7.43 kB

	// Copyright 2021 NVIDIA Corporation. All rights reserved
	//
	// This sample demonstrates a very simple use case of the Checkpoint API -
	// An array is saved to device, a checkpoint is saved capturing these initial
	// values, the device memory is update with a new value, then restored to
	// initial value using the previously saved checkpoint. By validating that
	// the device values return the initial values, this demonstrates that the
	// checkpoint API worked as expected.

	#include <cuda.h>
	#include <iostream>
	#include <stdlib.h>
	using namespace std;

	#include <cupti_checkpoint.h>
	using namespace NV::Cupti::Checkpoint;

	#define CHECKPOINT_API_CALL(apiFuncCall) \
	do { \
	CUptiResult _status = apiFuncCall; \
	if (_status != CUPTI_SUCCESS) { \
	const char *errstr; \
	cuptiGetResultString(_status, &errstr); \
	fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \
	__FILE__, __LINE__, #apiFuncCall, errstr); \
	exit(EXIT_FAILURE); \
	} \
	} while (0)

	#define RUNTIME_API_CALL(apiFuncCall) \
	do { \
	cudaError_t _status = apiFuncCall; \
	if (_status != cudaSuccess) { \
	fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \
	__FILE__, __LINE__, #apiFuncCall, cudaGetErrorString(_status));\
	exit(EXIT_FAILURE); \
	} \
	} while (0)

	#define MEMORY_ALLOCATION_CALL(var) \
	do { \
	if (var == NULL) { \
	fprintf(stderr, "%s:%d: Error: Memory Allocation Failed \n", \
	__FILE__, __LINE__); \
	exit(EXIT_FAILURE); \
	} \
	} while (0)

	#define DRIVER_API_CALL(apiFuncCall) \
	do { \
	CUresult _status = apiFuncCall; \
	if (_status != CUDA_SUCCESS) { \
	fprintf(stderr, "%s:%d: error: function %s failed with error %d.\n", \
	__FILE__, __LINE__, #apiFuncCall, _status); \
	exit(EXIT_FAILURE); \
	} \
	} while (0)

	// Basic example of a kernel which may overwrite its own input data
	// This is not intended to show how to write a well-designed reduction,
	// but to demonstrate that a kernel which modifies its input data can be
	// replayed using the checkpoint API and get reproducible results.
	//
	// Sums n values, returning the total sum in data[0]
	__global__ void reduce(float * data, size_t n)
	{
	float thd_sum = 0.0;

	// Each thread sums its elements locally
	for (int i = threadIdx.x; i < n; i+= blockDim.x)
	{
	thd_sum += data[i];
	}

	// And saves the per-thread sum back to the thread's first element
	data[threadIdx.x] = thd_sum;

	__syncthreads();

	// Then, thread 0 reduces those per-thread sums to a single value in data[0]
	if (threadIdx.x == 0)
	{
	float total_sum = 0.0;

	size_t set_elems = (blockDim.x < n ? blockDim.x : n);

	for (int i = 0; i < set_elems; i++)
	{
	total_sum += data[i];
	}

	data[0] = total_sum;
	}
	}

	int main()
	{
	CUcontext ctx;

	// Set up a context for device 0
	RUNTIME_API_CALL(cudaSetDevice(0));
	DRIVER_API_CALL(cuCtxCreate(&ctx, 0, 0));

	// Allocate host and device arrays and initialize to known values
	float * d_A;
	size_t el_A = 1024 * 1024;
	size_t sz_A = el_A * sizeof(float);
	RUNTIME_API_CALL(cudaMalloc(&d_A, sz_A));
	MEMORY_ALLOCATION_CALL(d_A);
	float * h_A = (float *)malloc(sz_A);
	MEMORY_ALLOCATION_CALL(h_A);
	for (size_t i = 0; i < el_A; i++)
	{
	h_A[i] = 1.0;
	}
	RUNTIME_API_CALL(cudaMemcpy(d_A, h_A, sz_A, cudaMemcpyHostToDevice));

	cout << "Initially, d_A[0] = " << h_A[0] << endl;

	// Demonstrate a case where calling a kernel repeatedly may cause incorrect
	// behavior due to internally modifying its input data
	cout << "Without checkpoint:" << endl;
	for (int repeat = 0; repeat < 3; repeat++)
	{
	reduce<<<1, 64>>>(d_A, el_A);

	// Test return value - should change each iteration due to not resetting input array
	float ret;
	RUNTIME_API_CALL(cudaMemcpy(&ret, d_A, sizeof(float), cudaMemcpyDeviceToHost));
	cout << "After " << (repeat + 1) << " iteration" << (repeat > 0 ? "s" : "") << ", d_A[0] = " << ret << endl;
	}

	// Re-initialize input array
	RUNTIME_API_CALL(cudaMemcpy(d_A, h_A, sz_A, cudaMemcpyHostToDevice));
	cout << "Reset device array - d_A[0] = " << h_A[0] << endl;

	// Configure a checkpoint object
	CUpti_Checkpoint cp = { CUpti_Checkpoint_STRUCT_SIZE };
	cp.ctx = ctx;
	cp.optimizations = 1;

	float expected;

	cout << "With checkpoint:" << endl;
	for (int repeat = 0; repeat < 3; repeat++)
	{
	// Save or restore the checkpoint as needed
	if (repeat == 0)
	{
	CHECKPOINT_API_CALL(cuptiCheckpointSave(&cp));
	}
	else
	{
	CHECKPOINT_API_CALL(cuptiCheckpointRestore(&cp));
	}

	// Call reduction kernel that modifies its own input
	reduce<<<1, 64>>>(d_A, el_A);

	// Check the output value (d_A[0])
	float ret;
	RUNTIME_API_CALL(cudaMemcpy(&ret, d_A, sizeof(float), cudaMemcpyDeviceToHost));

	// The first call to the kernel produces the expected result - with checkpoint, every subsequent call should also return this
	if (repeat == 0)
	{
	expected = ret;
	}

	cout << "After " << (repeat + 1) << " iteration" << (repeat > 0 ? "s" : "") << ", d_A[0] = " << ret << endl;

	// Verify that this iteration's output value matches the expected value from the first iteration
	if (ret != expected)
	{
	cerr << "Error - repeat " << repeat << " did not match expected value (" << ret << " != " << expected << "), did checkpoint not restore input data correctly?" << endl;
	CHECKPOINT_API_CALL(cuptiCheckpointFree(&cp));
	exit(EXIT_FAILURE);
	}
	}

	// Clean up
	CHECKPOINT_API_CALL(cuptiCheckpointFree(&cp));

	exit(EXIT_SUCCESS);
	}