thrust / dependencies /cub /examples /block /example_block_reduce.cu

thanks to nvidia ❤

8ae5fc5 over 2 years ago

9.6 kB

	/******************************************************************************
	* Copyright (c) 2011, Duane Merrill. All rights reserved.
	* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions are met:
	* * Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* * Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	* * Neither the name of the NVIDIA CORPORATION nor the
	* names of its contributors may be used to endorse or promote products
	* derived from this software without specific prior written permission.
	*
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
	* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
	* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
	* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
	* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
	* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*
	******************************************************************************/

	/******************************************************************************
	* Simple demonstration of cub::BlockReduce
	*
	* To compile using the command line:
	* nvcc -arch=sm_XX example_block_reduce.cu -I../.. -lcudart -O3
	*
	******************************************************************************/

	// Ensure printing of CUDA runtime errors to console (define before including cub.h)
	#define CUB_STDERR

	#include <stdio.h>
	#include <iostream>

	#include <cub/block/block_load.cuh>
	#include <cub/block/block_store.cuh>
	#include <cub/block/block_reduce.cuh>

	#include "../../test/test_util.h"

	using namespace cub;

	//---------------------------------------------------------------------
	// Globals, constants and typedefs
	//---------------------------------------------------------------------

	/// Verbose output: when true, Test() prints the input tile and forwards
	/// the flag to the device-result comparison (set by the --v flag in main)
	bool g_verbose = false;

	/// Timing iterations: number of timed kernel launches that Test() averages
	/// over (overridable with --i in main)
	int g_timing_iterations = 100;

	/// Default grid size: number of thread blocks used for every kernel launch
	/// in Test() (overridable with --grid-size in main)
	int g_grid_size = 1;



	//---------------------------------------------------------------------
	// Kernels
	//---------------------------------------------------------------------

	/**
	 * Kernel that sums one tile of integers with cub::BlockReduce and records
	 * how many clock() ticks the reduction took.
	 *
	 * The tile is addressed with threadIdx.x only (blockIdx is never used), so
	 * every block of the grid reduces the same tile and thread 0 of each block
	 * writes to the same two output locations.
	 *
	 * Template parameters:
	 *   BLOCK_THREADS     Number of threads in the (1D) thread block
	 *   ITEMS_PER_THREAD  Number of input items owned by each thread
	 *   ALGORITHM         cub::BlockReduce algorithm variant
	 *
	 * Parameters:
	 *   d_in       Tile of input (Test() sizes it as BLOCK_THREADS * ITEMS_PER_THREAD ints)
	 *   d_out      Receives the tile aggregate
	 *   d_elapsed  Receives the elapsed cycle count of the block reduction
	 */
	template <
	    int BLOCK_THREADS,
	    int ITEMS_PER_THREAD,
	    BlockReduceAlgorithm ALGORITHM>
	__global__ void BlockReduceKernel(
	    int *d_in,
	    int *d_out,
	    clock_t *d_elapsed)
	{
	    // BlockReduce specialization matching this launch configuration
	    typedef BlockReduce<int, BLOCK_THREADS, ALGORITHM> BlockReduceT;

	    // Scratch space required by the collective
	    __shared__ typename BlockReduceT::TempStorage temp_storage;

	    // Each thread pulls its ITEMS_PER_THREAD items out of the tile
	    int items[ITEMS_PER_THREAD];
	    LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in, items);

	    // Time only the reduction itself: the load above is deliberately excluded
	    const clock_t t_begin = clock();
	    const int block_sum = BlockReduceT(temp_storage).Sum(items);
	    const clock_t t_end = clock();

	    // Only thread 0 publishes results
	    // NOTE(review): per the CUB documentation the aggregate is only defined
	    // in thread 0 - confirm against the BlockReduce::Sum reference
	    if (threadIdx.x != 0)
	    {
	        return;
	    }

	    // Take the absolute difference so a counter wrap-around between the two
	    // samples still yields a non-negative tick count
	    clock_t ticks;
	    if (t_begin > t_end)
	    {
	        ticks = t_begin - t_end;
	    }
	    else
	    {
	        ticks = t_end - t_begin;
	    }

	    *d_elapsed = ticks;
	    *d_out = block_sum;
	}



	//---------------------------------------------------------------------
	// Host utilities
	//---------------------------------------------------------------------

	/**
	 * Populate the host input tile and compute the reference answer.
	 *
	 * Element i is set to (i % 17), so the tile holds the repeating
	 * pattern 0, 1, ..., 16.
	 *
	 * @param h_in       Host array that receives num_items values
	 * @param num_items  Number of elements to generate
	 * @return           Sum of every value written to h_in (the expected aggregate)
	 */
	int Initialize(int *h_in, int num_items)
	{
	    int total = 0;
	    int idx = 0;

	    while (idx < num_items)
	    {
	        const int value = idx % 17;
	        h_in[idx] = value;
	        total += value;
	        ++idx;
	    }

	    return total;
	}


	/**
	 * Test thread block reduction.
	 *
	 * Builds a tile of BLOCK_THREADS * ITEMS_PER_THREAD integers on the host,
	 * reduces it on the device with BlockReduceKernel, verifies the aggregate
	 * against the host reference, and then reports cycle counts and wall-clock
	 * timings averaged over g_timing_iterations launches.
	 *
	 * Every CUDA runtime call is wrapped in CubDebugExit so that a failed
	 * allocation or copy terminates the program instead of silently feeding
	 * bad pointers to the kernel.
	 *
	 * Template parameters:
	 *   BLOCK_THREADS     Threads per block used for each launch
	 *   ITEMS_PER_THREAD  Items reduced by each thread
	 *   ALGORITHM         cub::BlockReduce algorithm variant to exercise
	 */
	template <
	    int BLOCK_THREADS,
	    int ITEMS_PER_THREAD,
	    BlockReduceAlgorithm ALGORITHM>
	void Test()
	{
	    const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;

	    // Allocate host input array
	    int *h_in = new int[TILE_SIZE];

	    // Initialize problem and reference output on host
	    int h_aggregate = Initialize(h_in, TILE_SIZE);

	    // Allocate device arrays (cudaMalloc takes the address of the pointer,
	    // i.e. a void**, and a size in bytes)
	    int *d_in = NULL;
	    int *d_out = NULL;
	    clock_t *d_elapsed = NULL;
	    CubDebugExit(cudaMalloc((void**)&d_in, sizeof(int) * TILE_SIZE));
	    CubDebugExit(cudaMalloc((void**)&d_out, sizeof(int) * 1));
	    CubDebugExit(cudaMalloc((void**)&d_elapsed, sizeof(clock_t)));

	    // Display input problem data
	    if (g_verbose)
	    {
	        printf("Input data: ");
	        for (int i = 0; i < TILE_SIZE; i++)
	            printf("%d, ", h_in[i]);
	        printf("\n\n");
	    }

	    // Kernel props
	    int max_sm_occupancy;
	    CubDebugExit(MaxSmOccupancy(max_sm_occupancy, BlockReduceKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM>, BLOCK_THREADS));

	    // Copy problem to device
	    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice));

	    printf("BlockReduce algorithm %s on %d items (%d timing iterations, %d blocks, %d threads, %d items per thread, %d SM occupancy):\n",
	        (ALGORITHM == BLOCK_REDUCE_RAKING) ? "BLOCK_REDUCE_RAKING" : "BLOCK_REDUCE_WARP_REDUCTIONS",
	        TILE_SIZE, g_timing_iterations, g_grid_size, BLOCK_THREADS, ITEMS_PER_THREAD, max_sm_occupancy);

	    // Run kernel once for the correctness check
	    BlockReduceKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM><<<g_grid_size, BLOCK_THREADS>>>(
	        d_in,
	        d_out,
	        d_elapsed);

	    // A kernel launch returns no status; surface launch-configuration errors
	    // here rather than as a confusing result mismatch below
	    CubDebugExit(cudaPeekAtLastError());

	    // Check total aggregate
	    printf("\tAggregate: ");
	    int compare = CompareDeviceResults(&h_aggregate, d_out, 1, g_verbose, g_verbose);
	    printf("%s\n", compare ? "FAIL" : "PASS");
	    AssertEquals(0, compare);

	    // Run this several times and average the performance results
	    GpuTimer timer;
	    float elapsed_millis = 0.0f;
	    clock_t elapsed_clocks = 0;

	    for (int i = 0; i < g_timing_iterations; ++i)
	    {
	        // Copy problem to device
	        CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice));

	        timer.Start();

	        // Run kernel
	        BlockReduceKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM><<<g_grid_size, BLOCK_THREADS>>>(
	            d_in,
	            d_out,
	            d_elapsed);

	        timer.Stop();
	        elapsed_millis += timer.ElapsedMillis();

	        // Copy clocks from device
	        clock_t clocks;
	        CubDebugExit(cudaMemcpy(&clocks, d_elapsed, sizeof(clock_t), cudaMemcpyDeviceToHost));
	        elapsed_clocks += clocks;
	    }

	    // Check for kernel errors and STDIO from the kernel, if any
	    CubDebugExit(cudaPeekAtLastError());
	    CubDebugExit(cudaDeviceSynchronize());

	    // Display timing results. The averages divide by the iteration count,
	    // so they are only meaningful (and only printed) when at least one timed
	    // iteration ran; --i=0 would otherwise print NaN values.
	    if (g_timing_iterations > 0)
	    {
	        float avg_millis = elapsed_millis / g_timing_iterations;
	        float avg_items_per_sec = float(TILE_SIZE * g_grid_size) / avg_millis / 1000.0f;
	        float avg_clocks = float(elapsed_clocks) / g_timing_iterations;
	        float avg_clocks_per_item = avg_clocks / TILE_SIZE;

	        printf("\tAverage BlockReduce::Sum clocks: %.3f\n", avg_clocks);
	        printf("\tAverage BlockReduce::Sum clocks per item: %.3f\n", avg_clocks_per_item);
	        printf("\tAverage kernel millis: %.4f\n", avg_millis);
	        printf("\tAverage million items / sec: %.4f\n", avg_items_per_sec);
	    }

	    // Cleanup
	    delete[] h_in;
	    if (d_in) CubDebugExit(cudaFree(d_in));
	    if (d_out) CubDebugExit(cudaFree(d_out));
	    if (d_elapsed) CubDebugExit(cudaFree(d_elapsed));
	}


	/**
	 * Program entry point.
	 *
	 * Reads the command-line options (--v, --i, --grid-size, --help,
	 * --device), initializes the CUDA device, and then runs Test() for each
	 * block-size / items-per-thread combination, first with
	 * BLOCK_REDUCE_RAKING and then with BLOCK_REDUCE_WARP_REDUCTIONS.
	 *
	 * Returns 0 after all tests have run; exits early with status 0 when
	 * --help is given.
	 */
	int main(int argc, char** argv)
	{
	    // Parse options into the file-level settings
	    CommandLineArgs args(argc, argv);
	    g_verbose = args.CheckCmdLineFlag("v");
	    args.GetCmdLineArgument("i", g_timing_iterations);
	    args.GetCmdLineArgument("grid-size", g_grid_size);

	    // Usage request: print the synopsis and stop before touching the device
	    const bool show_usage = args.CheckCmdLineFlag("help");
	    if (show_usage)
	    {
	        printf(
	            "%s [--device=<device-id>] [--i=<timing iterations>] "
	            "[--grid-size=<grid size>] [--v] \n",
	            argv[0]);
	        exit(0);
	    }

	    // Select and initialize the device
	    CubDebugExit(args.DeviceInit());

	    // Raking variant: every configuration covers a 1024-item tile
	    Test<1024, 1, BLOCK_REDUCE_RAKING>();
	    Test<512, 2, BLOCK_REDUCE_RAKING>();
	    Test<256, 4, BLOCK_REDUCE_RAKING>();
	    Test<128, 8, BLOCK_REDUCE_RAKING>();
	    Test<64, 16, BLOCK_REDUCE_RAKING>();
	    Test<32, 32, BLOCK_REDUCE_RAKING>();
	    Test<16, 64, BLOCK_REDUCE_RAKING>();

	    printf("-------------\n");

	    // Warp-reductions variant over the same configurations
	    Test<1024, 1, BLOCK_REDUCE_WARP_REDUCTIONS>();
	    Test<512, 2, BLOCK_REDUCE_WARP_REDUCTIONS>();
	    Test<256, 4, BLOCK_REDUCE_WARP_REDUCTIONS>();
	    Test<128, 8, BLOCK_REDUCE_WARP_REDUCTIONS>();
	    Test<64, 16, BLOCK_REDUCE_WARP_REDUCTIONS>();
	    Test<32, 32, BLOCK_REDUCE_WARP_REDUCTIONS>();
	    Test<16, 64, BLOCK_REDUCE_WARP_REDUCTIONS>();

	    return 0;
	}