/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Simple demonstration of cub::BlockReduce with dynamic shared memory * * To compile using the command line: * nvcc -arch=sm_XX example_block_reduce_dyn_smem.cu -I../.. -lcudart -O3 -std=c++14 * ******************************************************************************/ // Ensure printing of CUDA runtime errors to console (define before including cub.h) #define CUB_STDERR #include #include #include #include #include #include #include "../../test/test_util.h" // Some implementation details rely on c++14 #if CUB_CPP_DIALECT >= 2014 using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- /// Verbose output bool g_verbose = false; /// Default grid size int g_grid_size = 1; //--------------------------------------------------------------------- // Kernels //--------------------------------------------------------------------- /** * Simple kernel for performing a block-wide reduction. */ template __global__ void BlockReduceKernel( int *d_in, // Tile of input int *d_out // Tile aggregate ) { // Specialize BlockReduce type for our thread block using BlockReduceT = cub::BlockReduce; using TempStorageT = typename BlockReduceT::TempStorage; union ShmemLayout { TempStorageT reduce; int aggregate; }; // shared memory byte-array extern __shared__ __align__(alignof(ShmemLayout)) char smem[]; // cast to lvalue reference of expected type auto& temp_storage = reinterpret_cast(smem); int data = d_in[threadIdx.x]; // Compute sum int aggregate = BlockReduceT(temp_storage).Sum(data); // block-wide sync barrier necessary to re-use shared mem safely __syncthreads(); int* smem_integers = reinterpret_cast(smem); if (threadIdx.x == 0) smem_integers[0] = aggregate; // sync to make new shared value available to all threads __syncthreads(); aggregate = smem_integers[0]; // all threads write the aggregate to output d_out[threadIdx.x] = aggregate; } //--------------------------------------------------------------------- // Host utilities //--------------------------------------------------------------------- /** * Initialize reduction problem (and solution). * Returns the aggregate */ int Initialize(int *h_in, int num_items) { int inclusive = 0; for (int i = 0; i < num_items; ++i) { h_in[i] = i % 17; inclusive += h_in[i]; } return inclusive; } /** * Test thread block reduction */ template void Test() { // Allocate host arrays int *h_in = new int[BLOCK_THREADS]; // Initialize problem and reference output on host int h_aggregate = Initialize(h_in, BLOCK_THREADS); // Initialize device arrays int *d_in = NULL; int *d_out = NULL; cudaMalloc((void**)&d_in, sizeof(int) * BLOCK_THREADS); cudaMalloc((void**)&d_out, sizeof(int) * BLOCK_THREADS); // Display input problem data if (g_verbose) { printf("Input data: "); for (int i = 0; i < BLOCK_THREADS; i++) printf("%d, ", h_in[i]); printf("\n\n"); } // Copy problem to device cudaMemcpy(d_in, h_in, sizeof(int) * BLOCK_THREADS, cudaMemcpyHostToDevice); // determine necessary storage size: auto block_reduce_temp_bytes = sizeof(typename cub::BlockReduce::TempStorage); // finally, we need to make sure that we can hold at least one integer // needed in the kernel to exchange data after reduction auto smem_size = (std::max)(1 * sizeof(int), block_reduce_temp_bytes); // use default stream cudaStream_t stream = NULL; // Run reduction kernel BlockReduceKernel <<>>( d_in, d_out); // Check total aggregate printf("\tAggregate: "); int compare = 0; for (int i = 0; i < BLOCK_THREADS; i++) { compare = compare || CompareDeviceResults( &h_aggregate, d_out + i, 1, g_verbose, g_verbose); } printf("%s\n", compare ? "FAIL" : "PASS"); AssertEquals(0, compare); // Check for kernel errors and STDIO from the kernel, if any CubDebugExit(cudaPeekAtLastError()); CubDebugExit(cudaDeviceSynchronize()); // Cleanup if (h_in) delete[] h_in; if (d_in) cudaFree(d_in); if (d_out) cudaFree(d_out); } /** * Main */ int main(int argc, char** argv) { // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); args.GetCmdLineArgument("grid-size", g_grid_size); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--device=] " "[--grid-size=] " "[--v] " "\n", argv[0]); exit(0); } // Initialize device CubDebugExit(args.DeviceInit()); // Run tests Test<1024>(); Test<512>(); Test<256>(); Test<128>(); Test<64>(); Test<32>(); Test<16>(); return 0; } #else // < C++14 int main() {} #endif