Spaces:
Runtime error
Runtime error
| /****************************************************************************** | |
| * Copyright (c) 2011, Duane Merrill. All rights reserved. | |
| * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. | |
| * | |
| * Redistribution and use in source and binary forms, with or without | |
| * modification, are permitted provided that the following conditions are met: | |
| * * Redistributions of source code must retain the above copyright | |
| * notice, this list of conditions and the following disclaimer. | |
| * * Redistributions in binary form must reproduce the above copyright | |
| * notice, this list of conditions and the following disclaimer in the | |
| * documentation and/or other materials provided with the distribution. | |
| * * Neither the name of the NVIDIA CORPORATION nor the | |
| * names of its contributors may be used to endorse or promote products | |
| * derived from this software without specific prior written permission. | |
| * | |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND | |
| * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
| * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
| * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY | |
| * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES | |
| * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |
| * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND | |
| * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | |
| * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
| * | |
| ******************************************************************************/ | |
| /****************************************************************************** | |
| * Test of BlockReduce utilities | |
| ******************************************************************************/ | |
| // Ensure printing of CUDA runtime errors to console | |
| #define CUB_STDERR | |
| #include <stdio.h> | |
| #include <cuda_runtime_api.h> | |
| #include <typeinfo> | |
| #include <cub/block/block_reduce.cuh> | |
| #include <cub/block/block_load.cuh> | |
| #include <cub/util_ptx.cuh> | |
| #include <cub/util_allocator.cuh> | |
| #include <cub/util_debug.cuh> | |
| #include "test_util.h" | |
| using namespace cub; | |
| //--------------------------------------------------------------------- | |
| // Globals, constants and typedefs | |
| //--------------------------------------------------------------------- | |
| bool g_verbose = false; | |
| int g_repeat = 0; | |
| CachingDeviceAllocator g_allocator(true); | |
| //--------------------------------------------------------------------- | |
| // Test kernels | |
| //--------------------------------------------------------------------- | |
| /// Generic reduction (full, 1) | |
| template <typename BlockReduceT, typename T, typename ReductionOp> | |
| __device__ __forceinline__ T DeviceTest( | |
| BlockReduceT &block_reduce, T (&data)[1], ReductionOp &reduction_op) | |
| { | |
| return block_reduce.Reduce(data[0], reduction_op); | |
| } | |
| /// Generic reduction (full, ITEMS_PER_THREAD) | |
| template <typename BlockReduceT, typename T, int ITEMS_PER_THREAD, typename ReductionOp> | |
| __device__ __forceinline__ T DeviceTest( | |
| BlockReduceT &block_reduce, T (&data)[ITEMS_PER_THREAD], ReductionOp &reduction_op) | |
| { | |
| return block_reduce.Reduce(data, reduction_op); | |
| } | |
| /// Generic reduction (partial, 1) | |
| template <typename BlockReduceT, typename T, typename ReductionOp> | |
| __device__ __forceinline__ T DeviceTest( | |
| BlockReduceT &block_reduce, T &data, ReductionOp &reduction_op, int valid_threads) | |
| { | |
| return block_reduce.Reduce(data, reduction_op, valid_threads); | |
| } | |
| /// Sum reduction (full, 1) | |
| template <typename BlockReduceT, typename T> | |
| __device__ __forceinline__ T DeviceTest( | |
| BlockReduceT &block_reduce, T (&data)[1], Sum &reduction_op) | |
| { | |
| return block_reduce.Sum(data[0]); | |
| } | |
| /// Sum reduction (full, ITEMS_PER_THREAD) | |
| template <typename BlockReduceT, typename T, int ITEMS_PER_THREAD> | |
| __device__ __forceinline__ T DeviceTest( | |
| BlockReduceT &block_reduce, T (&data)[ITEMS_PER_THREAD], Sum &reduction_op) | |
| { | |
| return block_reduce.Sum(data); | |
| } | |
| /// Sum reduction (partial, 1) | |
| template <typename BlockReduceT, typename T> | |
| __device__ __forceinline__ T DeviceTest( | |
| BlockReduceT &block_reduce, T &data, Sum &reduction_op, int valid_threads) | |
| { | |
| return block_reduce.Sum(data, valid_threads); | |
| } | |
| /** | |
| * Test full-tile reduction kernel (where num_items is an even | |
| * multiple of BLOCK_THREADS) | |
| */ | |
| template < | |
| BlockReduceAlgorithm ALGORITHM, | |
| int BLOCK_DIM_X, | |
| int BLOCK_DIM_Y, | |
| int BLOCK_DIM_Z, | |
| int ITEMS_PER_THREAD, | |
| typename T, | |
| typename ReductionOp> | |
| __launch_bounds__ (BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z) | |
| __global__ void FullTileReduceKernel( | |
| T *d_in, | |
| T *d_out, | |
| ReductionOp reduction_op, | |
| int tiles, | |
| clock_t *d_elapsed) | |
| { | |
| const int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z; | |
| const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD; | |
| // Cooperative thread block reduction utility type (returns aggregate in thread 0) | |
| typedef BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z> BlockReduceT; | |
| // Allocate temp storage in shared memory | |
| __shared__ typename BlockReduceT::TempStorage temp_storage; | |
| int linear_tid = RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z); | |
| // Per-thread tile data | |
| T data[ITEMS_PER_THREAD]; | |
| // Load first tile of data | |
| int block_offset = 0; | |
| if (block_offset < TILE_SIZE * tiles) | |
| { | |
| LoadDirectBlocked(linear_tid, d_in + block_offset, data); | |
| block_offset += TILE_SIZE; | |
| // Start cycle timer | |
| clock_t start = clock(); | |
| // Cooperative reduce first tile | |
| BlockReduceT block_reduce(temp_storage) ; | |
| T block_aggregate = DeviceTest(block_reduce, data, reduction_op); | |
| // Stop cycle timer | |
| #if CUB_PTX_ARCH == 100 | |
| // Bug: recording stop clock causes mis-write of running prefix value | |
| clock_t stop = 0; | |
| #else | |
| clock_t stop = clock(); | |
| #endif // CUB_PTX_ARCH == 100 | |
| clock_t elapsed = (start > stop) ? start - stop : stop - start; | |
| // Loop over input tiles | |
| while (block_offset < TILE_SIZE * tiles) | |
| { | |
| // TestBarrier between thread block reductions | |
| __syncthreads(); | |
| // Load tile of data | |
| LoadDirectBlocked(linear_tid, d_in + block_offset, data); | |
| block_offset += TILE_SIZE; | |
| // Start cycle timer | |
| clock_t start = clock(); | |
| // Cooperatively reduce the tile's aggregate | |
| BlockReduceT block_reduce(temp_storage) ; | |
| T tile_aggregate = DeviceTest(block_reduce, data, reduction_op); | |
| // Stop cycle timer | |
| #if CUB_PTX_ARCH == 100 | |
| // Bug: recording stop clock causes mis-write of running prefix value | |
| clock_t stop = 0; | |
| #else | |
| clock_t stop = clock(); | |
| #endif // CUB_PTX_ARCH == 100 | |
| elapsed += (start > stop) ? start - stop : stop - start; | |
| // Reduce thread block aggregate | |
| block_aggregate = reduction_op(block_aggregate, tile_aggregate); | |
| } | |
| // Store data | |
| if (linear_tid == 0) | |
| { | |
| d_out[0] = block_aggregate; | |
| *d_elapsed = elapsed; | |
| } | |
| } | |
| } | |
| /** | |
| * Test partial-tile reduction kernel (where num_items < BLOCK_THREADS) | |
| */ | |
| template < | |
| BlockReduceAlgorithm ALGORITHM, | |
| int BLOCK_DIM_X, | |
| int BLOCK_DIM_Y, | |
| int BLOCK_DIM_Z, | |
| typename T, | |
| typename ReductionOp> | |
| __launch_bounds__ (BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z) | |
| __global__ void PartialTileReduceKernel( | |
| T *d_in, | |
| T *d_out, | |
| int num_items, | |
| ReductionOp reduction_op, | |
| clock_t *d_elapsed) | |
| { | |
| // Cooperative thread block reduction utility type (returns aggregate only in thread-0) | |
| typedef BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z> BlockReduceT; | |
| // Allocate temp storage in shared memory | |
| __shared__ typename BlockReduceT::TempStorage temp_storage; | |
| int linear_tid = RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z); | |
| // Per-thread tile data | |
| T partial; | |
| // Load partial tile data | |
| if (linear_tid < num_items) | |
| { | |
| partial = d_in[linear_tid]; | |
| } | |
| // Start cycle timer | |
| clock_t start = clock(); | |
| // Cooperatively reduce the tile's aggregate | |
| BlockReduceT block_reduce(temp_storage) ; | |
| T tile_aggregate = DeviceTest(block_reduce, partial, reduction_op, num_items); | |
| // Stop cycle timer | |
| #if CUB_PTX_ARCH == 100 | |
| // Bug: recording stop clock causes mis-write of running prefix value | |
| clock_t stop = 0; | |
| #else | |
| clock_t stop = clock(); | |
| #endif // CUB_PTX_ARCH == 100 | |
| clock_t elapsed = (start > stop) ? start - stop : stop - start; | |
| // Store data | |
| if (linear_tid == 0) | |
| { | |
| d_out[0] = tile_aggregate; | |
| *d_elapsed = elapsed; | |
| } | |
| } | |
| //--------------------------------------------------------------------- | |
| // Host utility subroutines | |
| //--------------------------------------------------------------------- | |
| /** | |
| * Initialize problem (and solution) | |
| */ | |
| template < | |
| typename T, | |
| typename ReductionOp> | |
| void Initialize( | |
| GenMode gen_mode, | |
| T *h_in, | |
| T h_reference[1], | |
| ReductionOp reduction_op, | |
| int num_items) | |
| { | |
| for (int i = 0; i < num_items; ++i) | |
| { | |
| InitValue(gen_mode, h_in[i], i); | |
| if (i == 0) | |
| h_reference[0] = h_in[0]; | |
| else | |
| h_reference[0] = reduction_op(h_reference[0], h_in[i]); | |
| } | |
| if (g_verbose) | |
| { | |
| printf("Input:\n"); | |
| DisplayResults(h_in, num_items); | |
| printf("\n"); | |
| } | |
| } | |
| //--------------------------------------------------------------------- | |
| // Full tile test generation | |
| //--------------------------------------------------------------------- | |
| /** | |
| * Test full-tile reduction. (Specialized for sufficient resources) | |
| */ | |
| template < | |
| BlockReduceAlgorithm ALGORITHM, | |
| int BLOCK_DIM_X, | |
| int BLOCK_DIM_Y, | |
| int BLOCK_DIM_Z, | |
| int ITEMS_PER_THREAD, | |
| typename T, | |
| typename ReductionOp> | |
| void TestFullTile( | |
| GenMode gen_mode, | |
| int tiles, | |
| ReductionOp reduction_op, | |
| Int2Type<true> /*sufficient_resources*/) | |
| { | |
| const int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z; | |
| const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD; | |
| int num_items = TILE_SIZE * tiles; | |
| // Allocate host arrays | |
| T *h_in = new T[num_items]; | |
| T h_reference[1]; | |
| // Initialize problem | |
| Initialize(gen_mode, h_in, h_reference, reduction_op, num_items); | |
| // Initialize/clear device arrays | |
| T *d_in = NULL; | |
| T *d_out = NULL; | |
| clock_t *d_elapsed = NULL; | |
| CubDebugExit(g_allocator.DeviceAllocate((void**)&d_elapsed, sizeof(unsigned long long))); | |
| CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * num_items)); | |
| CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * 1)); | |
| CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * num_items, cudaMemcpyHostToDevice)); | |
| CubDebugExit(cudaMemset(d_out, 0, sizeof(T) * 1)); | |
| // Test multi-tile (unguarded) | |
| printf("TestFullTile %s, %s, gen-mode %d, num_items(%d), BLOCK_THREADS(%d) (%d,%d,%d), ITEMS_PER_THREAD(%d), tiles(%d), %s (%d bytes) elements:\n", | |
| Equals<ReductionOp, Sum>::VALUE ? "Sum" : "Max", | |
| (ALGORITHM == BLOCK_REDUCE_RAKING) ? "BLOCK_REDUCE_RAKING" : (ALGORITHM == BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY) ? "BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY" : "BLOCK_REDUCE_WARP_REDUCTIONS", | |
| gen_mode, | |
| num_items, | |
| BLOCK_THREADS, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, | |
| ITEMS_PER_THREAD, | |
| tiles, | |
| typeid(T).name(), | |
| (int) sizeof(T)); | |
| fflush(stdout); | |
| dim3 block_dims(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z); | |
| FullTileReduceKernel<ALGORITHM, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, ITEMS_PER_THREAD><<<1, block_dims>>>( | |
| d_in, | |
| d_out, | |
| reduction_op, | |
| tiles, | |
| d_elapsed); | |
| CubDebugExit(cudaPeekAtLastError()); | |
| CubDebugExit(cudaDeviceSynchronize()); | |
| // Copy out and display results | |
| printf("\tReduction results: "); | |
| int compare = CompareDeviceResults(h_reference, d_out, 1, g_verbose, g_verbose); | |
| printf("%s\n", compare ? "FAIL" : "PASS"); | |
| AssertEquals(0, compare); | |
| printf("\tElapsed clocks: "); | |
| DisplayDeviceResults(d_elapsed, 1); | |
| // Cleanup | |
| if (h_in) delete[] h_in; | |
| if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in)); | |
| if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out)); | |
| if (d_elapsed) CubDebugExit(g_allocator.DeviceFree(d_elapsed)); | |
| } | |
| /** | |
| * Test full-tile reduction. (Specialized for insufficient resources) | |
| */ | |
| template < | |
| BlockReduceAlgorithm ALGORITHM, | |
| int BLOCK_DIM_X, | |
| int BLOCK_DIM_Y, | |
| int BLOCK_DIM_Z, | |
| int ITEMS_PER_THREAD, | |
| typename T, | |
| typename ReductionOp> | |
| void TestFullTile( | |
| GenMode gen_mode, | |
| int tiles, | |
| ReductionOp reduction_op, | |
| Int2Type<false> sufficient_resources) | |
| {} | |
| /** | |
| * Test full-tile reduction. | |
| */ | |
| template < | |
| BlockReduceAlgorithm ALGORITHM, | |
| int BLOCK_DIM_X, | |
| int BLOCK_DIM_Y, | |
| int BLOCK_DIM_Z, | |
| int ITEMS_PER_THREAD, | |
| typename T, | |
| typename ReductionOp> | |
| void TestFullTile( | |
| GenMode gen_mode, | |
| int tiles, | |
| ReductionOp reduction_op) | |
| { | |
| // Check size of smem storage for the target arch to make sure it will fit | |
| typedef BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, TEST_ARCH> BlockReduceT; | |
| enum | |
| { | |
| #if defined(SM100) || defined(SM110) || defined(SM130) | |
| sufficient_smem = (sizeof(typename BlockReduceT::TempStorage) <= 16 * 1024), | |
| sufficient_threads = ((BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z) <= 512), | |
| #else | |
| sufficient_smem = (sizeof(typename BlockReduceT::TempStorage) <= 48 * 1024), | |
| sufficient_threads = ((BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z) <= 1024), | |
| #endif | |
| }; | |
| TestFullTile<ALGORITHM, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, ITEMS_PER_THREAD, T>(gen_mode, tiles, reduction_op, Int2Type<sufficient_smem && sufficient_threads>()); | |
| } | |
| /** | |
| * Run battery of tests for different thread block dimensions | |
| */ | |
| template < | |
| BlockReduceAlgorithm ALGORITHM, | |
| int BLOCK_THREADS, | |
| int ITEMS_PER_THREAD, | |
| typename T, | |
| typename ReductionOp> | |
| void TestFullTile( | |
| GenMode gen_mode, | |
| int tiles, | |
| ReductionOp reduction_op) | |
| { | |
| TestFullTile<ALGORITHM, BLOCK_THREADS, 1, 1, ITEMS_PER_THREAD, T>(gen_mode, tiles, reduction_op); | |
| TestFullTile<ALGORITHM, BLOCK_THREADS, 2, 2, ITEMS_PER_THREAD, T>(gen_mode, tiles, reduction_op); | |
| } | |
| /** | |
| * Run battery of tests for different thread items | |
| */ | |
| template < | |
| BlockReduceAlgorithm ALGORITHM, | |
| int BLOCK_THREADS, | |
| typename T, | |
| typename ReductionOp> | |
| void TestFullTile( | |
| GenMode gen_mode, | |
| int tiles, | |
| ReductionOp reduction_op) | |
| { | |
| TestFullTile<ALGORITHM, BLOCK_THREADS, 1, T>(gen_mode, tiles, reduction_op); | |
| TestFullTile<ALGORITHM, BLOCK_THREADS, 4, T>(gen_mode, tiles, reduction_op); | |
| } | |
| /** | |
| * Run battery of full-tile tests for different numbers of tiles | |
| */ | |
| template < | |
| BlockReduceAlgorithm ALGORITHM, | |
| int BLOCK_THREADS, | |
| typename T, | |
| typename ReductionOp> | |
| void TestFullTile( | |
| GenMode gen_mode, | |
| ReductionOp reduction_op) | |
| { | |
| for (int tiles = 1; tiles < 3; tiles++) | |
| { | |
| TestFullTile<ALGORITHM, BLOCK_THREADS, T>(gen_mode, tiles, reduction_op); | |
| } | |
| } | |
| //--------------------------------------------------------------------- | |
| // Partial-tile test generation | |
| //--------------------------------------------------------------------- | |
| /** | |
| * Test partial-tile reduction. (Specialized for sufficient resources) | |
| */ | |
| template < | |
| BlockReduceAlgorithm ALGORITHM, | |
| int BLOCK_DIM_X, | |
| int BLOCK_DIM_Y, | |
| int BLOCK_DIM_Z, | |
| typename T, | |
| typename ReductionOp> | |
| void TestPartialTile( | |
| GenMode gen_mode, | |
| int num_items, | |
| ReductionOp reduction_op, | |
| Int2Type<true> /*sufficient_resources*/) | |
| { | |
| const int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z; | |
| const int TILE_SIZE = BLOCK_THREADS; | |
| // Allocate host arrays | |
| T *h_in = new T[num_items]; | |
| T h_reference[1]; | |
| // Initialize problem | |
| Initialize(gen_mode, h_in, h_reference, reduction_op, num_items); | |
| // Initialize/clear device arrays | |
| T *d_in = NULL; | |
| T *d_out = NULL; | |
| clock_t *d_elapsed = NULL; | |
| CubDebugExit(g_allocator.DeviceAllocate((void**)&d_elapsed, sizeof(unsigned long long))); | |
| CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * TILE_SIZE)); | |
| CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * 1)); | |
| CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * num_items, cudaMemcpyHostToDevice)); | |
| CubDebugExit(cudaMemset(d_out, 0, sizeof(T) * 1)); | |
| printf("TestPartialTile %s, gen-mode %d, num_items(%d), BLOCK_THREADS(%d) (%d,%d,%d), %s (%d bytes) elements:\n", | |
| (ALGORITHM == BLOCK_REDUCE_RAKING) ? "BLOCK_REDUCE_RAKING" : (ALGORITHM == BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY) ? "BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY" : "BLOCK_REDUCE_WARP_REDUCTIONS", | |
| gen_mode, | |
| num_items, | |
| BLOCK_THREADS, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, | |
| typeid(T).name(), | |
| (int) sizeof(T)); | |
| fflush(stdout); | |
| dim3 block_dims(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z); | |
| PartialTileReduceKernel<ALGORITHM, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z><<<1, block_dims>>>( | |
| d_in, | |
| d_out, | |
| num_items, | |
| reduction_op, | |
| d_elapsed); | |
| CubDebugExit(cudaPeekAtLastError()); | |
| CubDebugExit(cudaDeviceSynchronize()); | |
| // Copy out and display results | |
| printf("\tReduction results: "); | |
| int compare = CompareDeviceResults(h_reference, d_out, 1, g_verbose, g_verbose); | |
| printf("%s\n", compare ? "FAIL" : "PASS"); | |
| AssertEquals(0, compare); | |
| printf("\tElapsed clocks: "); | |
| DisplayDeviceResults(d_elapsed, 1); | |
| // Cleanup | |
| if (h_in) delete[] h_in; | |
| if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in)); | |
| if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out)); | |
| if (d_elapsed) CubDebugExit(g_allocator.DeviceFree(d_elapsed)); | |
| } | |
| /** | |
| * Test partial-tile reduction (specialized for insufficient resources) | |
| */ | |
| template < | |
| BlockReduceAlgorithm ALGORITHM, | |
| int BLOCK_DIM_X, | |
| int BLOCK_DIM_Y, | |
| int BLOCK_DIM_Z, | |
| typename T, | |
| typename ReductionOp> | |
| void TestPartialTile( | |
| GenMode gen_mode, | |
| int num_items, | |
| ReductionOp reduction_op, | |
| Int2Type<false> sufficient_resources) | |
| {} | |
| /** | |
| * Run battery of partial-tile tests for different numbers of effective threads and thread dimensions | |
| */ | |
| template < | |
| BlockReduceAlgorithm ALGORITHM, | |
| int BLOCK_DIM_X, | |
| int BLOCK_DIM_Y, | |
| int BLOCK_DIM_Z, | |
| typename T, | |
| typename ReductionOp> | |
| void TestPartialTile( | |
| GenMode gen_mode, | |
| int num_items, | |
| ReductionOp reduction_op) | |
| { | |
| // Check size of smem storage for the target arch to make sure it will fit | |
| typedef BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, TEST_ARCH> BlockReduceT; | |
| enum | |
| { | |
| #if defined(SM100) || defined(SM110) || defined(SM130) | |
| sufficient_smem = sizeof(typename BlockReduceT::TempStorage) <= 16 * 1024, | |
| sufficient_threads = (BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z) <= 512, | |
| #else | |
| sufficient_smem = sizeof(typename BlockReduceT::TempStorage) <= 48 * 1024, | |
| sufficient_threads = (BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z) <= 1024, | |
| #endif | |
| }; | |
| TestPartialTile<ALGORITHM, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, T>(gen_mode, num_items, reduction_op, Int2Type<sufficient_smem && sufficient_threads>()); | |
| } | |
| /** | |
| * Run battery of partial-tile tests for different numbers of effective threads and thread dimensions | |
| */ | |
| template < | |
| BlockReduceAlgorithm ALGORITHM, | |
| int BLOCK_THREADS, | |
| typename T, | |
| typename ReductionOp> | |
| void TestPartialTile( | |
| GenMode gen_mode, | |
| ReductionOp reduction_op) | |
| { | |
| for ( | |
| int num_items = 1; | |
| num_items < BLOCK_THREADS; | |
| num_items += CUB_MAX(1, BLOCK_THREADS / 5)) | |
| { | |
| TestPartialTile<ALGORITHM, BLOCK_THREADS, 1, 1, T>(gen_mode, num_items, reduction_op); | |
| TestPartialTile<ALGORITHM, BLOCK_THREADS, 2, 2, T>(gen_mode, num_items, reduction_op); | |
| } | |
| } | |
| //--------------------------------------------------------------------- | |
| // Main | |
| //--------------------------------------------------------------------- | |
| /** | |
| * Run battery of full-tile tests for different gen modes | |
| */ | |
| template < | |
| BlockReduceAlgorithm ALGORITHM, | |
| int BLOCK_THREADS, | |
| typename T, | |
| typename ReductionOp> | |
| void Test( | |
| ReductionOp reduction_op) | |
| { | |
| TestFullTile<ALGORITHM, BLOCK_THREADS, T>(UNIFORM, reduction_op); | |
| TestPartialTile<ALGORITHM, BLOCK_THREADS, T>(UNIFORM, reduction_op); | |
| TestFullTile<ALGORITHM, BLOCK_THREADS, T>(INTEGER_SEED, reduction_op); | |
| TestPartialTile<ALGORITHM, BLOCK_THREADS, T>(INTEGER_SEED, reduction_op); | |
| if (Traits<T>::CATEGORY != FLOATING_POINT) | |
| { | |
| // Don't test randomly-generated floats b/c of stability | |
| TestFullTile<ALGORITHM, BLOCK_THREADS, T>(RANDOM, reduction_op); | |
| TestPartialTile<ALGORITHM, BLOCK_THREADS, T>(RANDOM, reduction_op); | |
| } | |
| } | |
| /** | |
| * Run battery of tests for different block-reduction algorithmic variants | |
| */ | |
| template < | |
| int BLOCK_THREADS, | |
| typename T, | |
| typename ReductionOp> | |
| void Test( | |
| ReductionOp reduction_op) | |
| { | |
| (void)reduction_op; | |
| #ifdef TEST_RAKING | |
| Test<BLOCK_REDUCE_RAKING, BLOCK_THREADS, T>(reduction_op); | |
| Test<BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY, BLOCK_THREADS, T>(reduction_op); | |
| #endif | |
| #ifdef TEST_WARP_REDUCTIONS | |
| Test<BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_THREADS, T>(reduction_op); | |
| #endif | |
| } | |
| /** | |
| * Run battery of tests for different block sizes | |
| */ | |
| template < | |
| typename T, | |
| typename ReductionOp> | |
| void Test( | |
| ReductionOp reduction_op) | |
| { | |
| Test<7, T>(reduction_op); | |
| Test<32, T>(reduction_op); | |
| Test<63, T>(reduction_op); | |
| Test<97, T>(reduction_op); | |
| Test<128, T>(reduction_op); | |
| Test<238, T>(reduction_op); | |
| } | |
| /** | |
| * Run battery of tests for different block sizes | |
| */ | |
| template <typename T> | |
| void Test() | |
| { | |
| Test<T>(Sum()); | |
| Test<T>(Max()); | |
| } | |
| /** | |
| * Main | |
| */ | |
| int main(int argc, char** argv) | |
| { | |
| // Initialize command line | |
| CommandLineArgs args(argc, argv); | |
| g_verbose = args.CheckCmdLineFlag("v"); | |
| args.GetCmdLineArgument("repeat", g_repeat); | |
| // Print usage | |
| if (args.CheckCmdLineFlag("help")) | |
| { | |
| printf("%s " | |
| "[--device=<device-id>] " | |
| "[--repeat=<repetitions of entire test suite>]" | |
| "[--v] " | |
| "\n", argv[0]); | |
| exit(0); | |
| } | |
| // Initialize device | |
| CubDebugExit(args.DeviceInit()); | |
| // Get ptx version | |
| int ptx_version = 0; | |
| CubDebugExit(PtxVersion(ptx_version)); | |
| #ifdef QUICK_TEST | |
| // Compile/run quick tests | |
| printf("\n full tile ------------------------\n\n"); | |
| TestFullTile<BLOCK_REDUCE_RAKING, 128, 1, 1, 4, int>(RANDOM, 1, Sum()); | |
| TestFullTile<BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY, 128, 1, 1, 4, int>(RANDOM, 1, Sum()); | |
| TestFullTile<BLOCK_REDUCE_WARP_REDUCTIONS, 128, 1, 1, 4, int>(RANDOM, 1, Sum()); | |
| TestFullTile<BLOCK_REDUCE_RAKING, 128, 1, 1, 1, int>(RANDOM, 1, Sum()); | |
| TestFullTile<BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY, 128, 1, 1, 1, int>(RANDOM, 1, Sum()); | |
| TestFullTile<BLOCK_REDUCE_WARP_REDUCTIONS, 128, 1, 1, 1, int>(RANDOM, 1, Sum()); | |
| printf("\n partial tile ------------------------\n\n"); | |
| TestPartialTile<BLOCK_REDUCE_RAKING, 128, 1, 1, int>(RANDOM, 7, Sum()); | |
| TestPartialTile<BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY, 128, 1, 1, int>(RANDOM, 7, Sum()); | |
| TestPartialTile<BLOCK_REDUCE_WARP_REDUCTIONS, 128, 1, 1, int>(RANDOM, 7, Sum()); | |
| #else | |
| // Compile/run thorough tests | |
| for (int i = 0; i <= g_repeat; ++i) | |
| { | |
| // primitives | |
| Test<char>(); | |
| Test<short>(); | |
| Test<int>(); | |
| Test<long long>(); | |
| if (ptx_version > 120) // Don't check doubles on PTX120 or below because they're down-converted | |
| Test<double>(); | |
| Test<float>(); | |
| // vector types | |
| Test<char2>(); | |
| Test<short2>(); | |
| Test<int2>(); | |
| Test<longlong2>(); | |
| Test<char4>(); | |
| Test<short4>(); | |
| Test<int4>(); | |
| Test<longlong4>(); | |
| // Complex types | |
| Test<TestFoo>(); | |
| Test<TestBar>(); | |
| } | |
| #endif | |
| return 0; | |
| } | |