/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 * Test of WarpReduce utilities
 ******************************************************************************/
// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR

#include <stdio.h>
#include <typeinfo>

#include <cub/util_allocator.cuh>
#include <cub/warp/warp_reduce.cuh>

#include "test_util.h"

using namespace cub;
//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------

bool                    g_verbose = false;      // Verbose output (set by --v)
int                     g_repeat  = 0;          // Extra repetitions of the suite (set by --repeat)
CachingDeviceAllocator  g_allocator(true);      // Caching allocator for device memory
/**
 * \brief WrapperFunctor (for precluding test-specialized dispatch to *Sum variants)
 */
template <
    typename    OpT,
    int         LOGICAL_WARP_THREADS>
struct WrapperFunctor
{
    OpT op;         // Wrapped binary reduction operator
    int num_valid;  // Number of valid lanes per logical warp

    inline __host__ __device__ WrapperFunctor(OpT op, int num_valid)
        : op(op), num_valid(num_valid)
    {}

    template <typename T>
    inline __host__ __device__ T operator()(const T &a, const T &b) const
    {
#if CUB_PTX_ARCH != 0
        // Device-side sanity check: an out-of-bounds lane must never
        // invoke the reduction operator
        if ((cub::LaneId() % LOGICAL_WARP_THREADS) >= num_valid)
            cub::ThreadTrap();
#endif
        return op(a, b);
    }
};
//---------------------------------------------------------------------
// Test kernels
//---------------------------------------------------------------------

/**
 * Generic reduction: forwards to the operator-taking WarpReduce entry points.
 */
template <
    typename    T,
    typename    ReductionOp,
    typename    WarpReduce,
    bool        PRIMITIVE = Traits<T>::PRIMITIVE>
struct DeviceTest
{
    // Full-tile reduction across the logical warp
    static __device__ __forceinline__ T Reduce(
        typename WarpReduce::TempStorage    &temp_storage,
        T                                   &data,
        ReductionOp                         &reduction_op)
    {
        return WarpReduce(temp_storage).Reduce(data, reduction_op);
    }

    // Partial-tile reduction (only the first valid_warp_threads lanes participate)
    static __device__ __forceinline__ T Reduce(
        typename WarpReduce::TempStorage    &temp_storage,
        T                                   &data,
        ReductionOp                         &reduction_op,
        const int                           &valid_warp_threads)
    {
        return WarpReduce(temp_storage).Reduce(data, reduction_op, valid_warp_threads);
    }

    // Segmented reduction keyed by head flags
    template <typename FlagT>
    static __device__ __forceinline__ T HeadSegmentedReduce(
        typename WarpReduce::TempStorage    &temp_storage,
        T                                   &data,
        FlagT                               &flag,
        ReductionOp                         &reduction_op)
    {
        return WarpReduce(temp_storage).HeadSegmentedReduce(data, flag, reduction_op);
    }

    // Segmented reduction keyed by tail flags
    template <typename FlagT>
    static __device__ __forceinline__ T TailSegmentedReduce(
        typename WarpReduce::TempStorage    &temp_storage,
        T                                   &data,
        FlagT                               &flag,
        ReductionOp                         &reduction_op)
    {
        return WarpReduce(temp_storage).TailSegmentedReduce(data, flag, reduction_op);
    }
};
/**
 * Summation: specialization for primitive types with the Sum operator,
 * exercising the dedicated *Sum entry points of WarpReduce.
 */
template <
    typename    T,
    typename    WarpReduce>
struct DeviceTest<T, Sum, WarpReduce, true>
{
    // Full-tile sum across the logical warp
    static __device__ __forceinline__ T Reduce(
        typename WarpReduce::TempStorage    &temp_storage,
        T                                   &data,
        Sum                                 &reduction_op)
    {
        return WarpReduce(temp_storage).Sum(data);
    }

    // Partial-tile sum (only the first valid_warp_threads lanes participate)
    static __device__ __forceinline__ T Reduce(
        typename WarpReduce::TempStorage    &temp_storage,
        T                                   &data,
        Sum                                 &reduction_op,
        const int                           &valid_warp_threads)
    {
        return WarpReduce(temp_storage).Sum(data, valid_warp_threads);
    }

    // Segmented sum keyed by head flags
    template <typename FlagT>
    static __device__ __forceinline__ T HeadSegmentedReduce(
        typename WarpReduce::TempStorage    &temp_storage,
        T                                   &data,
        FlagT                               &flag,
        Sum                                 &reduction_op)
    {
        return WarpReduce(temp_storage).HeadSegmentedSum(data, flag);
    }

    // Segmented sum keyed by tail flags
    template <typename FlagT>
    static __device__ __forceinline__ T TailSegmentedReduce(
        typename WarpReduce::TempStorage    &temp_storage,
        T                                   &data,
        FlagT                               &flag,
        Sum                                 &reduction_op)
    {
        return WarpReduce(temp_storage).TailSegmentedSum(data, flag);
    }
};
/**
 * Full-tile warp reduction kernel.
 *
 * Launch: 1 block of WARPS * LOGICAL_WARP_THREADS threads.
 * Lane 0 of each logical warp writes its warp aggregate to d_out;
 * every other lane echoes back its input.
 */
template <
    int         WARPS,
    int         LOGICAL_WARP_THREADS,
    typename    T,
    typename    ReductionOp>
__global__ void FullWarpReduceKernel(
    T           *d_in,
    T           *d_out,
    ReductionOp reduction_op,
    clock_t     *d_elapsed)
{
    // Cooperative warp-reduce utility type (1 warp)
    typedef WarpReduce<T, LOGICAL_WARP_THREADS> WarpReduce;

    // Shared temp storage, one instance per logical warp
    __shared__ typename WarpReduce::TempStorage temp_storage[WARPS];

    // Load per-thread tile item
    T thread_data = d_in[threadIdx.x];

    // Start timing (block fences discourage the compiler from hoisting clock())
    __threadfence_block();
    clock_t start = clock();
    __threadfence_block();

    // Test warp reduce
    int warp_id = threadIdx.x / LOGICAL_WARP_THREADS;
    T aggregate = DeviceTest<T, ReductionOp, WarpReduce>::Reduce(
        temp_storage[warp_id], thread_data, reduction_op);

    // Stop timing
    __threadfence_block();
    clock_t stop = clock();
    __threadfence_block();

    *d_elapsed = stop - start;

    // Store aggregate (lane 0 of each logical warp), otherwise the original input
    d_out[threadIdx.x] = (threadIdx.x % LOGICAL_WARP_THREADS == 0) ?
        aggregate :
        thread_data;
}
/**
 * Partially-full warp reduction kernel.
 *
 * Launch: 1 block of WARPS * LOGICAL_WARP_THREADS threads.
 * Only the first valid_warp_threads lanes of each logical warp contribute
 * to the reduction; lane 0 writes the aggregate, other lanes echo input.
 */
template <
    int         WARPS,
    int         LOGICAL_WARP_THREADS,
    typename    T,
    typename    ReductionOp>
__global__ void PartialWarpReduceKernel(
    T           *d_in,
    T           *d_out,
    ReductionOp reduction_op,
    clock_t     *d_elapsed,
    int         valid_warp_threads)
{
    // Cooperative warp-reduce utility type
    typedef WarpReduce<T, LOGICAL_WARP_THREADS> WarpReduce;

    // Shared temp storage, one instance per logical warp
    __shared__ typename WarpReduce::TempStorage temp_storage[WARPS];

    // Load per-thread tile item
    T thread_data = d_in[threadIdx.x];

    // Start timing (block fences discourage the compiler from hoisting clock())
    __threadfence_block();
    clock_t start = clock();
    __threadfence_block();

    // Test partial-warp reduce
    int warp_id = threadIdx.x / LOGICAL_WARP_THREADS;
    T aggregate = DeviceTest<T, ReductionOp, WarpReduce>::Reduce(
        temp_storage[warp_id], thread_data, reduction_op, valid_warp_threads);

    // Stop timing
    __threadfence_block();
    clock_t stop = clock();
    __threadfence_block();

    *d_elapsed = stop - start;

    // Store aggregate (lane 0 of each logical warp), otherwise the original input
    d_out[threadIdx.x] = (threadIdx.x % LOGICAL_WARP_THREADS == 0) ?
        aggregate :
        thread_data;
}
/**
 * Head-based segmented warp reduction test kernel.
 *
 * Launch: 1 block of WARPS * LOGICAL_WARP_THREADS threads.
 * Segment heads (flagged lanes and lane 0 of every logical warp) write their
 * segment aggregate; non-head lanes echo back their input.
 */
template <
    int         WARPS,
    int         LOGICAL_WARP_THREADS,
    typename    T,
    typename    FlagT,
    typename    ReductionOp>
__global__ void WarpHeadSegmentedReduceKernel(
    T           *d_in,
    FlagT       *d_head_flags,
    T           *d_out,
    ReductionOp reduction_op,
    clock_t     *d_elapsed)
{
    // Cooperative warp-reduce utility type
    typedef WarpReduce<T, LOGICAL_WARP_THREADS> WarpReduce;

    // Shared temp storage, one instance per logical warp
    __shared__ typename WarpReduce::TempStorage temp_storage[WARPS];

    // Load per-thread tile item and its head flag
    T       thread_data = d_in[threadIdx.x];
    FlagT   head_flag   = d_head_flags[threadIdx.x];

    // Start timing (block fences discourage the compiler from hoisting clock())
    __threadfence_block();
    clock_t start = clock();
    __threadfence_block();

    // Test segmented warp reduce
    int warp_id = threadIdx.x / LOGICAL_WARP_THREADS;
    T aggregate = DeviceTest<T, ReductionOp, WarpReduce>::HeadSegmentedReduce(
        temp_storage[warp_id], thread_data, head_flag, reduction_op);

    // Stop timing
    __threadfence_block();
    clock_t stop = clock();
    __threadfence_block();

    *d_elapsed = stop - start;

    // Store aggregate at segment heads; other lanes store their original input
    d_out[threadIdx.x] = ((threadIdx.x % LOGICAL_WARP_THREADS == 0) || head_flag) ?
        aggregate :
        thread_data;
}
/**
 * Tail-based segmented warp reduction test kernel.
 *
 * Launch: 1 block of WARPS * LOGICAL_WARP_THREADS threads.
 * A lane is a segment head when the previous lane was a tail (or it is
 * thread 0); head lanes write their segment aggregate, others echo input.
 */
template <
    int         WARPS,
    int         LOGICAL_WARP_THREADS,
    typename    T,
    typename    FlagT,
    typename    ReductionOp>
__global__ void WarpTailSegmentedReduceKernel(
    T           *d_in,
    FlagT       *d_tail_flags,
    T           *d_out,
    ReductionOp reduction_op,
    clock_t     *d_elapsed)
{
    // Cooperative warp-reduce utility type
    typedef WarpReduce<T, LOGICAL_WARP_THREADS> WarpReduce;

    // Shared temp storage, one instance per logical warp
    __shared__ typename WarpReduce::TempStorage temp_storage[WARPS];

    // Load per-thread tile item and flags; a lane is a segment head when the
    // preceding lane was flagged as a tail (thread 0 is implicitly a head)
    T       thread_data = d_in[threadIdx.x];
    FlagT   tail_flag   = d_tail_flags[threadIdx.x];
    FlagT   head_flag   = (threadIdx.x == 0) ?
        0 :
        d_tail_flags[threadIdx.x - 1];

    // Start timing (block fences discourage the compiler from hoisting clock())
    __threadfence_block();
    clock_t start = clock();
    __threadfence_block();

    // Test segmented warp reduce
    int warp_id = threadIdx.x / LOGICAL_WARP_THREADS;
    T aggregate = DeviceTest<T, ReductionOp, WarpReduce>::TailSegmentedReduce(
        temp_storage[warp_id], thread_data, tail_flag, reduction_op);

    // Stop timing
    __threadfence_block();
    clock_t stop = clock();
    __threadfence_block();

    *d_elapsed = stop - start;

    // Store aggregate at segment heads; other lanes store their original input
    d_out[threadIdx.x] = ((threadIdx.x % LOGICAL_WARP_THREADS == 0) || head_flag) ?
        aggregate :
        thread_data;
}
//---------------------------------------------------------------------
// Host utility subroutines
//---------------------------------------------------------------------

/**
 * Initialize reduction problem (and solution).
 *
 * Fills h_in with gen_mode-generated values and h_flags with random segment
 * flags (entropy-reduced by flag_entropy), then computes per-warp reference
 * results: h_head_out holds head-segmented aggregates, h_tail_out holds
 * tail-segmented aggregates.  Lane 0 of each warp is implicitly a segment
 * head; only the first valid_warp_threads items of each warp are accumulated.
 */
template <
    typename    T,
    typename    ReductionOp>
void Initialize(
    GenMode     gen_mode,
    int         flag_entropy,
    T           *h_in,
    int         *h_flags,
    int         warps,
    int         warp_threads,
    int         valid_warp_threads,
    ReductionOp reduction_op,
    T           *h_head_out,
    T           *h_tail_out)
{
    // Generate input values and random segment flags
    for (int item = 0; item < warps * warp_threads; ++item)
    {
        // Sample a value for this item
        InitValue(gen_mode, h_in[item], item);
        h_head_out[item] = h_in[item];
        h_tail_out[item] = h_in[item];

        // Sample whether or not this item will be a segment head
        char bits;
        RandomBits(bits, flag_entropy);
        h_flags[item] = bits & 0x1;
    }

    // Accumulate segments (lane 0 of each warp is implicitly a segment head)
    for (int warp = 0; warp < warps; ++warp)
    {
        int warp_offset = warp * warp_threads;
        int last_item   = warp_offset + valid_warp_threads - 1;

        // Seed both running aggregates with the last valid item in the warp
        T head_aggregate = h_in[last_item];
        T tail_aggregate = h_in[last_item];
        if (h_flags[last_item])
            h_head_out[last_item] = head_aggregate;

        // Work backwards through the remaining valid items
        for (int item = last_item - 1; item >= warp_offset; --item)
        {
            // A head flag on the successor starts a fresh head-segment here
            head_aggregate = (h_flags[item + 1]) ?
                h_in[item] :
                reduction_op(head_aggregate, h_in[item]);

            if (h_flags[item])
            {
                // This item heads a segment: record the head aggregate here,
                // close out the tail aggregate at the successor, and restart it
                h_head_out[item]     = head_aggregate;
                h_tail_out[item + 1] = tail_aggregate;
                tail_aggregate       = h_in[item];
            }
            else
            {
                tail_aggregate = reduction_op(tail_aggregate, h_in[item]);
            }
        }

        // Record last segment head_aggregate to head offset
        h_head_out[warp_offset] = head_aggregate;
        h_tail_out[warp_offset] = tail_aggregate;
    }
}
/**
 * Test warp reduction.
 *
 * Builds a host reference via Initialize(), runs either the full-warp or
 * partial-warp kernel depending on valid_warp_threads, and compares device
 * output against the reference (aborting on mismatch via AssertEquals).
 */
template <
    int         WARPS,
    int         LOGICAL_WARP_THREADS,
    typename    T,
    typename    ReductionOp>
void TestReduce(
    GenMode     gen_mode,
    ReductionOp reduction_op,
    int         valid_warp_threads = LOGICAL_WARP_THREADS)
{
    const int BLOCK_THREADS = LOGICAL_WARP_THREADS * WARPS;

    // Allocate host arrays
    T   *h_in       = new T[BLOCK_THREADS];
    int *h_flags    = new int[BLOCK_THREADS];
    T   *h_out      = new T[BLOCK_THREADS];
    T   *h_tail_out = new T[BLOCK_THREADS];

    // Initialize problem and reference solution (segment flags unused here)
    Initialize(gen_mode, -1, h_in, h_flags, WARPS, LOGICAL_WARP_THREADS, valid_warp_threads, reduction_op, h_out, h_tail_out);

    // Initialize/clear device arrays
    T       *d_in      = NULL;
    T       *d_out     = NULL;
    clock_t *d_elapsed = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * BLOCK_THREADS));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * BLOCK_THREADS));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_elapsed, sizeof(clock_t)));
    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * BLOCK_THREADS, cudaMemcpyHostToDevice));
    CubDebugExit(cudaMemset(d_out, 0, sizeof(T) * BLOCK_THREADS));

    if (g_verbose)
    {
        printf("Data:\n");
        for (int warp = 0; warp < WARPS; ++warp)
            DisplayResults(h_in + (warp * LOGICAL_WARP_THREADS), valid_warp_threads);
    }

    // Describe the configuration under test
    printf("\nGen-mode %d, %d warps, %d warp threads, %d valid lanes, %s (%d bytes) elements:\n",
        gen_mode,
        WARPS,
        LOGICAL_WARP_THREADS,
        valid_warp_threads,
        typeid(T).name(),
        (int) sizeof(T));
    fflush(stdout);

    if (valid_warp_threads == LOGICAL_WARP_THREADS)
    {
        // Run full-warp kernel
        FullWarpReduceKernel<WARPS, LOGICAL_WARP_THREADS><<<1, BLOCK_THREADS>>>(
            d_in,
            d_out,
            reduction_op,
            d_elapsed);
    }
    else
    {
        // Run partial-warp kernel
        PartialWarpReduceKernel<WARPS, LOGICAL_WARP_THREADS><<<1, BLOCK_THREADS>>>(
            d_in,
            d_out,
            reduction_op,
            d_elapsed,
            valid_warp_threads);
    }

    // Check for launch errors, then for asynchronous execution errors
    CubDebugExit(cudaPeekAtLastError());
    CubDebugExit(cudaDeviceSynchronize());

    // Copy out and display results
    printf("\tReduction results: ");
    int compare = CompareDeviceResults(h_out, d_out, BLOCK_THREADS, g_verbose, g_verbose);
    printf("%s\n", compare ? "FAIL" : "PASS");
    AssertEquals(0, compare);

    printf("\tElapsed clocks: ");
    DisplayDeviceResults(d_elapsed, 1);

    // Cleanup
    if (h_in)       delete[] h_in;
    if (h_flags)    delete[] h_flags;
    if (h_out)      delete[] h_out;
    if (h_tail_out) delete[] h_tail_out;
    if (d_in)       CubDebugExit(g_allocator.DeviceFree(d_in));
    if (d_out)      CubDebugExit(g_allocator.DeviceFree(d_out));
    if (d_elapsed)  CubDebugExit(g_allocator.DeviceFree(d_elapsed));
}
/**
 * Test warp segmented reduction.
 *
 * Builds head/tail reference solutions via Initialize(), runs the head-based
 * and tail-based segmented kernels over the same input/flags, and compares
 * each device output against its reference (aborting on mismatch).
 */
template <
    int         WARPS,
    int         LOGICAL_WARP_THREADS,
    typename    T,
    typename    ReductionOp>
void TestSegmentedReduce(
    GenMode     gen_mode,
    int         flag_entropy,
    ReductionOp reduction_op)
{
    const int BLOCK_THREADS = LOGICAL_WARP_THREADS * WARPS;

    // Allocate host arrays
    int compare;
    T   *h_in       = new T[BLOCK_THREADS];
    int *h_flags    = new int[BLOCK_THREADS];
    T   *h_head_out = new T[BLOCK_THREADS];
    T   *h_tail_out = new T[BLOCK_THREADS];

    // Initialize problem and both reference solutions (full warps)
    Initialize(gen_mode, flag_entropy, h_in, h_flags, WARPS, LOGICAL_WARP_THREADS, LOGICAL_WARP_THREADS, reduction_op, h_head_out, h_tail_out);

    // Initialize/clear device arrays
    T       *d_in       = NULL;
    int     *d_flags    = NULL;
    T       *d_head_out = NULL;
    T       *d_tail_out = NULL;
    clock_t *d_elapsed  = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * BLOCK_THREADS));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_flags, sizeof(int) * BLOCK_THREADS));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_head_out, sizeof(T) * BLOCK_THREADS));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_tail_out, sizeof(T) * BLOCK_THREADS));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_elapsed, sizeof(clock_t)));
    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * BLOCK_THREADS, cudaMemcpyHostToDevice));
    CubDebugExit(cudaMemcpy(d_flags, h_flags, sizeof(int) * BLOCK_THREADS, cudaMemcpyHostToDevice));
    CubDebugExit(cudaMemset(d_head_out, 0, sizeof(T) * BLOCK_THREADS));
    CubDebugExit(cudaMemset(d_tail_out, 0, sizeof(T) * BLOCK_THREADS));

    if (g_verbose)
    {
        printf("Data:\n");
        for (int warp = 0; warp < WARPS; ++warp)
            DisplayResults(h_in + (warp * LOGICAL_WARP_THREADS), LOGICAL_WARP_THREADS);

        printf("\nFlags:\n");
        for (int warp = 0; warp < WARPS; ++warp)
            DisplayResults(h_flags + (warp * LOGICAL_WARP_THREADS), LOGICAL_WARP_THREADS);
    }

    // Describe the configuration under test
    printf("\nGen-mode %d, head flag entropy reduction %d, %d warps, %d warp threads, %s (%d bytes) elements:\n",
        gen_mode,
        flag_entropy,
        WARPS,
        LOGICAL_WARP_THREADS,
        typeid(T).name(),
        (int) sizeof(T));
    fflush(stdout);

    // Run head-based kernel
    WarpHeadSegmentedReduceKernel<WARPS, LOGICAL_WARP_THREADS><<<1, BLOCK_THREADS>>>(
        d_in,
        d_flags,
        d_head_out,
        reduction_op,
        d_elapsed);
    CubDebugExit(cudaPeekAtLastError());
    CubDebugExit(cudaDeviceSynchronize());

    // Copy out and display results
    printf("\tHead-based segmented reduction results: ");
    compare = CompareDeviceResults(h_head_out, d_head_out, BLOCK_THREADS, g_verbose, g_verbose);
    printf("%s\n", compare ? "FAIL" : "PASS");
    AssertEquals(0, compare);

    printf("\tElapsed clocks: ");
    DisplayDeviceResults(d_elapsed, 1);

    // Run tail-based kernel
    WarpTailSegmentedReduceKernel<WARPS, LOGICAL_WARP_THREADS><<<1, BLOCK_THREADS>>>(
        d_in,
        d_flags,
        d_tail_out,
        reduction_op,
        d_elapsed);
    CubDebugExit(cudaPeekAtLastError());
    CubDebugExit(cudaDeviceSynchronize());

    // Copy out and display results
    printf("\tTail-based segmented reduction results: ");
    compare = CompareDeviceResults(h_tail_out, d_tail_out, BLOCK_THREADS, g_verbose, g_verbose);
    printf("%s\n", compare ? "FAIL" : "PASS");
    AssertEquals(0, compare);

    printf("\tElapsed clocks: ");
    DisplayDeviceResults(d_elapsed, 1);

    // Cleanup
    if (h_in)       delete[] h_in;
    if (h_flags)    delete[] h_flags;
    if (h_head_out) delete[] h_head_out;
    if (h_tail_out) delete[] h_tail_out;
    if (d_in)       CubDebugExit(g_allocator.DeviceFree(d_in));
    if (d_flags)    CubDebugExit(g_allocator.DeviceFree(d_flags));
    if (d_head_out) CubDebugExit(g_allocator.DeviceFree(d_head_out));
    if (d_tail_out) CubDebugExit(g_allocator.DeviceFree(d_tail_out));
    if (d_elapsed)  CubDebugExit(g_allocator.DeviceFree(d_elapsed));
}
/**
 * Run battery of tests for different full and partial tile sizes.
 */
template <
    int         WARPS,
    int         LOGICAL_WARP_THREADS,
    typename    T,
    typename    ReductionOp>
void Test(
    GenMode     gen_mode,
    ReductionOp reduction_op)
{
    // Partial tiles: sample a handful of valid-lane counts below the full warp
    int lane_step = CUB_MAX(1, LOGICAL_WARP_THREADS / 5);
    for (int valid_warp_threads = 1;
         valid_warp_threads < LOGICAL_WARP_THREADS;
         valid_warp_threads += lane_step)
    {
        // Without wrapper (to test non-excepting PTX POD-op specializations)
        TestReduce<WARPS, LOGICAL_WARP_THREADS, T>(gen_mode, reduction_op, valid_warp_threads);

        // With wrapper to ensure no ops called on OOB lanes
        WrapperFunctor<ReductionOp, LOGICAL_WARP_THREADS> wrapped_op(reduction_op, valid_warp_threads);
        TestReduce<WARPS, LOGICAL_WARP_THREADS, T>(gen_mode, wrapped_op, valid_warp_threads);
    }

    // Full tile
    TestReduce<WARPS, LOGICAL_WARP_THREADS, T>(gen_mode, reduction_op, LOGICAL_WARP_THREADS);

    // Segmented reduction with different head-flag entropies
    for (int flag_entropy = 0; flag_entropy < 10; ++flag_entropy)
    {
        TestSegmentedReduce<WARPS, LOGICAL_WARP_THREADS, T>(gen_mode, flag_entropy, reduction_op);
    }
}
/**
 * Run battery of tests for different data types and reduce ops.
 */
template <
    int WARPS,
    int LOGICAL_WARP_THREADS>
void Test(GenMode gen_mode)
{
    // Primitive types with Sum
    Test<WARPS, LOGICAL_WARP_THREADS, char>(               gen_mode, Sum());
    Test<WARPS, LOGICAL_WARP_THREADS, short>(              gen_mode, Sum());
    Test<WARPS, LOGICAL_WARP_THREADS, int>(                gen_mode, Sum());
    Test<WARPS, LOGICAL_WARP_THREADS, long long>(          gen_mode, Sum());
    Test<WARPS, LOGICAL_WARP_THREADS, unsigned char>(      gen_mode, Sum());
    Test<WARPS, LOGICAL_WARP_THREADS, unsigned short>(     gen_mode, Sum());
    Test<WARPS, LOGICAL_WARP_THREADS, unsigned int>(       gen_mode, Sum());
    Test<WARPS, LOGICAL_WARP_THREADS, unsigned long long>( gen_mode, Sum());

    // Floating point only with deterministic generation (reduction order
    // differences would otherwise break exact comparison)
    if (gen_mode != RANDOM)
    {
        Test<WARPS, LOGICAL_WARP_THREADS, float>(          gen_mode, Sum());
        Test<WARPS, LOGICAL_WARP_THREADS, double>(         gen_mode, Sum());
    }

    // Primitive types with an alternative (non-Sum) reduce op
    Test<WARPS, LOGICAL_WARP_THREADS, unsigned char>(      gen_mode, Max());
    Test<WARPS, LOGICAL_WARP_THREADS, unsigned short>(     gen_mode, Max());
    Test<WARPS, LOGICAL_WARP_THREADS, unsigned int>(       gen_mode, Max());
    Test<WARPS, LOGICAL_WARP_THREADS, unsigned long long>( gen_mode, Max());

    // vec-1
    Test<WARPS, LOGICAL_WARP_THREADS, uchar1>(             gen_mode, Sum());

    // vec-2
    Test<WARPS, LOGICAL_WARP_THREADS, uchar2>(             gen_mode, Sum());
    Test<WARPS, LOGICAL_WARP_THREADS, ushort2>(            gen_mode, Sum());
    Test<WARPS, LOGICAL_WARP_THREADS, uint2>(              gen_mode, Sum());
    Test<WARPS, LOGICAL_WARP_THREADS, ulonglong2>(         gen_mode, Sum());

    // vec-4
    Test<WARPS, LOGICAL_WARP_THREADS, uchar4>(             gen_mode, Sum());
    Test<WARPS, LOGICAL_WARP_THREADS, ushort4>(            gen_mode, Sum());
    Test<WARPS, LOGICAL_WARP_THREADS, uint4>(              gen_mode, Sum());
    Test<WARPS, LOGICAL_WARP_THREADS, ulonglong4>(         gen_mode, Sum());

    // Complex (user-defined) types
    Test<WARPS, LOGICAL_WARP_THREADS, TestFoo>(            gen_mode, Sum());
    Test<WARPS, LOGICAL_WARP_THREADS, TestBar>(            gen_mode, Sum());
}
/**
 * Run battery of tests for different problem generation options.
 */
template <
    int WARPS,
    int LOGICAL_WARP_THREADS>
void Test()
{
    Test<WARPS, LOGICAL_WARP_THREADS>(UNIFORM);      // constant values
    Test<WARPS, LOGICAL_WARP_THREADS>(INTEGER_SEED); // deterministic integer seeds
    Test<WARPS, LOGICAL_WARP_THREADS>(RANDOM);       // random values
}
/**
 * Run battery of tests for different number of active warps.
 */
template <int LOGICAL_WARP_THREADS>
void Test()
{
    Test<1, LOGICAL_WARP_THREADS>();

    // Only power-of-two subwarps can be tiled
    if ((LOGICAL_WARP_THREADS == 32) || PowerOfTwo<LOGICAL_WARP_THREADS>::VALUE)
        Test<2, LOGICAL_WARP_THREADS>();
}
/**
 * Main.
 *
 * Flags: --device=<id> selects the GPU, --v enables verbose output,
 * --repeat=<n> repeats the full suite n additional times.
 */
int main(int argc, char** argv)
{
    // Initialize command line
    CommandLineArgs args(argc, argv);
    g_verbose = args.CheckCmdLineFlag("v");
    args.GetCmdLineArgument("repeat", g_repeat);

    // Print usage
    if (args.CheckCmdLineFlag("help"))
    {
        // Fixed: separator space was missing between the --repeat and --v
        // fragments, printing "...suite>][--v]" as one token
        printf("%s "
            "[--device=<device-id>] "
            "[--repeat=<repetitions of entire test suite>] "
            "[--v] "
            "\n", argv[0]);
        exit(0);
    }

    // Initialize device
    CubDebugExit(args.DeviceInit());

#ifdef QUICK_TEST

    // Compile/run quick tests
    TestReduce<1, 32, int>(UNIFORM, Sum());
    TestReduce<1, 32, double>(UNIFORM, Sum());
    TestReduce<2, 16, TestBar>(UNIFORM, Sum());
    TestSegmentedReduce<1, 32, int>(UNIFORM, 1, Sum());

#else

    // Compile/run thorough tests
    for (int i = 0; i <= g_repeat; ++i)
    {
        // Test logical warp sizes
        Test<32>();
        Test<16>();
        Test<9>();
        Test<7>();
    }

#endif

    return 0;
}