/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Test of DeviceReduce utilities ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "test_util.h" #include #define TEST_HALF_T !_NVHPC_CUDA #define TEST_BF_T !_NVHPC_CUDA #if TEST_HALF_T #include // Half support is provided by SM53+. We currently test against a few older architectures. // The specializations below can be removed once we drop these architectures. namespace cub { template <> __host__ __device__ __forceinline__ // __half Min::operator()(__half& a, __half& b) const { NV_IF_TARGET(NV_PROVIDES_SM_53, (return CUB_MIN(a, b);), (return CUB_MIN(__half2float(a), __half2float(b));)); } template <> __host__ __device__ __forceinline__ // KeyValuePair ArgMin::operator()(const KeyValuePair &a, const KeyValuePair &b) const { const float av = __half2float(a.value); const float bv = __half2float(b.value); if ((bv < av) || ((av == bv) && (b.key < a.key))) { return b; } return a; } template <> __host__ __device__ __forceinline__ // __half Max::operator()(__half& a, __half& b) const { NV_IF_TARGET(NV_PROVIDES_SM_53, (return CUB_MAX(a, b);), (return CUB_MAX(__half2float(a), __half2float(b));)); } template <> __host__ __device__ __forceinline__ // KeyValuePair ArgMax::operator()(const KeyValuePair &a, const KeyValuePair &b) const { const float av = __half2float(a.value); const float bv = __half2float(b.value); if ((bv > av) || ((av == bv) && (b.key < a.key))) { return b; } return a; } } // namespace cub #endif #if TEST_BF_T #include #endif using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- int g_ptx_version; int g_sm_count; double g_device_giga_bandwidth; bool g_verbose = false; bool g_verbose_input = false; int g_timing_iterations = 0; CachingDeviceAllocator g_allocator(true); // Dispatch types enum Backend { CUB, // CUB method CUB_SEGMENTED, // CUB segmented method CDP, // GPU-based (dynamic parallelism) dispatch to CUB method CDP_SEGMENTED, // GPU-based segmented method }; inline const char* BackendToString(Backend b) { switch (b) { case CUB: return "CUB"; case CUB_SEGMENTED: return "CUB_SEGMENTED"; case CDP: return "CDP"; case CDP_SEGMENTED: return "CDP_SEGMENTED"; default: break; } return ""; } // Custom max functor struct CustomMax { /// Boolean max operator, returns (a > b) ? a : b template __host__ __device__ auto operator()(T&& a, C&& b) -> cub::detail::accumulator_t { return CUB_MAX(a, b); } #if TEST_HALF_T __host__ __device__ __half operator()(__half& a, __half& b) { return cub::Max{}(a, b); } #endif }; // Comparing results computed on CPU and GPU for extended floating point types is impossible. // For instance, when used with a constant iterator of two, the accumulator in sequential reference // computation (CPU) bumps into the 4096 limits, which will never change (`4096 + 2 = 4096`). // Meanwhile, per-thread aggregates (`2 * 16 = 32`) are accumulated within and among thread blocks, // yielding `inf` as a result. No reasonable epsilon can be selected to compare `inf` with `4096`. // To make `__half` and `__nv_bfloat16` arithmetic associative, the function object below raises // extended floating points to the area of unsigned short integers. This allows us to test large // inputs with few code-path differences in device algorithms. struct ExtendedFloatSum { template __host__ __device__ T operator()(T a, T b) const { T result{}; result.__x = a.raw() + b.raw(); return result; } #if TEST_HALF_T __host__ __device__ __half operator()(__half a, __half b) const { uint16_t result = this->operator()(half_t{a}, half_t(b)).raw(); return reinterpret_cast<__half &>(result); } #endif #if TEST_BF_T __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const { uint16_t result = this->operator()(bfloat16_t{a}, bfloat16_t(b)).raw(); return reinterpret_cast<__nv_bfloat16 &>(result); } #endif }; //--------------------------------------------------------------------- // Dispatch to different CUB DeviceReduce entrypoints //--------------------------------------------------------------------- /** * Dispatch to reduce entrypoint (custom-max) */ template CUB_RUNTIME_FUNCTION cudaError_t Dispatch( Int2Type /*dispatch_to*/, int timing_iterations, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, int /*max_segments*/, BeginOffsetIteratorT /*d_segment_begin_offsets*/, EndOffsetIteratorT /*d_segment_end_offsets*/, ReductionOpT reduction_op) { using InputT = cub::detail::value_t; // The output value type using OutputT = cub::detail::non_void_value_t; // Max-identity OutputT identity = Traits::Lowest(); // replace with std::numeric_limits::lowest() when C++ support is more prevalent // Invoke kernel to device reduction directly cudaError_t error = cudaSuccess; for (int i = 0; i < timing_iterations; ++i) { error = DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, reduction_op, identity); } return error; } /** * Dispatch to sum entrypoint */ template CUB_RUNTIME_FUNCTION cudaError_t Dispatch( Int2Type /*dispatch_to*/, int timing_iterations, size_t * /*d_temp_storage_bytes*/, cudaError_t * /*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, int /*max_segments*/, BeginOffsetIteratorT /*d_segment_begin_offsets*/, EndOffsetIteratorT /*d_segment_end_offsets*/, cub::Sum /*reduction_op*/) { // Invoke kernel to device reduction directly cudaError_t error = cudaSuccess; for (int i = 0; i < timing_iterations; ++i) { error = DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); } return error; } /** * Dispatch to extended fp sum entrypoint */ template CUB_RUNTIME_FUNCTION cudaError_t Dispatch(Int2Type /*dispatch_to*/, int timing_iterations, size_t * /*d_temp_storage_bytes*/, cudaError_t * /*d_cdp_error*/, void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, int /*max_segments*/, BeginOffsetIteratorT /*d_segment_begin_offsets*/, EndOffsetIteratorT /*d_segment_end_offsets*/, ExtendedFloatSum reduction_op) { using InputT = cub::detail::value_t; using OutputT = cub::detail::non_void_value_t; OutputT identity{}; // Invoke kernel to device reduction directly cudaError_t error = cudaSuccess; for (int i = 0; i < timing_iterations; ++i) { error = DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, reduction_op, identity); } return error; } /** * Dispatch to min entrypoint */ template CUB_RUNTIME_FUNCTION cudaError_t Dispatch( Int2Type /*dispatch_to*/, int timing_iterations, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, int /*max_segments*/, BeginOffsetIteratorT /*d_segment_begin_offsets*/, EndOffsetIteratorT /*d_segment_end_offsets*/, cub::Min /*reduction_op*/) { // Invoke kernel to device reduction directly cudaError_t error = cudaSuccess; for (int i = 0; i < timing_iterations; ++i) { error = DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); } return error; } /** * Dispatch to max entrypoint */ template CUB_RUNTIME_FUNCTION cudaError_t Dispatch( Int2Type /*dispatch_to*/, int timing_iterations, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, int /*max_segments*/, BeginOffsetIteratorT /*d_segment_begin_offsets*/, EndOffsetIteratorT /*d_segment_end_offsets*/, cub::Max /*reduction_op*/) { // Invoke kernel to device reduction directly cudaError_t error = cudaSuccess; for (int i = 0; i < timing_iterations; ++i) { error = DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); } return error; } /** * Dispatch to argmin entrypoint */ template CUB_RUNTIME_FUNCTION cudaError_t Dispatch( Int2Type /*dispatch_to*/, int timing_iterations, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, int /*max_segments*/, BeginOffsetIteratorT /*d_segment_begin_offsets*/, EndOffsetIteratorT /*d_segment_end_offsets*/, cub::ArgMin /*reduction_op*/) { // Invoke kernel to device reduction directly cudaError_t error = cudaSuccess; for (int i = 0; i < timing_iterations; ++i) { error = DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); } return error; } /** * Dispatch to argmax entrypoint */ template CUB_RUNTIME_FUNCTION cudaError_t Dispatch( Int2Type /*dispatch_to*/, int timing_iterations, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, int /*max_segments*/, BeginOffsetIteratorT /*d_segment_begin_offsets*/, EndOffsetIteratorT /*d_segment_end_offsets*/, cub::ArgMax /*reduction_op*/) { // Invoke kernel to device reduction directly cudaError_t error = cudaSuccess; for (int i = 0; i < timing_iterations; ++i) { error = DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); } return error; } //--------------------------------------------------------------------- // Dispatch to different CUB DeviceSegmentedReduce entrypoints //--------------------------------------------------------------------- /** * Dispatch to reduce entrypoint (custom-max) */ template CUB_RUNTIME_FUNCTION cudaError_t Dispatch( Int2Type /*dispatch_to*/, int timing_iterations, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int /*num_items*/, int max_segments, BeginOffsetIteratorT d_segment_begin_offsets, EndOffsetIteratorT d_segment_end_offsets, ReductionOpT reduction_op) { // The input value type using InputT = cub::detail::value_t; // The output value type using OutputT = cub::detail::non_void_value_t; // Max-identity OutputT identity = Traits::Lowest(); // replace with std::numeric_limits::lowest() when C++ support is more prevalent // Invoke kernel to device reduction directly cudaError_t error = cudaSuccess; for (int i = 0; i < timing_iterations; ++i) { error = DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, max_segments, d_segment_begin_offsets, d_segment_end_offsets, reduction_op, identity); } return error; } /** * Dispatch to sum entrypoint */ template CUB_RUNTIME_FUNCTION cudaError_t Dispatch( Int2Type /*dispatch_to*/, int timing_iterations, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int /*num_items*/, int max_segments, BeginOffsetIteratorT d_segment_begin_offsets, EndOffsetIteratorT d_segment_end_offsets, cub::Sum /*reduction_op*/) { // Invoke kernel to device reduction directly cudaError_t error = cudaSuccess; for (int i = 0; i < timing_iterations; ++i) { error = DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, max_segments, d_segment_begin_offsets, d_segment_end_offsets); } return error; } /** * Dispatch to extended fp sum entrypoint */ template CUB_RUNTIME_FUNCTION cudaError_t Dispatch(Int2Type /*dispatch_to*/, int timing_iterations, size_t * /*d_temp_storage_bytes*/, cudaError_t * /*d_cdp_error*/, void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int /*num_items*/, int max_segments, BeginOffsetIteratorT d_segment_begin_offsets, EndOffsetIteratorT d_segment_end_offsets, ExtendedFloatSum reduction_op) { using InputT = cub::detail::value_t; using OutputT = cub::detail::non_void_value_t; OutputT identity{}; // Invoke kernel to device reduction directly cudaError_t error = cudaSuccess; for (int i = 0; i < timing_iterations; ++i) { error = DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, max_segments, d_segment_begin_offsets, d_segment_end_offsets, reduction_op, identity); } return error; } /** * Dispatch to min entrypoint */ template CUB_RUNTIME_FUNCTION cudaError_t Dispatch( Int2Type /*dispatch_to*/, int timing_iterations, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int /*num_items*/, int max_segments, BeginOffsetIteratorT d_segment_begin_offsets, EndOffsetIteratorT d_segment_end_offsets, cub::Min /*reduction_op*/) { // Invoke kernel to device reduction directly cudaError_t error = cudaSuccess; for (int i = 0; i < timing_iterations; ++i) { error = DeviceSegmentedReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, max_segments, d_segment_begin_offsets, d_segment_end_offsets); } return error; } /** * Dispatch to max entrypoint */ template CUB_RUNTIME_FUNCTION cudaError_t Dispatch( Int2Type /*dispatch_to*/, int timing_iterations, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int /*num_items*/, int max_segments, BeginOffsetIteratorT d_segment_begin_offsets, EndOffsetIteratorT d_segment_end_offsets, cub::Max /*reduction_op*/) { // Invoke kernel to device reduction directly cudaError_t error = cudaSuccess; for (int i = 0; i < timing_iterations; ++i) { error = DeviceSegmentedReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out, max_segments, d_segment_begin_offsets, d_segment_end_offsets); } return error; } /** * Dispatch to argmin entrypoint */ template CUB_RUNTIME_FUNCTION cudaError_t Dispatch( Int2Type /*dispatch_to*/, int timing_iterations, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int /*num_items*/, int max_segments, BeginOffsetIteratorT d_segment_begin_offsets, EndOffsetIteratorT d_segment_end_offsets, cub::ArgMin /*reduction_op*/) { // Invoke kernel to device reduction directly cudaError_t error = cudaSuccess; for (int i = 0; i < timing_iterations; ++i) { error = DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, max_segments, d_segment_begin_offsets, d_segment_end_offsets); } return error; } /** * Dispatch to argmax entrypoint */ template CUB_RUNTIME_FUNCTION cudaError_t Dispatch( Int2Type /*dispatch_to*/, int timing_iterations, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int /*num_items*/, int max_segments, BeginOffsetIteratorT d_segment_begin_offsets, EndOffsetIteratorT d_segment_end_offsets, cub::ArgMax /*reduction_op*/) { // Invoke kernel to device reduction directly cudaError_t error = cudaSuccess; for (int i = 0; i < timing_iterations; ++i) { error = DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, max_segments, d_segment_begin_offsets, d_segment_end_offsets); } return error; } //--------------------------------------------------------------------- // CUDA nested-parallelism test kernel //--------------------------------------------------------------------- #if TEST_CDP == 1 /** * Simple wrapper kernel to invoke DeviceReduce */ template __global__ void CDPDispatchKernel(Int2Type cub_backend, int timing_iterations, size_t *d_temp_storage_bytes, cudaError_t *d_cdp_error, void *d_temp_storage, size_t temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, int max_segments, BeginOffsetIteratorT d_segment_begin_offsets, EndOffsetIteratorT d_segment_end_offsets, ReductionOpT reduction_op) { *d_cdp_error = Dispatch(cub_backend, timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, max_segments, d_segment_begin_offsets, d_segment_end_offsets, reduction_op); *d_temp_storage_bytes = temp_storage_bytes; } /** * Launch kernel and dispatch on device. Should only be called from host code. * The CubBackend should be one of the non-CDP CUB backends to invoke from the * device. */ template cudaError_t LaunchCDPKernel(Int2Type cub_backend, int timing_iterations, size_t *d_temp_storage_bytes, cudaError_t *d_cdp_error, void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, int num_items, int max_segments, BeginOffsetIteratorT d_segment_begin_offsets, EndOffsetIteratorT d_segment_end_offsets, ReductionOpT reduction_op) { cudaError_t retval = thrust::cuda_cub::launcher::triple_chevron(1, 1, 0, 0) .doit(CDPDispatchKernel, cub_backend, timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, max_segments, d_segment_begin_offsets, d_segment_end_offsets, reduction_op); CubDebugExit(retval); CubDebugExit(cub::detail::device_synchronize()); // Copy out temp_storage_bytes CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost)); // Copy out error CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost)); return retval; } // Specializations of Dispatch that translate the CDP backend to the appropriate // CUB backend, and uses the CUB backend to launch the CDP kernel. #define DEFINE_CDP_DISPATCHER(CdpBackend, CubBackend) \ template \ cudaError_t Dispatch(Int2Type, \ int timing_iterations, \ size_t *d_temp_storage_bytes, \ cudaError_t *d_cdp_error, \ \ void *d_temp_storage, \ size_t &temp_storage_bytes, \ InputIteratorT d_in, \ OutputIteratorT d_out, \ int num_items, \ int max_segments, \ BeginOffsetIteratorT d_segment_begin_offsets, \ EndOffsetIteratorT d_segment_end_offsets, \ ReductionOpT reduction_op) \ { \ Int2Type cub_backend{}; \ return LaunchCDPKernel(cub_backend, \ timing_iterations, \ d_temp_storage_bytes, \ d_cdp_error, \ d_temp_storage, \ temp_storage_bytes, \ d_in, \ d_out, \ num_items, \ max_segments, \ d_segment_begin_offsets, \ d_segment_end_offsets, \ reduction_op); \ } DEFINE_CDP_DISPATCHER(CDP, CUB) DEFINE_CDP_DISPATCHER(CDP_SEGMENTED, CUB_SEGMENTED) #undef DEFINE_CDP_DISPATCHER #endif // TEST_CDP //--------------------------------------------------------------------- // Problem generation //--------------------------------------------------------------------- /// Initialize problem template void Initialize( GenMode gen_mode, InputT *h_in, int num_items) { for (int i = 0; i < num_items; ++i) { InitValue(gen_mode, h_in[i], i); } if (g_verbose_input) { printf("Input:\n"); DisplayResults(h_in, num_items); printf("\n\n"); } } /// Solve problem (max/custom-max functor) template struct Solution { using OutputT = _OutputT; using InitT = OutputT; using AccumT = cub::detail::accumulator_t; template static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, BeginOffsetIteratorT h_segment_begin_offsets, EndOffsetIteratorT h_segment_end_offsets, ReductionOpT reduction_op) { for (int i = 0; i < num_segments; ++i) { AccumT aggregate = Traits::Lowest(); // replace with std::numeric_limits::lowest() when C++ support is more prevalent for (int j = h_segment_begin_offsets[i]; j < h_segment_end_offsets[i]; ++j) aggregate = reduction_op(aggregate, OutputT(h_in[j])); h_reference[i] = aggregate; } } }; /// Solve problem (min functor) template struct Solution { using OutputT = _OutputT; using InitT = OutputT; using AccumT = cub::detail::accumulator_t; template static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, BeginOffsetIteratorT h_segment_begin_offsets, EndOffsetIteratorT h_segment_end_offsets, cub::Min reduction_op) { for (int i = 0; i < num_segments; ++i) { AccumT aggregate = Traits::Max(); // replace with std::numeric_limits::max() when C++ support is more prevalent for (int j = h_segment_begin_offsets[i]; j < h_segment_end_offsets[i]; ++j) aggregate = reduction_op(aggregate, OutputT(h_in[j])); h_reference[i] = aggregate; } } }; /// Solve problem (sum functor) template struct Solution { using OutputT = _OutputT; using InitT = OutputT; using AccumT = cub::detail::accumulator_t; template static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, BeginOffsetIteratorT h_segment_begin_offsets, EndOffsetIteratorT h_segment_end_offsets, ReductionOpT reduction_op) { for (int i = 0; i < num_segments; ++i) { AccumT aggregate{}; for (int j = h_segment_begin_offsets[i]; j < h_segment_end_offsets[i]; ++j) aggregate = reduction_op(aggregate, h_in[j]); h_reference[i] = static_cast(aggregate); } } }; template struct Solution { using OutputT = _OutputT; using InitT = OutputT; using AccumT = cub::detail::accumulator_t; template static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, BeginOffsetIteratorT h_segment_begin_offsets, EndOffsetIteratorT h_segment_end_offsets, ReductionOpT reduction_op) { for (int i = 0; i < num_segments; ++i) { AccumT aggregate{}; for (int j = h_segment_begin_offsets[i]; j < h_segment_end_offsets[i]; ++j) aggregate = reduction_op(aggregate, h_in[j]); h_reference[i] = static_cast(aggregate); } } }; /// Solve problem (argmin functor) template struct Solution { typedef KeyValuePair OutputT; template static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, BeginOffsetIteratorT h_segment_begin_offsets, EndOffsetIteratorT h_segment_end_offsets, cub::ArgMin reduction_op) { for (int i = 0; i < num_segments; ++i) { const auto segment_begin = h_segment_begin_offsets[i]; const auto segment_end = h_segment_end_offsets[i]; if (segment_begin < segment_end) { OutputT aggregate(0, OutputValueT(h_in[segment_begin])); for (int j = segment_begin + 1; j < segment_end; ++j) { OutputT item(j - segment_begin, OutputValueT(h_in[j])); aggregate = reduction_op(aggregate, item); } h_reference[i] = aggregate; } else { // Guaranteed output for empty segments OutputT aggregate(1, Traits::Max()); // replace with std::numeric_limits::max() when C++ support is more prevalent h_reference[i] = aggregate; } } } }; /// Solve problem (argmax functor) template struct Solution { typedef KeyValuePair OutputT; template static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, BeginOffsetIteratorT h_segment_begin_offsets, EndOffsetIteratorT h_segment_end_offsets, cub::ArgMax reduction_op) { for (int i = 0; i < num_segments; ++i) { const auto segment_begin = h_segment_begin_offsets[i]; const auto segment_end = h_segment_end_offsets[i]; if (segment_begin < segment_end) { OutputT aggregate(0, OutputValueT(h_in[segment_begin])); for (int j = segment_begin + 1; j < segment_end; ++j) { OutputT item(j - segment_begin, OutputValueT(h_in[j])); aggregate = reduction_op(aggregate, item); } h_reference[i] = aggregate; } else { OutputT aggregate(1, Traits::Lowest()); h_reference[i] = aggregate; } } } }; //--------------------------------------------------------------------- // Problem generation //--------------------------------------------------------------------- template It unwrap_it(It it) { return it; } #if TEST_HALF_T __half* unwrap_it(half_t *it) { return reinterpret_cast<__half*>(it); } template ConstantInputIterator<__half, OffsetT> unwrap_it(ConstantInputIterator it) { half_t wrapped_val = *it; __half val = wrapped_val.operator __half(); return ConstantInputIterator<__half, OffsetT>(val); } #endif #if TEST_BF_T __nv_bfloat16* unwrap_it(bfloat16_t* it) { return reinterpret_cast<__nv_bfloat16*>(it); } template ConstantInputIterator<__nv_bfloat16, OffsetT> unwrap_it(ConstantInputIterator it) { bfloat16_t wrapped_val = *it; __nv_bfloat16 val = wrapped_val.operator __nv_bfloat16(); return ConstantInputIterator<__nv_bfloat16, OffsetT>(val); } #endif template ()))> std::integral_constant::value> // reference_extended_fp(WrappedItT) { return {}; } ExtendedFloatSum unwrap_op(std::true_type /* extended float */, cub::Sum) // { return {}; } template OpT unwrap_op(std::integral_constant /* base case */, OpT op) { return op; } /// Test DeviceReduce for a given problem input template < typename BackendT, typename DeviceWrappedInputIteratorT, typename DeviceWrappedOutputIteratorT, typename HostReferenceIteratorT, typename OffsetT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT, typename ReductionOpT> void Test( BackendT backend, DeviceWrappedInputIteratorT d_wrapped_in, DeviceWrappedOutputIteratorT d_wrapped_out, OffsetT num_items, OffsetT num_segments, BeginOffsetIteratorT d_segment_begin_offsets, EndOffsetIteratorT d_segment_end_offsets, ReductionOpT reduction_op, HostReferenceIteratorT h_reference) { // Input data types auto d_in = unwrap_it(d_wrapped_in); auto d_out = unwrap_it(d_wrapped_out); // Allocate CDP device arrays for temp storage size and error size_t *d_temp_storage_bytes = NULL; cudaError_t *d_cdp_error = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes, sizeof(size_t) * 1)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error, sizeof(cudaError_t) * 1)); // Inquire temp device storage void *d_temp_storage = NULL; size_t temp_storage_bytes = 0; CubDebugExit(Dispatch(backend, 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, reduction_op)); // Allocate temp device storage CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); // Run warmup/correctness iteration CubDebugExit(Dispatch(backend, 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, reduction_op)); // Check for correctness (and display results, if specified) int compare = CompareDeviceResults(h_reference, d_wrapped_out, num_segments, g_verbose, g_verbose); printf("\t%s", compare ? "FAIL" : "PASS"); // Flush any stdout/stderr fflush(stdout); fflush(stderr); // Performance if (g_timing_iterations > 0) { GpuTimer gpu_timer; gpu_timer.Start(); CubDebugExit(Dispatch(backend, g_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, reduction_op)); gpu_timer.Stop(); float elapsed_millis = gpu_timer.ElapsedMillis(); // Display performance float avg_millis = elapsed_millis / g_timing_iterations; float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f; float giga_bandwidth = giga_rate * sizeof(h_reference[0]); printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% peak", avg_millis, giga_rate, giga_bandwidth, giga_bandwidth / g_device_giga_bandwidth * 100.0); } if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes)); if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error)); if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); // Correctness asserts AssertEquals(0, compare); } template void SolveAndTest(HostInputIteratorT h_in, DeviceInputIteratorT d_in, OffsetT num_items, OffsetT num_segments, BeginOffsetIteratorT h_segment_begin_offsets, EndOffsetIteratorT h_segment_end_offsets, BeginOffsetIteratorT d_segment_begin_offsets, EndOffsetIteratorT d_segment_end_offsets, ReductionOpT wrapped_reduction_op) { auto reduction_op = unwrap_op(reference_extended_fp(d_in), wrapped_reduction_op); using InputValueT = cub::detail::value_t; using SolutionT = Solution; using OutputT = typename SolutionT::OutputT; printf("\n\n%s cub::DeviceReduce<%s> %d items (%s), %d segments\n", BackendToString(BACKEND), typeid(ReductionOpT).name(), num_items, typeid(HostInputIteratorT).name(), num_segments); fflush(stdout); // Allocate and solve solution OutputT *h_reference = new OutputT[num_segments]; SolutionT::Solve(h_in, h_reference, num_segments, h_segment_begin_offsets, h_segment_end_offsets, reduction_op); // Run with discard iterator DiscardOutputIterator discard_itr; Test(Int2Type(), d_in, discard_itr, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, reduction_op, h_reference); // Run with output data OutputT *d_out = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(OutputT) * num_segments)); CubDebugExit(cudaMemset(d_out, 0, sizeof(OutputT) * num_segments)); Test(Int2Type(), d_in, d_out, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, reduction_op, h_reference); // Cleanup if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out)); if (h_reference) delete[] h_reference; } /// Test specific problem type template < Backend BACKEND, typename InputT, typename OutputT, typename OffsetT, typename ReductionOpT> void TestProblem( OffsetT num_items, OffsetT num_segments, GenMode gen_mode, ReductionOpT reduction_op) { printf("\n\nInitializing %d %s->%s (gen mode %d)... ", num_items, typeid(InputT).name(), typeid(OutputT).name(), gen_mode); fflush(stdout); fflush(stdout); // Initialize value data InputT* h_in = new InputT[num_items]; Initialize(gen_mode, h_in, num_items); // Initialize segment data OffsetT *h_segment_offsets = new OffsetT[num_segments + 1]; InitializeSegments(num_items, num_segments, h_segment_offsets, g_verbose_input); // Initialize device data OffsetT *d_segment_offsets = NULL; InputT *d_in = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(InputT) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_segment_offsets, sizeof(OffsetT) * (num_segments + 1))); CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(InputT) * num_items, cudaMemcpyHostToDevice)); CubDebugExit(cudaMemcpy(d_segment_offsets, h_segment_offsets, sizeof(OffsetT) * (num_segments + 1), cudaMemcpyHostToDevice)); SolveAndTest(h_in, d_in, num_items, num_segments, h_segment_offsets, h_segment_offsets + 1, d_segment_offsets, d_segment_offsets + 1, reduction_op); if (h_segment_offsets) delete[] h_segment_offsets; if (d_segment_offsets) CubDebugExit(g_allocator.DeviceFree(d_segment_offsets)); if (h_in) delete[] h_in; if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in)); } /// Test different operators template < Backend BACKEND, typename OutputT, typename HostInputIteratorT, typename DeviceInputIteratorT, typename OffsetT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT> void TestByOp( HostInputIteratorT h_in, DeviceInputIteratorT d_in, OffsetT num_items, OffsetT num_segments, BeginOffsetIteratorT h_segment_begin_offsets, EndOffsetIteratorT h_segment_end_offsets, BeginOffsetIteratorT d_segment_begin_offsets, EndOffsetIteratorT d_segment_end_offsets) { SolveAndTest(h_in, d_in, num_items, num_segments, h_segment_begin_offsets, h_segment_end_offsets, d_segment_begin_offsets, d_segment_end_offsets, CustomMax()); SolveAndTest(h_in, d_in, num_items, num_segments, h_segment_begin_offsets, h_segment_end_offsets, d_segment_begin_offsets, d_segment_end_offsets, Sum()); SolveAndTest(h_in, d_in, num_items, num_segments, h_segment_begin_offsets, h_segment_end_offsets, d_segment_begin_offsets, d_segment_end_offsets, Min()); SolveAndTest(h_in, d_in, num_items, num_segments, h_segment_begin_offsets, h_segment_end_offsets, d_segment_begin_offsets, d_segment_end_offsets, ArgMin()); SolveAndTest(h_in, d_in, num_items, num_segments, h_segment_begin_offsets, h_segment_end_offsets, d_segment_begin_offsets, d_segment_end_offsets, Max()); SolveAndTest(h_in, d_in, num_items, num_segments, h_segment_begin_offsets, h_segment_end_offsets, d_segment_begin_offsets, d_segment_end_offsets, ArgMax()); } template struct TransformFunctor1 { __host__ __device__ __forceinline__ OffsetT operator()(OffsetT offset) const { return offset; } }; template struct TransformFunctor2 { __host__ __device__ __forceinline__ OffsetT operator()(OffsetT offset) const { return offset; } }; /// Test different backends template < typename InputT, typename OutputT, typename OffsetT> void TestByBackend( OffsetT num_items, OffsetT max_segments, GenMode gen_mode) { #if TEST_CDP == 0 constexpr auto NonSegmentedBackend = CUB; constexpr auto SegmentedBackend = CUB_SEGMENTED; #else // TEST_CDP constexpr auto NonSegmentedBackend = CDP; constexpr auto SegmentedBackend = CDP_SEGMENTED; #endif // TEST_CDP // Initialize host data printf("\n\nInitializing %d %s -> %s (gen mode %d)... ", num_items, typeid(InputT).name(), typeid(OutputT).name(), gen_mode); fflush(stdout); InputT *h_in = new InputT[num_items]; OffsetT *h_segment_offsets = new OffsetT[max_segments + 1]; Initialize(gen_mode, h_in, num_items); // Initialize device data InputT *d_in = NULL; OffsetT *d_segment_offsets = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(InputT) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_segment_offsets, sizeof(OffsetT) * (max_segments + 1))); CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(InputT) * num_items, cudaMemcpyHostToDevice)); // // Test single-segment implementations // InitializeSegments(num_items, 1, h_segment_offsets, g_verbose_input); // Page-aligned-input tests TestByOp(h_in, d_in, num_items, 1, h_segment_offsets, h_segment_offsets + 1, (OffsetT*) NULL, (OffsetT*)NULL); // Non-page-aligned-input tests if (num_items > 1) { InitializeSegments(num_items - 1, 1, h_segment_offsets, g_verbose_input); TestByOp(h_in + 1, d_in + 1, num_items - 1, 1, h_segment_offsets, h_segment_offsets + 1, (OffsetT*) NULL, (OffsetT*)NULL); } // // Test segmented implementation // // Right now we assign a single thread block to each segment, so lets keep it to under 128K items per segment int max_items_per_segment = 128000; for (int num_segments = cub::DivideAndRoundUp(num_items, max_items_per_segment); num_segments < max_segments; num_segments = (num_segments * 32) + 1) { // Test with segment pointer InitializeSegments(num_items, num_segments, h_segment_offsets, g_verbose_input); CubDebugExit(cudaMemcpy(d_segment_offsets, h_segment_offsets, sizeof(OffsetT) * (num_segments + 1), cudaMemcpyHostToDevice)); TestByOp(h_in, d_in, num_items, num_segments, h_segment_offsets, h_segment_offsets + 1, d_segment_offsets, d_segment_offsets + 1); // Test with segment iterator typedef CastOp IdentityOpT; IdentityOpT identity_op; TransformInputIterator h_segment_offsets_itr( h_segment_offsets, identity_op); TransformInputIterator d_segment_offsets_itr( d_segment_offsets, identity_op); TestByOp(h_in, d_in, num_items, num_segments, h_segment_offsets_itr, h_segment_offsets_itr + 1, d_segment_offsets_itr, d_segment_offsets_itr + 1); // Test with transform iterators of different types typedef TransformFunctor1 TransformFunctor1T; typedef TransformFunctor2 TransformFunctor2T; TransformInputIterator h_segment_begin_offsets_itr(h_segment_offsets, TransformFunctor1T()); TransformInputIterator h_segment_end_offsets_itr(h_segment_offsets + 1, TransformFunctor2T()); TransformInputIterator d_segment_begin_offsets_itr(d_segment_offsets, TransformFunctor1T()); TransformInputIterator d_segment_end_offsets_itr(d_segment_offsets + 1, TransformFunctor2T()); TestByOp(h_in, d_in, num_items, num_segments, h_segment_begin_offsets_itr, h_segment_end_offsets_itr, d_segment_begin_offsets_itr, d_segment_end_offsets_itr); } if (h_in) delete[] h_in; if (h_segment_offsets) delete[] h_segment_offsets; if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in)); if (d_segment_offsets) CubDebugExit(g_allocator.DeviceFree(d_segment_offsets)); } /// Test different input-generation modes template < typename InputT, typename OutputT, typename OffsetT> void TestByGenMode( OffsetT num_items, OffsetT max_segments) { // // Test pointer support using different input-generation modes // TestByBackend(num_items, max_segments, UNIFORM); TestByBackend(num_items, max_segments, INTEGER_SEED); TestByBackend(num_items, max_segments, RANDOM); // // Test iterator support using a constant-iterator and SUM // InputT val; InitValue(UNIFORM, val, 0); ConstantInputIterator in(val); OffsetT *h_segment_offsets = new OffsetT[1 + 1]; InitializeSegments(num_items, 1, h_segment_offsets, g_verbose_input); #if TEST_CDP == 0 constexpr auto Backend = CUB; #else // TEST_CDP constexpr auto Backend = CDP; #endif // TEST_CDP SolveAndTest(in, in, num_items, 1, h_segment_offsets, h_segment_offsets + 1, (OffsetT*) NULL, (OffsetT*)NULL, Sum()); if (h_segment_offsets) delete[] h_segment_offsets; } /// Test different problem sizes template void TestBySize(OffsetT max_items, OffsetT max_segments, OffsetT tile_size) { // Test 0, 1, many TestByGenMode(0, max_segments); TestByGenMode(1, max_segments); TestByGenMode(max_items, max_segments); // Test random problem sizes from a log-distribution [8, max_items-ish) int num_iterations = 8; double max_exp = log(double(max_items)) / log(double(2.0)); for (int i = 0; i < num_iterations; ++i) { OffsetT num_items = (OffsetT)pow(2.0, RandomValue(max_exp - 3.0) + 3.0); TestByGenMode(num_items, max_segments); } // // White-box testing of single-segment problems around specific sizes // #if TEST_CDP == 0 constexpr auto Backend = CUB; #else // TEST_CDP constexpr auto Backend = CDP; #endif // TEST_CDP // Tile-boundaries: multiple blocks, one tile per block TestProblem(tile_size * 4, 1, RANDOM, Sum()); TestProblem(tile_size * 4 + 1, 1, RANDOM, Sum()); TestProblem(tile_size * 4 - 1, 1, RANDOM, Sum()); // Tile-boundaries: multiple blocks, multiple tiles per block OffsetT sm_occupancy = 32; OffsetT occupancy = tile_size * sm_occupancy * g_sm_count; TestProblem(occupancy, 1, RANDOM, Sum()); TestProblem(occupancy + 1, 1, RANDOM, Sum()); TestProblem(occupancy - 1, 1, RANDOM, Sum()); }; class CustomInputT { char m_val{}; public: __host__ __device__ explicit CustomInputT(char val) : m_val(val) {} __host__ __device__ int get() const { return static_cast(m_val); } }; class CustomAccumulatorT { int m_val{0}; int m_magic_value{42}; __host__ __device__ CustomAccumulatorT(int val) : m_val(val) {} public: __host__ __device__ CustomAccumulatorT() {} __host__ __device__ CustomAccumulatorT(const CustomAccumulatorT &in) : m_val(in.is_valid() * in.get()) , m_magic_value(in.is_valid() * 42) {} __host__ __device__ void operator=(const CustomInputT &in) { if (this->is_valid()) { m_val = in.get(); } } __host__ __device__ void operator=(const CustomAccumulatorT &in) { if (this->is_valid() && in.is_valid()) { m_val = in.get(); } } __host__ __device__ CustomAccumulatorT operator+(const CustomInputT &in) const { const int multiplier = this->is_valid(); return {(m_val + in.get()) * multiplier}; } __host__ __device__ CustomAccumulatorT operator+(const CustomAccumulatorT &in) const { const int multiplier = this->is_valid() && in.is_valid(); return {(m_val + in.get()) * multiplier}; } __host__ __device__ int get() const { return m_val; } __host__ __device__ bool is_valid() const { return m_magic_value == 42; } }; class CustomOutputT { bool *m_d_flag{}; int m_expected{}; public: __host__ __device__ CustomOutputT(bool *d_flag, int expected) : m_d_flag(d_flag) , m_expected(expected) {} __host__ __device__ void operator=(const CustomAccumulatorT &accum) const { *m_d_flag = accum.is_valid() && (accum.get() == m_expected); } }; __global__ void InitializeTestAccumulatorTypes(int num_items, int expected, bool *d_flag, CustomInputT *d_in, CustomOutputT *d_out) { const int idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); if (idx < num_items) { d_in[idx] = CustomInputT(1); } if (idx == 0) { *d_out = CustomOutputT{d_flag, expected}; } } template void TestBigIndicesHelper(OffsetT num_items) { thrust::constant_iterator const_iter(T{1}); thrust::device_vector out(1); std::size_t* d_out = thrust::raw_pointer_cast(out.data()); std::uint8_t *d_temp_storage{}; std::size_t temp_storage_bytes{}; CubDebugExit( cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, const_iter, d_out, num_items)); thrust::device_vector temp_storage(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); CubDebugExit( cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, const_iter, d_out, num_items)); std::size_t result = out[0]; AssertEquals(result, num_items); } template void TestBigIndices() { TestBigIndicesHelper(1ull << 30); TestBigIndicesHelper(1ull << 31); TestBigIndicesHelper((1ull << 32) - 1); TestBigIndicesHelper(1ull << 33); } #if TEST_TYPES == 3 void TestAccumulatorTypes() { const int num_items = 2 * 1024 * 1024; const int expected = num_items; const int block_size = 256; const int grid_size = (num_items + block_size - 1) / block_size; CustomInputT *d_in{}; CustomOutputT *d_out{}; CustomAccumulatorT init{}; bool *d_flag{}; CubDebugExit( g_allocator.DeviceAllocate((void **)&d_out, sizeof(CustomOutputT))); CubDebugExit(g_allocator.DeviceAllocate((void **)&d_flag, sizeof(bool))); CubDebugExit(g_allocator.DeviceAllocate((void **)&d_in, sizeof(CustomInputT) * num_items)); InitializeTestAccumulatorTypes<<>>(num_items, expected, d_flag, d_in, d_out); std::uint8_t *d_temp_storage{}; std::size_t temp_storage_bytes{}; CubDebugExit(cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, cub::Sum{}, init)); CubDebugExit( g_allocator.DeviceAllocate((void **)&d_temp_storage, temp_storage_bytes)); CubDebugExit(cudaMemset(d_temp_storage, 1, temp_storage_bytes)); CubDebugExit(cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, cub::Sum{}, init)); bool ok{}; CubDebugExit(cudaMemcpy(&ok, d_flag, sizeof(bool), cudaMemcpyDeviceToHost)); AssertTrue(ok); CubDebugExit(g_allocator.DeviceFree(d_out)); CubDebugExit(g_allocator.DeviceFree(d_in)); } /** * ArgMin should return max value for empty input. This interferes with * input data containing infinity values. This test checks that ArgMin * works correctly with infinity values. */ void TestFloatInfInArgMin() { using in_t = float; using offset_t = int; using out_t = cub::KeyValuePair; const int n = 10; const float inf = ::cuda::std::numeric_limits::infinity(); thrust::device_vector in(n, inf); thrust::device_vector out(1); const in_t *d_in = thrust::raw_pointer_cast(in.data()); out_t *d_out = thrust::raw_pointer_cast(out.data()); std::uint8_t *d_temp_storage{}; std::size_t temp_storage_bytes{}; CubDebugExit(cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, n)); thrust::device_vector temp_storage(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); CubDebugExit(cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, n)); const out_t result = out[0]; AssertEquals(result.key, 0); AssertEquals(result.value, inf); } /** * ArgMax should return lowest value for empty input. This interferes with * input data containing infinity values. This test checks that ArgMax * works correctly with infinity values. */ void TestFloatInfInArgMax() { using in_t = float; using offset_t = int; using out_t = cub::KeyValuePair; const int n = 10; const float inf = ::cuda::std::numeric_limits::infinity(); thrust::device_vector in(n, -inf); thrust::device_vector out(1); const in_t *d_in = thrust::raw_pointer_cast(in.data()); out_t *d_out = thrust::raw_pointer_cast(out.data()); std::uint8_t *d_temp_storage{}; std::size_t temp_storage_bytes{}; CubDebugExit(cub::DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, n)); thrust::device_vector temp_storage(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); CubDebugExit(cub::DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, n)); const out_t result = out[0]; AssertEquals(result.key, 0); AssertEquals(result.value, -inf); } void TestFloatInfInArg() { TestFloatInfInArgMin(); TestFloatInfInArgMax(); } #endif template struct GetTileSize { OffsetT max_items{}; OffsetT max_segments{}; OffsetT tile_size{}; GetTileSize(OffsetT max_items, OffsetT max_segments) : max_items(max_items) , max_segments(max_segments) {} template CUB_RUNTIME_FUNCTION cudaError_t Invoke() { this->tile_size = ActivePolicyT::ReducePolicy::BLOCK_THREADS * ActivePolicyT::ReducePolicy::ITEMS_PER_THREAD; return cudaSuccess; } }; /// Test problem type template void TestType(OffsetT max_items, OffsetT max_segments) { // Inspect the tuning policies to determine this arch's tile size: using MaxPolicyT = typename DeviceReducePolicy::MaxPolicy; GetTileSize dispatch(max_items, max_segments); CubDebugExit(MaxPolicyT::Invoke(g_ptx_version, dispatch)); TestBySize(max_items, max_segments, dispatch.tile_size); } //--------------------------------------------------------------------- // Main //--------------------------------------------------------------------- /** * Main */ int main(int argc, char** argv) { typedef int OffsetT; OffsetT max_items = 27000000; OffsetT max_segments = 34000; // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); g_verbose_input = args.CheckCmdLineFlag("v2"); args.GetCmdLineArgument("n", max_items); args.GetCmdLineArgument("s", max_segments); args.GetCmdLineArgument("i", g_timing_iterations); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--n= " "[--s= " "[--i= " "[--device=] " "[--v] " "\n", argv[0]); exit(0); } // Initialize device CubDebugExit(args.DeviceInit()); g_device_giga_bandwidth = args.device_giga_bandwidth; // Get ptx version CubDebugExit(PtxVersion(g_ptx_version)); // Get SM count g_sm_count = args.deviceProp.multiProcessorCount; // %PARAM% TEST_CDP cdp 0:1 // %PARAM% TEST_TYPES types 0:1:2:3:4 #if TEST_TYPES == 0 TestType(max_items, max_segments); TestType(max_items, max_segments); TestType(max_items, max_segments); #elif TEST_TYPES == 1 TestType(max_items, max_segments); TestType(max_items, max_segments); TestType(max_items, max_segments); TestType(max_items, max_segments); #elif TEST_TYPES == 2 TestType(max_items, max_segments); TestType(max_items, max_segments); TestType(max_items, max_segments); TestType(max_items, max_segments); #elif TEST_TYPES == 3 TestType(max_items, max_segments); TestAccumulatorTypes(); TestFloatInfInArg(); #if TEST_HALF_T TestType(max_items, max_segments); #endif #else // TEST_TYPES == 4 TestType(max_items, max_segments); TestBigIndices(); #if TEST_BF_T TestType(max_items, max_segments); #endif #endif printf("\n"); return 0; }