File size: 69,895 Bytes

8ae5fc5

/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 * Test of DeviceReduce utilities
 ******************************************************************************/

// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR

#include <cub/device/device_reduce.cuh>
#include <cub/device/device_segmented_reduce.cuh>
#include <cub/iterator/constant_input_iterator.cuh>
#include <cub/iterator/discard_output_iterator.cuh>
#include <cub/iterator/transform_input_iterator.cuh>
#include <cub/thread/thread_operators.cuh>
#include <cub/util_allocator.cuh>
#include <cub/util_math.cuh>
#include <cub/util_type.cuh>

#include <thrust/device_vector.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

#include <cstdio>
#include <limits>
#include <type_traits>
#include <typeinfo>

#include "test_util.h"
#include <nv/target>

#define TEST_HALF_T !_NVHPC_CUDA

#define TEST_BF_T !_NVHPC_CUDA

#if TEST_HALF_T
#include <cuda_fp16.h>

// Half support is provided by SM53+. We currently test against a few older architectures. 
// The specializations below can be removed once we drop these architectures.  
namespace cub {

/**
 * cub::Min for a pair of `__half` lvalues.
 *
 * Where the target provides SM53 the operands are compared as `__half`;
 * otherwise both are widened with `__half2float` before CUB_MIN is applied
 * and the result converts back to `__half` on return.
 */
template <>
__host__ __device__ __forceinline__ //
__half Min::operator()(__half& a, __half& b) const
{
    NV_IF_TARGET(NV_PROVIDES_SM_53,
                 (return CUB_MIN(a, b);),
                 (return CUB_MIN(__half2float(a), __half2float(b));));
}

/**
 * cub::ArgMin for `KeyValuePair<int, __half>`.
 *
 * Values are always compared as `float`. `b` is returned when its value is
 * strictly smaller, or when the values are equal and `b` has the smaller
 * key; in every other case `a` is returned.
 */
template <>
__host__ __device__ __forceinline__ //
KeyValuePair<int, __half>
ArgMin::operator()(const KeyValuePair<int, __half> &a,
                   const KeyValuePair<int, __half> &b) const
{
    const float lhs = __half2float(a.value);
    const float rhs = __half2float(b.value);

    const bool prefer_b = (rhs < lhs) || ((lhs == rhs) && (b.key < a.key));

    return prefer_b ? b : a;
}

/**
 * cub::Max for a pair of `__half` lvalues.
 *
 * Where the target provides SM53 the operands are compared as `__half`;
 * otherwise both are widened with `__half2float` before CUB_MAX is applied
 * and the result converts back to `__half` on return.
 */
template <>
__host__ __device__ __forceinline__ //
__half Max::operator()(__half& a, __half& b) const
{
    NV_IF_TARGET(NV_PROVIDES_SM_53,
                 (return CUB_MAX(a, b);),
                 (return CUB_MAX(__half2float(a), __half2float(b));));
}

/**
 * cub::ArgMax for `KeyValuePair<int, __half>`.
 *
 * Values are always compared as `float`. `b` is returned when its value is
 * strictly larger, or when the values are equal and `b` has the smaller
 * key; in every other case `a` is returned.
 */
template <>
__host__ __device__ __forceinline__ //
KeyValuePair<int, __half>
ArgMax::operator()(const KeyValuePair<int, __half> &a,
                   const KeyValuePair<int, __half> &b) const
{
    const float lhs = __half2float(a.value);
    const float rhs = __half2float(b.value);

    const bool prefer_b = (rhs > lhs) || ((lhs == rhs) && (b.key < a.key));

    return prefer_b ? b : a;
}

} // namespace cub
#endif

#if TEST_BF_T
#include <cuda_bf16.h>
#endif

using namespace cub;


//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------

// NOTE(review): the three device-property globals below are not assigned in
// the visible part of this file; presumably they are filled in by main().
int g_ptx_version = 0;                 // PTX version (set elsewhere)
int g_sm_count = 0;                    // SM count (set elsewhere)
double g_device_giga_bandwidth = 0.0;  // device bandwidth figure (set elsewhere)

bool g_verbose = false;                // verbose-output switch
bool g_verbose_input = false;          // when true, Initialize() prints the generated input
int g_timing_iterations = 0;           // timing-iteration count

// Constructed with `true`; presumably CUB's `skip_cleanup` flag — TODO confirm.
CachingDeviceAllocator g_allocator(true);


// Dispatch backends. Each value is used as an Int2Type<> tag that selects a
// family of Dispatch overloads.
enum Backend
{
    CUB           = 0, // CUB method (cub::DeviceReduce)
    CUB_SEGMENTED = 1, // CUB segmented method (cub::DeviceSegmentedReduce)
    CDP           = 2, // GPU-based (dynamic parallelism) dispatch to CUB method
    CDP_SEGMENTED = 3, // GPU-based dispatch to CUB segmented method
};

/// Maps a Backend value to its enumerator spelling.
/// Any value that is not one of the four enumerators yields the empty string.
inline const char* BackendToString(Backend b)
{
  if (b == CUB)
  {
    return "CUB";
  }

  if (b == CUB_SEGMENTED)
  {
    return "CUB_SEGMENTED";
  }

  if (b == CDP)
  {
    return "CDP";
  }

  if (b == CDP_SEGMENTED)
  {
    return "CDP_SEGMENTED";
  }

  return "";
}

// User-defined maximum functor used to exercise the generic Reduce entrypoints
struct CustomMax
{
    /// Generic overload: evaluates CUB_MAX(a, b). The return type is the
    /// accumulator type cub::Max would produce for the same operand types.
    template <typename LhsT, typename RhsT>
    __host__ __device__ auto operator()(LhsT&& a, RhsT&& b)
      -> cub::detail::accumulator_t<cub::Max, LhsT, RhsT>
    {
        return CUB_MAX(a, b);
    }

#if TEST_HALF_T
    /// `__half` overload: forwards to cub::Max instead of using CUB_MAX directly.
    __host__ __device__ __half operator()(__half& a, __half& b)
    {
        cub::Max max_op{};
        return max_op(a, b);
    }
#endif
};

// Comparing results computed on CPU and GPU for extended floating point types is impossible. 
// For instance, when used with a constant iterator of two, the accumulator in sequential reference 
// computation (CPU) bumps into the 4096 limits, which will never change (`4096 + 2 = 4096`). 
// Meanwhile, per-thread aggregates (`2 * 16 = 32`) are accumulated within and among thread blocks, 
// yielding `inf` as a result. No reasonable epsilon can be selected to compare `inf` with `4096`. 
// To make `__half` and `__nv_bfloat16` arithmetic associative, the function object below raises 
// extended floating points to the area of unsigned short integers. This allows us to test large 
// inputs with few code-path differences in device algorithms. 
struct ExtendedFloatSum
{
    /// Adds the operands' raw() values and stores the sum in the `__x`
    /// member of a value-initialized T.
    /// NOTE(review): `__x` is presumably a 16-bit unsigned field, so the sum
    /// would wrap modulo 2^16 — confirm against half_t / bfloat16_t.
    template <class T>
    __host__ __device__ T operator()(T a, T b) const
    {
        T sum{};
        sum.__x = a.raw() + b.raw();
        return sum;
    }

#if TEST_HALF_T
    /// `__half` overload: wraps both operands in half_t, sums them through
    /// the generic overload, and reinterprets the raw bits as `__half`.
    __host__ __device__ __half operator()(__half a, __half b) const
    {
        half_t   wrapped_sum = this->operator()(half_t{a}, half_t(b));
        uint16_t bits        = wrapped_sum.raw();
        return reinterpret_cast<__half &>(bits);
    }
#endif

#if TEST_BF_T
    /// `__nv_bfloat16` overload: same scheme as the `__half` overload, going
    /// through bfloat16_t.
    /// NOTE(review): device-only, unlike the `__half` overload — confirm
    /// this is intentional.
    __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const
    {
        bfloat16_t wrapped_sum = this->operator()(bfloat16_t{a}, bfloat16_t(b));
        uint16_t   bits        = wrapped_sum.raw();
        return reinterpret_cast<__nv_bfloat16 &>(bits);
    }
#endif
};

//---------------------------------------------------------------------
// Dispatch to different CUB DeviceReduce entrypoints
//---------------------------------------------------------------------

/**
 * Dispatch to reduce entrypoint (custom-max)
 *
 * Calls cub::DeviceReduce::Reduce up to `timing_iterations` times with the
 * caller's `reduction_op`, using Traits<InputT>::Lowest() as the initial
 * value. The loop stops at the first call that does not return cudaSuccess,
 * so a failure is reported instead of being overwritten by a later
 * iteration.
 *
 * The two leading pointer arguments, `max_segments` and the segment-offset
 * iterators are not used by this overload.
 *
 * @param timing_iterations   Maximum number of back-to-back invocations
 * @param d_temp_storage      Temporary storage forwarded to CUB
 * @param temp_storage_bytes  Temporary storage size forwarded to CUB (by reference)
 * @param d_in                Input iterator
 * @param d_out               Output iterator
 * @param num_items           Number of input items
 * @param reduction_op        Binary reduction functor
 * @return cudaSuccess when every executed call succeeded (or none ran),
 *         otherwise the status of the first failing call
 */
template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT, typename ReductionOpT>
CUB_RUNTIME_FUNCTION
cudaError_t Dispatch(
    Int2Type<CUB>        /*dispatch_to*/,
    int                  timing_iterations,
    size_t*              /*d_temp_storage_bytes*/,
    cudaError_t*         /*d_cdp_error*/,

    void*                d_temp_storage,
    size_t&              temp_storage_bytes,
    InputIteratorT       d_in,
    OutputIteratorT      d_out,
    int                  num_items,
    int                  /*max_segments*/,
    BeginOffsetIteratorT /*d_segment_begin_offsets*/,
    EndOffsetIteratorT   /*d_segment_end_offsets*/,
    ReductionOpT         reduction_op)
{
    // The input value type
    using InputT = cub::detail::value_t<InputIteratorT>;

    // The output value type
    using OutputT = cub::detail::non_void_value_t<OutputIteratorT, InputT>;

    // Max-identity
    OutputT identity = Traits<InputT>::Lowest(); // replace with std::numeric_limits<OutputT>::lowest() when C++ support is more prevalent

    // Invoke device reduction directly; stop at the first failure so it is not masked
    cudaError_t error = cudaSuccess;
    for (int i = 0; (i < timing_iterations) && (error == cudaSuccess); ++i)
    {
        error = DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes,
            d_in, d_out, num_items, reduction_op, identity);
    }

    return error;
}

/**
 * Dispatch to sum entrypoint
 *
 * Calls cub::DeviceReduce::Sum up to `timing_iterations` times. The
 * `cub::Sum` argument only selects this overload. The loop stops at the
 * first call that does not return cudaSuccess, so a failure is reported
 * instead of being overwritten by a later iteration.
 *
 * The two leading pointer arguments, `max_segments` and the segment-offset
 * iterators are not used by this overload.
 *
 * @param timing_iterations   Maximum number of back-to-back invocations
 * @param d_temp_storage      Temporary storage forwarded to CUB
 * @param temp_storage_bytes  Temporary storage size forwarded to CUB (by reference)
 * @param d_in                Input iterator
 * @param d_out               Output iterator
 * @param num_items           Number of input items
 * @return cudaSuccess when every executed call succeeded (or none ran),
 *         otherwise the status of the first failing call
 */
template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
CUB_RUNTIME_FUNCTION
cudaError_t Dispatch(
    Int2Type<CUB>         /*dispatch_to*/,
    int                   timing_iterations,
    size_t*               /*d_temp_storage_bytes*/,
    cudaError_t*          /*d_cdp_error*/,

    void*                 d_temp_storage,
    size_t&               temp_storage_bytes,
    InputIteratorT        d_in,
    OutputIteratorT       d_out,
    int                   num_items,
    int                   /*max_segments*/,
    BeginOffsetIteratorT  /*d_segment_begin_offsets*/,
    EndOffsetIteratorT    /*d_segment_end_offsets*/,
    cub::Sum              /*reduction_op*/)
{
    // Invoke device reduction directly; stop at the first failure so it is not masked
    cudaError_t error = cudaSuccess;
    for (int i = 0; (i < timing_iterations) && (error == cudaSuccess); ++i)
    {
        error = DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
    }

    return error;
}

/**
 * Dispatch to extended fp sum entrypoint
 *
 * Calls cub::DeviceReduce::Reduce up to `timing_iterations` times with the
 * supplied ExtendedFloatSum functor and a value-initialized OutputT as the
 * initial value. The loop stops at the first call that does not return
 * cudaSuccess, so a failure is reported instead of being overwritten by a
 * later iteration.
 *
 * The two leading pointer arguments, `max_segments` and the segment-offset
 * iterators are not used by this overload.
 *
 * @param timing_iterations   Maximum number of back-to-back invocations
 * @param d_temp_storage      Temporary storage forwarded to CUB
 * @param temp_storage_bytes  Temporary storage size forwarded to CUB (by reference)
 * @param d_in                Input iterator
 * @param d_out               Output iterator
 * @param num_items           Number of input items
 * @param reduction_op        ExtendedFloatSum functor forwarded to CUB
 * @return cudaSuccess when every executed call succeeded (or none ran),
 *         otherwise the status of the first failing call
 */
template <typename InputIteratorT,
          typename OutputIteratorT,
          typename BeginOffsetIteratorT,
          typename EndOffsetIteratorT>
CUB_RUNTIME_FUNCTION cudaError_t Dispatch(Int2Type<CUB> /*dispatch_to*/,
                                          int timing_iterations,
                                          size_t * /*d_temp_storage_bytes*/,
                                          cudaError_t * /*d_cdp_error*/,

                                          void *d_temp_storage,
                                          size_t &temp_storage_bytes,
                                          InputIteratorT d_in,
                                          OutputIteratorT d_out,
                                          int num_items,
                                          int /*max_segments*/,
                                          BeginOffsetIteratorT /*d_segment_begin_offsets*/,
                                          EndOffsetIteratorT /*d_segment_end_offsets*/,
                                          ExtendedFloatSum reduction_op)
{
    using InputT  = cub::detail::value_t<InputIteratorT>;
    using OutputT = cub::detail::non_void_value_t<OutputIteratorT, InputT>;

    // Value-initialized initial value for the sum
    OutputT identity{};

    // Invoke device reduction directly; stop at the first failure so it is not masked
    cudaError_t error = cudaSuccess;
    for (int i = 0; (i < timing_iterations) && (error == cudaSuccess); ++i)
    {
        error = DeviceReduce::Reduce(d_temp_storage,
                                     temp_storage_bytes,
                                     d_in,
                                     d_out,
                                     num_items,
                                     reduction_op,
                                     identity);
    }

    return error;
}

/**
 * Dispatch to min entrypoint
 *
 * Calls cub::DeviceReduce::Min up to `timing_iterations` times. The
 * `cub::Min` argument only selects this overload. The loop stops at the
 * first call that does not return cudaSuccess, so a failure is reported
 * instead of being overwritten by a later iteration.
 *
 * The two leading pointer arguments, `max_segments` and the segment-offset
 * iterators are not used by this overload.
 *
 * @param timing_iterations   Maximum number of back-to-back invocations
 * @param d_temp_storage      Temporary storage forwarded to CUB
 * @param temp_storage_bytes  Temporary storage size forwarded to CUB (by reference)
 * @param d_in                Input iterator
 * @param d_out               Output iterator
 * @param num_items           Number of input items
 * @return cudaSuccess when every executed call succeeded (or none ran),
 *         otherwise the status of the first failing call
 */
template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
CUB_RUNTIME_FUNCTION
cudaError_t Dispatch(
    Int2Type<CUB>        /*dispatch_to*/,
    int                  timing_iterations,
    size_t*              /*d_temp_storage_bytes*/,
    cudaError_t*         /*d_cdp_error*/,

    void*                d_temp_storage,
    size_t&              temp_storage_bytes,
    InputIteratorT       d_in,
    OutputIteratorT      d_out,
    int                  num_items,
    int                  /*max_segments*/,
    BeginOffsetIteratorT /*d_segment_begin_offsets*/,
    EndOffsetIteratorT   /*d_segment_end_offsets*/,
    cub::Min             /*reduction_op*/)
{
    // Invoke device reduction directly; stop at the first failure so it is not masked
    cudaError_t error = cudaSuccess;
    for (int i = 0; (i < timing_iterations) && (error == cudaSuccess); ++i)
    {
        error = DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
    }

    return error;
}

/**
 * Dispatch to max entrypoint
 *
 * Calls cub::DeviceReduce::Max up to `timing_iterations` times. The
 * `cub::Max` argument only selects this overload. The loop stops at the
 * first call that does not return cudaSuccess, so a failure is reported
 * instead of being overwritten by a later iteration.
 *
 * The two leading pointer arguments, `max_segments` and the segment-offset
 * iterators are not used by this overload.
 *
 * @param timing_iterations   Maximum number of back-to-back invocations
 * @param d_temp_storage      Temporary storage forwarded to CUB
 * @param temp_storage_bytes  Temporary storage size forwarded to CUB (by reference)
 * @param d_in                Input iterator
 * @param d_out               Output iterator
 * @param num_items           Number of input items
 * @return cudaSuccess when every executed call succeeded (or none ran),
 *         otherwise the status of the first failing call
 */
template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
CUB_RUNTIME_FUNCTION
cudaError_t Dispatch(
    Int2Type<CUB>        /*dispatch_to*/,
    int                  timing_iterations,
    size_t*              /*d_temp_storage_bytes*/,
    cudaError_t*         /*d_cdp_error*/,

    void*                d_temp_storage,
    size_t&              temp_storage_bytes,
    InputIteratorT       d_in,
    OutputIteratorT      d_out,
    int                  num_items,
    int                  /*max_segments*/,
    BeginOffsetIteratorT /*d_segment_begin_offsets*/,
    EndOffsetIteratorT   /*d_segment_end_offsets*/,
    cub::Max             /*reduction_op*/)
{
    // Invoke device reduction directly; stop at the first failure so it is not masked
    cudaError_t error = cudaSuccess;
    for (int i = 0; (i < timing_iterations) && (error == cudaSuccess); ++i)
    {
        error = DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
    }

    return error;
}

/**
 * Dispatch to argmin entrypoint
 *
 * Calls cub::DeviceReduce::ArgMin up to `timing_iterations` times. The
 * `cub::ArgMin` argument only selects this overload. The loop stops at the
 * first call that does not return cudaSuccess, so a failure is reported
 * instead of being overwritten by a later iteration.
 *
 * The two leading pointer arguments, `max_segments` and the segment-offset
 * iterators are not used by this overload.
 *
 * @param timing_iterations   Maximum number of back-to-back invocations
 * @param d_temp_storage      Temporary storage forwarded to CUB
 * @param temp_storage_bytes  Temporary storage size forwarded to CUB (by reference)
 * @param d_in                Input iterator
 * @param d_out               Output iterator
 * @param num_items           Number of input items
 * @return cudaSuccess when every executed call succeeded (or none ran),
 *         otherwise the status of the first failing call
 */
template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
CUB_RUNTIME_FUNCTION
cudaError_t Dispatch(
    Int2Type<CUB>        /*dispatch_to*/,
    int                  timing_iterations,
    size_t*              /*d_temp_storage_bytes*/,
    cudaError_t*         /*d_cdp_error*/,

    void*                d_temp_storage,
    size_t&              temp_storage_bytes,
    InputIteratorT       d_in,
    OutputIteratorT      d_out,
    int                  num_items,
    int                  /*max_segments*/,
    BeginOffsetIteratorT /*d_segment_begin_offsets*/,
    EndOffsetIteratorT   /*d_segment_end_offsets*/,
    cub::ArgMin          /*reduction_op*/)
{
    // Invoke device reduction directly; stop at the first failure so it is not masked
    cudaError_t error = cudaSuccess;
    for (int i = 0; (i < timing_iterations) && (error == cudaSuccess); ++i)
    {
        error = DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
    }

    return error;
}

/**
 * Dispatch to argmax entrypoint
 *
 * Calls cub::DeviceReduce::ArgMax up to `timing_iterations` times. The
 * `cub::ArgMax` argument only selects this overload. The loop stops at the
 * first call that does not return cudaSuccess, so a failure is reported
 * instead of being overwritten by a later iteration.
 *
 * The two leading pointer arguments, `max_segments` and the segment-offset
 * iterators are not used by this overload.
 *
 * @param timing_iterations   Maximum number of back-to-back invocations
 * @param d_temp_storage      Temporary storage forwarded to CUB
 * @param temp_storage_bytes  Temporary storage size forwarded to CUB (by reference)
 * @param d_in                Input iterator
 * @param d_out               Output iterator
 * @param num_items           Number of input items
 * @return cudaSuccess when every executed call succeeded (or none ran),
 *         otherwise the status of the first failing call
 */
template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
CUB_RUNTIME_FUNCTION
cudaError_t Dispatch(
    Int2Type<CUB>        /*dispatch_to*/,
    int                  timing_iterations,
    size_t*              /*d_temp_storage_bytes*/,
    cudaError_t*         /*d_cdp_error*/,

    void*                d_temp_storage,
    size_t&              temp_storage_bytes,
    InputIteratorT       d_in,
    OutputIteratorT      d_out,
    int                  num_items,
    int                  /*max_segments*/,
    BeginOffsetIteratorT /*d_segment_begin_offsets*/,
    EndOffsetIteratorT   /*d_segment_end_offsets*/,
    cub::ArgMax          /*reduction_op*/)
{
    // Invoke device reduction directly; stop at the first failure so it is not masked
    cudaError_t error = cudaSuccess;
    for (int i = 0; (i < timing_iterations) && (error == cudaSuccess); ++i)
    {
        error = DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
    }

    return error;
}


//---------------------------------------------------------------------
// Dispatch to different CUB DeviceSegmentedReduce entrypoints
//---------------------------------------------------------------------

/**
 * Dispatch to segmented reduce entrypoint (custom-max)
 *
 * Calls cub::DeviceSegmentedReduce::Reduce up to `timing_iterations` times
 * with the caller's `reduction_op`, using Traits<InputT>::Lowest() as the
 * initial value. The loop stops at the first call that does not return
 * cudaSuccess, so a failure is reported instead of being overwritten by a
 * later iteration.
 *
 * The two leading pointer arguments and `num_items` are not used by this
 * overload.
 *
 * @param timing_iterations        Maximum number of back-to-back invocations
 * @param d_temp_storage           Temporary storage forwarded to CUB
 * @param temp_storage_bytes       Temporary storage size forwarded to CUB (by reference)
 * @param d_in                     Input iterator
 * @param d_out                    Output iterator
 * @param max_segments             Segment count forwarded to CUB
 * @param d_segment_begin_offsets  Segment begin-offset iterator
 * @param d_segment_end_offsets    Segment end-offset iterator
 * @param reduction_op             Binary reduction functor
 * @return cudaSuccess when every executed call succeeded (or none ran),
 *         otherwise the status of the first failing call
 */
template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT, typename ReductionOpT>
CUB_RUNTIME_FUNCTION
cudaError_t Dispatch(
    Int2Type<CUB_SEGMENTED> /*dispatch_to*/,
    int                     timing_iterations,
    size_t*                 /*d_temp_storage_bytes*/,
    cudaError_t*            /*d_cdp_error*/,

    void*                   d_temp_storage,
    size_t&                 temp_storage_bytes,
    InputIteratorT          d_in,
    OutputIteratorT         d_out,
    int                     /*num_items*/,
    int                     max_segments,
    BeginOffsetIteratorT    d_segment_begin_offsets,
    EndOffsetIteratorT      d_segment_end_offsets,
    ReductionOpT            reduction_op)
{
    // The input value type
    using InputT = cub::detail::value_t<InputIteratorT>;

    // The output value type
    using OutputT = cub::detail::non_void_value_t<OutputIteratorT, InputT>;

    // Max-identity
    OutputT identity = Traits<InputT>::Lowest(); // replace with std::numeric_limits<OutputT>::lowest() when C++ support is more prevalent

    // Invoke segmented reduction directly; stop at the first failure so it is not masked
    cudaError_t error = cudaSuccess;
    for (int i = 0; (i < timing_iterations) && (error == cudaSuccess); ++i)
    {
        error = DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes,
            d_in, d_out, max_segments, d_segment_begin_offsets, d_segment_end_offsets, reduction_op, identity);
    }
    return error;
}

/**
 * Dispatch to segmented sum entrypoint
 *
 * Calls cub::DeviceSegmentedReduce::Sum up to `timing_iterations` times.
 * The `cub::Sum` argument only selects this overload. The loop stops at the
 * first call that does not return cudaSuccess, so a failure is reported
 * instead of being overwritten by a later iteration.
 *
 * The two leading pointer arguments and `num_items` are not used by this
 * overload.
 *
 * @param timing_iterations        Maximum number of back-to-back invocations
 * @param d_temp_storage           Temporary storage forwarded to CUB
 * @param temp_storage_bytes       Temporary storage size forwarded to CUB (by reference)
 * @param d_in                     Input iterator
 * @param d_out                    Output iterator
 * @param max_segments             Segment count forwarded to CUB
 * @param d_segment_begin_offsets  Segment begin-offset iterator
 * @param d_segment_end_offsets    Segment end-offset iterator
 * @return cudaSuccess when every executed call succeeded (or none ran),
 *         otherwise the status of the first failing call
 */
template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
CUB_RUNTIME_FUNCTION
cudaError_t Dispatch(
    Int2Type<CUB_SEGMENTED> /*dispatch_to*/,
    int                     timing_iterations,
    size_t*                 /*d_temp_storage_bytes*/,
    cudaError_t*            /*d_cdp_error*/,

    void*                   d_temp_storage,
    size_t&                 temp_storage_bytes,
    InputIteratorT          d_in,
    OutputIteratorT         d_out,
    int                     /*num_items*/,
    int                     max_segments,
    BeginOffsetIteratorT    d_segment_begin_offsets,
    EndOffsetIteratorT      d_segment_end_offsets,
    cub::Sum                /*reduction_op*/)
{
    // Invoke segmented reduction directly; stop at the first failure so it is not masked
    cudaError_t error = cudaSuccess;
    for (int i = 0; (i < timing_iterations) && (error == cudaSuccess); ++i)
    {
        error = DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes,
            d_in, d_out, max_segments, d_segment_begin_offsets, d_segment_end_offsets);
    }
    return error;
}

/**
 * Dispatch to segmented extended fp sum entrypoint
 *
 * Calls cub::DeviceSegmentedReduce::Reduce up to `timing_iterations` times
 * with the supplied ExtendedFloatSum functor and a value-initialized OutputT
 * as the initial value. The loop stops at the first call that does not
 * return cudaSuccess, so a failure is reported instead of being overwritten
 * by a later iteration.
 *
 * The two leading pointer arguments and `num_items` are not used by this
 * overload.
 *
 * @param timing_iterations        Maximum number of back-to-back invocations
 * @param d_temp_storage           Temporary storage forwarded to CUB
 * @param temp_storage_bytes       Temporary storage size forwarded to CUB (by reference)
 * @param d_in                     Input iterator
 * @param d_out                    Output iterator
 * @param max_segments             Segment count forwarded to CUB
 * @param d_segment_begin_offsets  Segment begin-offset iterator
 * @param d_segment_end_offsets    Segment end-offset iterator
 * @param reduction_op             ExtendedFloatSum functor forwarded to CUB
 * @return cudaSuccess when every executed call succeeded (or none ran),
 *         otherwise the status of the first failing call
 */
template <typename InputIteratorT,
          typename OutputIteratorT,
          typename BeginOffsetIteratorT,
          typename EndOffsetIteratorT>
CUB_RUNTIME_FUNCTION cudaError_t Dispatch(Int2Type<CUB_SEGMENTED> /*dispatch_to*/,
                                          int timing_iterations,
                                          size_t * /*d_temp_storage_bytes*/,
                                          cudaError_t * /*d_cdp_error*/,

                                          void *d_temp_storage,
                                          size_t &temp_storage_bytes,
                                          InputIteratorT d_in,
                                          OutputIteratorT d_out,
                                          int /*num_items*/,
                                          int max_segments,
                                          BeginOffsetIteratorT d_segment_begin_offsets,
                                          EndOffsetIteratorT d_segment_end_offsets,
                                          ExtendedFloatSum reduction_op)
{
    using InputT  = cub::detail::value_t<InputIteratorT>;
    using OutputT = cub::detail::non_void_value_t<OutputIteratorT, InputT>;

    // Value-initialized initial value for the sum
    OutputT identity{};

    // Invoke segmented reduction directly; stop at the first failure so it is not masked
    cudaError_t error = cudaSuccess;
    for (int i = 0; (i < timing_iterations) && (error == cudaSuccess); ++i)
    {
        error = DeviceSegmentedReduce::Reduce(d_temp_storage,
                                              temp_storage_bytes,
                                              d_in,
                                              d_out,
                                              max_segments,
                                              d_segment_begin_offsets,
                                              d_segment_end_offsets,
                                              reduction_op,
                                              identity);
    }

    return error;
}

/**
 * Dispatch to segmented min entrypoint
 *
 * Calls cub::DeviceSegmentedReduce::Min up to `timing_iterations` times.
 * The `cub::Min` argument only selects this overload. The loop stops at the
 * first call that does not return cudaSuccess, so a failure is reported
 * instead of being overwritten by a later iteration.
 *
 * The two leading pointer arguments and `num_items` are not used by this
 * overload.
 *
 * @param timing_iterations        Maximum number of back-to-back invocations
 * @param d_temp_storage           Temporary storage forwarded to CUB
 * @param temp_storage_bytes       Temporary storage size forwarded to CUB (by reference)
 * @param d_in                     Input iterator
 * @param d_out                    Output iterator
 * @param max_segments             Segment count forwarded to CUB
 * @param d_segment_begin_offsets  Segment begin-offset iterator
 * @param d_segment_end_offsets    Segment end-offset iterator
 * @return cudaSuccess when every executed call succeeded (or none ran),
 *         otherwise the status of the first failing call
 */
template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
CUB_RUNTIME_FUNCTION
cudaError_t Dispatch(
    Int2Type<CUB_SEGMENTED> /*dispatch_to*/,
    int                     timing_iterations,
    size_t*                 /*d_temp_storage_bytes*/,
    cudaError_t*            /*d_cdp_error*/,

    void*                   d_temp_storage,
    size_t&                 temp_storage_bytes,
    InputIteratorT          d_in,
    OutputIteratorT         d_out,
    int                     /*num_items*/,
    int                     max_segments,
    BeginOffsetIteratorT    d_segment_begin_offsets,
    EndOffsetIteratorT      d_segment_end_offsets,
    cub::Min                /*reduction_op*/)
{
    // Invoke segmented reduction directly; stop at the first failure so it is not masked
    cudaError_t error = cudaSuccess;
    for (int i = 0; (i < timing_iterations) && (error == cudaSuccess); ++i)
    {
        error = DeviceSegmentedReduce::Min(d_temp_storage, temp_storage_bytes,
            d_in, d_out, max_segments, d_segment_begin_offsets, d_segment_end_offsets);
    }
    return error;
}

/**
 * Dispatch to segmented max entrypoint
 *
 * Calls cub::DeviceSegmentedReduce::Max up to `timing_iterations` times.
 * The `cub::Max` argument only selects this overload. The loop stops at the
 * first call that does not return cudaSuccess, so a failure is reported
 * instead of being overwritten by a later iteration.
 *
 * The two leading pointer arguments and `num_items` are not used by this
 * overload.
 *
 * @param timing_iterations        Maximum number of back-to-back invocations
 * @param d_temp_storage           Temporary storage forwarded to CUB
 * @param temp_storage_bytes       Temporary storage size forwarded to CUB (by reference)
 * @param d_in                     Input iterator
 * @param d_out                    Output iterator
 * @param max_segments             Segment count forwarded to CUB
 * @param d_segment_begin_offsets  Segment begin-offset iterator
 * @param d_segment_end_offsets    Segment end-offset iterator
 * @return cudaSuccess when every executed call succeeded (or none ran),
 *         otherwise the status of the first failing call
 */
template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
CUB_RUNTIME_FUNCTION
cudaError_t Dispatch(
    Int2Type<CUB_SEGMENTED> /*dispatch_to*/,
    int                     timing_iterations,
    size_t*                 /*d_temp_storage_bytes*/,
    cudaError_t*            /*d_cdp_error*/,

    void*                   d_temp_storage,
    size_t&                 temp_storage_bytes,
    InputIteratorT          d_in,
    OutputIteratorT         d_out,
    int                     /*num_items*/,
    int                     max_segments,
    BeginOffsetIteratorT    d_segment_begin_offsets,
    EndOffsetIteratorT      d_segment_end_offsets,
    cub::Max                /*reduction_op*/)
{
    // Invoke segmented reduction directly; stop at the first failure so it is not masked
    cudaError_t error = cudaSuccess;
    for (int i = 0; (i < timing_iterations) && (error == cudaSuccess); ++i)
    {
        error = DeviceSegmentedReduce::Max(d_temp_storage, temp_storage_bytes,
            d_in, d_out, max_segments, d_segment_begin_offsets, d_segment_end_offsets);
    }
    return error;
}

/**
 * Dispatch to segmented argmin entrypoint
 *
 * Calls cub::DeviceSegmentedReduce::ArgMin up to `timing_iterations` times.
 * The `cub::ArgMin` argument only selects this overload. The loop stops at
 * the first call that does not return cudaSuccess, so a failure is reported
 * instead of being overwritten by a later iteration.
 *
 * The two leading pointer arguments and `num_items` are not used by this
 * overload.
 *
 * @param timing_iterations        Maximum number of back-to-back invocations
 * @param d_temp_storage           Temporary storage forwarded to CUB
 * @param temp_storage_bytes       Temporary storage size forwarded to CUB (by reference)
 * @param d_in                     Input iterator
 * @param d_out                    Output iterator
 * @param max_segments             Segment count forwarded to CUB
 * @param d_segment_begin_offsets  Segment begin-offset iterator
 * @param d_segment_end_offsets    Segment end-offset iterator
 * @return cudaSuccess when every executed call succeeded (or none ran),
 *         otherwise the status of the first failing call
 */
template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
CUB_RUNTIME_FUNCTION
cudaError_t Dispatch(
    Int2Type<CUB_SEGMENTED> /*dispatch_to*/,
    int                     timing_iterations,
    size_t*                 /*d_temp_storage_bytes*/,
    cudaError_t*            /*d_cdp_error*/,

    void*                   d_temp_storage,
    size_t&                 temp_storage_bytes,
    InputIteratorT          d_in,
    OutputIteratorT         d_out,
    int                     /*num_items*/,
    int                     max_segments,
    BeginOffsetIteratorT    d_segment_begin_offsets,
    EndOffsetIteratorT      d_segment_end_offsets,
    cub::ArgMin             /*reduction_op*/)
{
    // Invoke segmented reduction directly; stop at the first failure so it is not masked
    cudaError_t error = cudaSuccess;
    for (int i = 0; (i < timing_iterations) && (error == cudaSuccess); ++i)
    {
        error = DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes,
            d_in, d_out, max_segments, d_segment_begin_offsets, d_segment_end_offsets);
    }
    return error;
}

/**
 * Dispatch to segmented argmax entrypoint
 *
 * Calls cub::DeviceSegmentedReduce::ArgMax up to `timing_iterations` times.
 * The `cub::ArgMax` argument only selects this overload. The loop stops at
 * the first call that does not return cudaSuccess, so a failure is reported
 * instead of being overwritten by a later iteration.
 *
 * The two leading pointer arguments and `num_items` are not used by this
 * overload.
 *
 * @param timing_iterations        Maximum number of back-to-back invocations
 * @param d_temp_storage           Temporary storage forwarded to CUB
 * @param temp_storage_bytes       Temporary storage size forwarded to CUB (by reference)
 * @param d_in                     Input iterator
 * @param d_out                    Output iterator
 * @param max_segments             Segment count forwarded to CUB
 * @param d_segment_begin_offsets  Segment begin-offset iterator
 * @param d_segment_end_offsets    Segment end-offset iterator
 * @return cudaSuccess when every executed call succeeded (or none ran),
 *         otherwise the status of the first failing call
 */
template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
CUB_RUNTIME_FUNCTION
cudaError_t Dispatch(
    Int2Type<CUB_SEGMENTED> /*dispatch_to*/,
    int                     timing_iterations,
    size_t*                 /*d_temp_storage_bytes*/,
    cudaError_t*            /*d_cdp_error*/,

    void*                   d_temp_storage,
    size_t&                 temp_storage_bytes,
    InputIteratorT          d_in,
    OutputIteratorT         d_out,
    int                     /*num_items*/,
    int                     max_segments,
    BeginOffsetIteratorT    d_segment_begin_offsets,
    EndOffsetIteratorT      d_segment_end_offsets,
    cub::ArgMax             /*reduction_op*/)
{
    // Invoke segmented reduction directly; stop at the first failure so it is not masked
    cudaError_t error = cudaSuccess;
    for (int i = 0; (i < timing_iterations) && (error == cudaSuccess); ++i)
    {
        error = DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes,
            d_in, d_out, max_segments, d_segment_begin_offsets, d_segment_end_offsets);
    }
    return error;
}


//---------------------------------------------------------------------
// CUDA nested-parallelism test kernel
//---------------------------------------------------------------------

#if TEST_CDP == 1

/**
 * Wrapper kernel that runs a Dispatch overload from device code.
 *
 * Forwards every argument to Dispatch(cub_backend, ...), then stores the
 * returned status in *d_cdp_error and the (possibly updated) temp-storage
 * size in *d_temp_storage_bytes so the host can read both back.
 *
 * `temp_storage_bytes` is taken by value; Dispatch receives this kernel's
 * local copy by reference.
 */
template <int CubBackend,
          typename InputIteratorT,
          typename OutputIteratorT,
          typename BeginOffsetIteratorT,
          typename EndOffsetIteratorT,
          typename ReductionOpT>
__global__ void CDPDispatchKernel(Int2Type<CubBackend> cub_backend,
                                  int                  timing_iterations,
                                  size_t              *d_temp_storage_bytes,
                                  cudaError_t         *d_cdp_error,

                                  void                *d_temp_storage,
                                  size_t               temp_storage_bytes,
                                  InputIteratorT       d_in,
                                  OutputIteratorT      d_out,
                                  int                  num_items,
                                  int                  max_segments,
                                  BeginOffsetIteratorT d_segment_begin_offsets,
                                  EndOffsetIteratorT   d_segment_end_offsets,
                                  ReductionOpT         reduction_op)
{
  const cudaError_t dispatch_status =
    Dispatch(cub_backend, timing_iterations, d_temp_storage_bytes, d_cdp_error,
             d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, max_segments,
             d_segment_begin_offsets, d_segment_end_offsets, reduction_op);

  // Publish the outcome for the host-side launcher
  *d_cdp_error          = dispatch_status;
  *d_temp_storage_bytes = temp_storage_bytes;
}

/**
 * Host-side launcher for CDPDispatchKernel. Should only be called from host
 * code. CubBackend should be one of the non-CDP CUB backends, i.e. the one
 * the kernel is to invoke from the device.
 *
 * Steps: launch the kernel, synchronize the device, then copy the
 * temp-storage size and the device-side status back to the host. Each step
 * is wrapped in CubDebugExit.
 *
 * @param temp_storage_bytes  Overwritten with the value read from
 *                            d_temp_storage_bytes after the kernel ran
 * @return The status the kernel stored in *d_cdp_error
 */
template <int CubBackend,
          typename InputIteratorT,
          typename OutputIteratorT,
          typename BeginOffsetIteratorT,
          typename EndOffsetIteratorT,
          typename ReductionOpT>
cudaError_t LaunchCDPKernel(Int2Type<CubBackend> cub_backend,
                            int                  timing_iterations,
                            size_t              *d_temp_storage_bytes,
                            cudaError_t         *d_cdp_error,

                            void                *d_temp_storage,
                            size_t              &temp_storage_bytes,
                            InputIteratorT       d_in,
                            OutputIteratorT      d_out,
                            int                  num_items,
                            int                  max_segments,
                            BeginOffsetIteratorT d_segment_begin_offsets,
                            EndOffsetIteratorT   d_segment_end_offsets,
                            ReductionOpT         reduction_op)
{
  // triple_chevron(1, 1, 0, 0): presumably one block of one thread, no
  // dynamic shared memory, default stream — confirm against Thrust.
  cudaError_t status =
    thrust::cuda_cub::launcher::triple_chevron(1, 1, 0, 0)
      .doit(CDPDispatchKernel<CubBackend,
                              InputIteratorT,
                              OutputIteratorT,
                              BeginOffsetIteratorT,
                              EndOffsetIteratorT,
                              ReductionOpT>,
            cub_backend, timing_iterations, d_temp_storage_bytes, d_cdp_error,
            d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, max_segments,
            d_segment_begin_offsets, d_segment_end_offsets, reduction_op);
  CubDebugExit(status);
  CubDebugExit(cub::detail::device_synchronize());

  // Read back the temp-storage size written by the kernel
  CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes,
                          sizeof(size_t), cudaMemcpyDeviceToHost));

  // Read back the status produced by the device-side Dispatch call
  CubDebugExit(cudaMemcpy(&status, d_cdp_error,
                          sizeof(cudaError_t), cudaMemcpyDeviceToHost));

  return status;
}

// Dispatch overloads for the CDP backends. Each one maps its CDP backend tag
// to the matching CUB backend tag and hands all arguments to LaunchCDPKernel,
// which runs the CUB-backend Dispatch from a kernel.
#define DEFINE_CDP_DISPATCHER(CdpBackend, CubBackend)                          \
  template <typename InputIteratorT,                                           \
            typename OutputIteratorT,                                          \
            typename BeginOffsetIteratorT,                                     \
            typename EndOffsetIteratorT,                                       \
            typename ReductionOpT>                                             \
  cudaError_t Dispatch(Int2Type<CdpBackend>,                                   \
                       int                  timing_iterations,                 \
                       size_t              *d_temp_storage_bytes,              \
                       cudaError_t         *d_cdp_error,                       \
                                                                               \
                       void                *d_temp_storage,                    \
                       size_t              &temp_storage_bytes,                \
                       InputIteratorT       d_in,                              \
                       OutputIteratorT      d_out,                             \
                       int                  num_items,                         \
                       int                  max_segments,                      \
                       BeginOffsetIteratorT d_segment_begin_offsets,           \
                       EndOffsetIteratorT   d_segment_end_offsets,             \
                       ReductionOpT         reduction_op)                      \
  {                                                                            \
    return LaunchCDPKernel(Int2Type<CubBackend>{},                             \
                           timing_iterations,                                  \
                           d_temp_storage_bytes,                               \
                           d_cdp_error,                                        \
                           d_temp_storage,                                     \
                           temp_storage_bytes,                                 \
                           d_in,                                               \
                           d_out,                                              \
                           num_items,                                          \
                           max_segments,                                       \
                           d_segment_begin_offsets,                            \
                           d_segment_end_offsets,                              \
                           reduction_op);                                      \
  }

// CDP -> CUB, CDP_SEGMENTED -> CUB_SEGMENTED
DEFINE_CDP_DISPATCHER(CDP, CUB)
DEFINE_CDP_DISPATCHER(CDP_SEGMENTED, CUB_SEGMENTED)

#undef DEFINE_CDP_DISPATCHER

#endif // TEST_CDP

//---------------------------------------------------------------------
// Problem generation
//---------------------------------------------------------------------
/**
 * Populate a host buffer with generated test input.
 *
 * Element i is produced by InitValue(gen_mode, h_in[i], i). When the global
 * g_verbose_input flag is set, the generated buffer is also printed.
 *
 * @param gen_mode   Generation mode forwarded to InitValue
 * @param h_in       Host buffer receiving num_items values
 * @param num_items  Number of elements to generate
 */
template <typename InputT>
void Initialize(
    GenMode         gen_mode,
    InputT          *h_in,
    int             num_items)
{
    int idx = 0;
    while (idx < num_items)
    {
        InitValue(gen_mode, h_in[idx], idx);
        ++idx;
    }

    // Nothing more to do unless input echoing was requested
    if (!g_verbose_input)
    {
        return;
    }

    printf("Input:\n");
    DisplayResults(h_in, num_items);
    printf("\n\n");
}


/// Host reference solver, generic case (serves the max / custom-max functors).
/// Every segment is folded with reduction_op, starting from the lowest InputT value.
template <typename ReductionOpT, typename InputT, typename _OutputT>
struct Solution
{
    using OutputT = _OutputT;
    using InitT = OutputT;
    using AccumT = cub::detail::accumulator_t<ReductionOpT, InitT, InputT>;

    /// Writes one reduced value per segment into h_reference[0 .. num_segments).
    template <typename HostInputIteratorT, typename OffsetT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
    static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments,
        BeginOffsetIteratorT h_segment_begin_offsets, EndOffsetIteratorT h_segment_end_offsets, ReductionOpT reduction_op)
    {
        for (int segment = 0; segment < num_segments; ++segment)
        {
            // Seed value: lowest representable input, so an empty segment yields it unchanged.
            // (replace with std::numeric_limits<OutputT>::lowest() when C++ support is more prevalent)
            AccumT running = Traits<InputT>::Lowest();

            for (int item = h_segment_begin_offsets[segment]; item < h_segment_end_offsets[segment]; ++item)
            {
                running = reduction_op(running, OutputT(h_in[item]));
            }

            h_reference[segment] = running;
        }
    }
};

/// Host reference solver for cub::Min.
/// Every segment is folded with reduction_op, starting from the maximum InputT value.
template <typename InputT, typename _OutputT>
struct Solution<cub::Min, InputT, _OutputT>
{
    using OutputT = _OutputT;
    using InitT = OutputT;
    using AccumT = cub::detail::accumulator_t<cub::Min, InitT, InputT>;

    /// Writes one reduced value per segment into h_reference[0 .. num_segments).
    template <typename HostInputIteratorT, typename OffsetT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
    static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments,
        BeginOffsetIteratorT h_segment_begin_offsets, EndOffsetIteratorT h_segment_end_offsets, cub::Min reduction_op)
    {
        for (int segment = 0; segment < num_segments; ++segment)
        {
            // Seed value: largest representable input, so an empty segment yields it unchanged.
            // (replace with std::numeric_limits<OutputT>::max() when C++ support is more prevalent)
            AccumT running = Traits<InputT>::Max();

            for (int item = h_segment_begin_offsets[segment]; item < h_segment_end_offsets[segment]; ++item)
            {
                running = reduction_op(running, OutputT(h_in[item]));
            }

            h_reference[segment] = running;
        }
    }
};


/// Host reference solver for cub::Sum.
/// Every segment is accumulated in AccumT from a value-initialized start and
/// converted to OutputT only when stored.
template <typename InputT, typename _OutputT>
struct Solution<cub::Sum, InputT, _OutputT>
{
    using OutputT = _OutputT;
    using InitT = OutputT;
    using AccumT = cub::detail::accumulator_t<cub::Sum, InitT, InputT>;

    /// Writes one sum per segment into h_reference[0 .. num_segments).
    template <typename HostInputIteratorT, typename OffsetT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT, typename ReductionOpT>
    static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments,
        BeginOffsetIteratorT h_segment_begin_offsets, EndOffsetIteratorT h_segment_end_offsets, ReductionOpT reduction_op)
    {
        for (int segment = 0; segment < num_segments; ++segment)
        {
            AccumT total{};

            for (int item = h_segment_begin_offsets[segment]; item < h_segment_end_offsets[segment]; ++item)
            {
                total = reduction_op(total, h_in[item]);
            }

            h_reference[segment] = static_cast<OutputT>(total);
        }
    }
};

/// Host reference solver for ExtendedFloatSum.
/// The accumulator type is derived as for cub::Sum; the supplied reduction_op
/// performs the actual addition.
template <typename InputT, typename _OutputT>
struct Solution<ExtendedFloatSum, InputT, _OutputT>
{
    using OutputT = _OutputT;
    using InitT   = OutputT;
    using AccumT  = cub::detail::accumulator_t<cub::Sum, InitT, InputT>;

    /// Writes one sum per segment into h_reference[0 .. num_segments).
    template <typename HostInputIteratorT,
              typename OffsetT,
              typename BeginOffsetIteratorT,
              typename EndOffsetIteratorT,
              typename ReductionOpT>
    static void Solve(HostInputIteratorT h_in,
                      OutputT *h_reference,
                      OffsetT num_segments,
                      BeginOffsetIteratorT h_segment_begin_offsets,
                      EndOffsetIteratorT h_segment_end_offsets,
                      ReductionOpT reduction_op)
    {
        for (int segment = 0; segment < num_segments; ++segment)
        {
            AccumT total{};

            for (int item = h_segment_begin_offsets[segment]; item < h_segment_end_offsets[segment]; ++item)
            {
                total = reduction_op(total, h_in[item]);
            }

            h_reference[segment] = static_cast<OutputT>(total);
        }
    }
};

/// Host reference solver for cub::ArgMin.
/// Produces, per segment, a (segment-relative index, value) pair.
template <typename InputValueT, typename OutputValueT>
struct Solution<cub::ArgMin, InputValueT, OutputValueT>
{
    typedef KeyValuePair<int, OutputValueT> OutputT;

    /// Writes one key/value result per segment into h_reference[0 .. num_segments).
    template <typename HostInputIteratorT, typename OffsetT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
    static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments,
        BeginOffsetIteratorT h_segment_begin_offsets, EndOffsetIteratorT h_segment_end_offsets, cub::ArgMin reduction_op)
    {
        for (int segment = 0; segment < num_segments; ++segment)
        {
            const auto first = h_segment_begin_offsets[segment];
            const auto last  = h_segment_end_offsets[segment];

            if (!(first < last))
            {
                // Guaranteed output for empty segments: key 1 paired with the maximum input value
                // (replace with std::numeric_limits<OutputT>::max() when C++ support is more prevalent)
                OutputT empty_result(1, Traits<InputValueT>::Max());
                h_reference[segment] = empty_result;
                continue;
            }

            // Start from the first element (relative index 0) and fold in the rest
            OutputT best(0, OutputValueT(h_in[first]));
            for (int pos = first + 1; pos < last; ++pos)
            {
                OutputT candidate(pos - first, OutputValueT(h_in[pos]));
                best = reduction_op(best, candidate);
            }
            h_reference[segment] = best;
        }
    }
};


/// Host reference solver for cub::ArgMax.
/// Produces, per segment, a (segment-relative index, value) pair.
template <typename InputValueT, typename OutputValueT>
struct Solution<cub::ArgMax, InputValueT, OutputValueT>
{
    typedef KeyValuePair<int, OutputValueT> OutputT;

    /// Writes one key/value result per segment into h_reference[0 .. num_segments).
    template <typename HostInputIteratorT, typename OffsetT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
    static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments,
        BeginOffsetIteratorT h_segment_begin_offsets, EndOffsetIteratorT h_segment_end_offsets, cub::ArgMax reduction_op)
    {
        for (int segment = 0; segment < num_segments; ++segment)
        {
            const auto first = h_segment_begin_offsets[segment];
            const auto last  = h_segment_end_offsets[segment];

            if (!(first < last))
            {
                // Empty segment: key 1 paired with the lowest input value
                OutputT empty_result(1, Traits<InputValueT>::Lowest());
                h_reference[segment] = empty_result;
                continue;
            }

            // Start from the first element (relative index 0) and fold in the rest
            OutputT best(0, OutputValueT(h_in[first]));
            for (int pos = first + 1; pos < last; ++pos)
            {
                OutputT candidate(pos - first, OutputValueT(h_in[pos]));
                best = reduction_op(best, candidate);
            }
            h_reference[segment] = best;
        }
    }
};


//---------------------------------------------------------------------
// Iterator/operator unwrapping helpers and test drivers
//---------------------------------------------------------------------

/// Base case: an iterator with no wrapper type is passed through unchanged.
template <class IteratorT>
IteratorT unwrap_it(IteratorT iterator)
{
    return iterator;
}

#if TEST_HALF_T
/// Reinterpret a pointer to the half_t test wrapper as a pointer to the native __half type.
__half* unwrap_it(half_t *wrapped_ptr)
{
    __half *native_ptr = reinterpret_cast<__half*>(wrapped_ptr);
    return native_ptr;
}

/// Rebuild a constant iterator over half_t as a constant iterator over native __half,
/// carrying over the value obtained by dereferencing the wrapped iterator.
template <class OffsetT>
ConstantInputIterator<__half, OffsetT> unwrap_it(ConstantInputIterator<half_t, OffsetT> wrapped_it)
{
    half_t wrapped_constant = *wrapped_it;

    // Invoke the conversion operator explicitly, exactly as the wrapper exposes it
    __half native_constant = wrapped_constant.operator __half();

    return ConstantInputIterator<__half, OffsetT>(native_constant);
}
#endif

#if TEST_BF_T
/// Reinterpret a pointer to the bfloat16_t test wrapper as a pointer to native __nv_bfloat16.
__nv_bfloat16* unwrap_it(bfloat16_t* wrapped_ptr)
{
    __nv_bfloat16 *native_ptr = reinterpret_cast<__nv_bfloat16*>(wrapped_ptr);
    return native_ptr;
}

/// Rebuild a constant iterator over bfloat16_t as a constant iterator over native
/// __nv_bfloat16, carrying over the value obtained by dereferencing the wrapped iterator.
template <class OffsetT>
ConstantInputIterator<__nv_bfloat16, OffsetT> unwrap_it(ConstantInputIterator<bfloat16_t, OffsetT> wrapped_it)
{
    bfloat16_t wrapped_constant = *wrapped_it;

    // Invoke the conversion operator explicitly, exactly as the wrapper exposes it
    __nv_bfloat16 native_constant = wrapped_constant.operator __nv_bfloat16();

    return ConstantInputIterator<__nv_bfloat16, OffsetT>(native_constant);
}
#endif

/// Compile-time tag telling whether unwrap_it() changes the iterator's type.
/// Yields std::true_type when the unwrapped type differs from WrappedItT,
/// std::false_type otherwise. The argument is used only for type deduction.
template <class WrappedItT, //
          class ItT = decltype(unwrap_it(std::declval<WrappedItT>()))>
std::integral_constant<bool, !std::is_same<WrappedItT, ItT>::value> //
reference_extended_fp(WrappedItT)
{
    using IsWrappedT = std::integral_constant<bool, !std::is_same<WrappedItT, ItT>::value>;
    return IsWrappedT{};
}

/// Extended-float case: a cub::Sum applied to a wrapped input is replaced by ExtendedFloatSum.
ExtendedFloatSum unwrap_op(std::true_type /* extended float */, cub::Sum) //
{
    return ExtendedFloatSum{};
}

/// Base case: any other tag/operator combination keeps the operator as given.
template <bool V, class OpT>
OpT unwrap_op(std::integral_constant<bool, V> /* base case */, OpT original_op)
{
    return original_op;
}

/**
 * Run DeviceReduce for one problem through the given backend and verify it.
 *
 * Steps: unwrap the iterators, query the temp-storage size, allocate it, run
 * one warmup/correctness pass, compare the device output against h_reference,
 * optionally time g_timing_iterations passes, release device memory, and
 * finally assert that the comparison succeeded.
 */
template <
    typename                BackendT,
    typename                DeviceWrappedInputIteratorT,
    typename                DeviceWrappedOutputIteratorT,
    typename                HostReferenceIteratorT,
    typename                OffsetT,
    typename                BeginOffsetIteratorT,
    typename                EndOffsetIteratorT,
    typename                ReductionOpT>
void Test(
    BackendT                        backend,
    DeviceWrappedInputIteratorT     d_wrapped_in,
    DeviceWrappedOutputIteratorT    d_wrapped_out,
    OffsetT                         num_items,
    OffsetT                         num_segments,
    BeginOffsetIteratorT            d_segment_begin_offsets,
    EndOffsetIteratorT              d_segment_end_offsets,
    ReductionOpT                    reduction_op,
    HostReferenceIteratorT          h_reference)
{
    // Strip any wrapper types from the iterators before handing them to the backend
    auto d_in  = unwrap_it(d_wrapped_in);
    auto d_out = unwrap_it(d_wrapped_out);

    // Device-side slots for the CDP temp-storage size and error code
    size_t      *d_temp_storage_bytes = nullptr;
    cudaError_t *d_cdp_error          = nullptr;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes, sizeof(size_t) * 1));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error, sizeof(cudaError_t) * 1));

    // First pass with null temp storage: only reports the required size
    void   *d_temp_storage     = nullptr;
    size_t  temp_storage_bytes = 0;
    CubDebugExit(Dispatch(
        backend, 1,
        d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
        d_in, d_out, num_items, num_segments,
        d_segment_begin_offsets, d_segment_end_offsets,
        reduction_op));

    // Obtain the requested amount of temp storage
    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));

    // Warmup / correctness pass
    CubDebugExit(Dispatch(
        backend, 1,
        d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
        d_in, d_out, num_items, num_segments,
        d_segment_begin_offsets, d_segment_end_offsets,
        reduction_op));

    // Compare against the host reference (results are displayed when g_verbose is set)
    int mismatch = CompareDeviceResults(h_reference, d_wrapped_out, num_segments, g_verbose, g_verbose);

    printf("\t%s", mismatch ? "FAIL" : "PASS");

    // Make sure the verdict is visible before any timing output
    fflush(stdout);
    fflush(stderr);

    // Optional timing run
    if (g_timing_iterations > 0)
    {
        GpuTimer timer;
        timer.Start();

        CubDebugExit(Dispatch(
            backend, g_timing_iterations,
            d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
            d_in, d_out, num_items, num_segments,
            d_segment_begin_offsets, d_segment_end_offsets,
            reduction_op));

        timer.Stop();
        float elapsed_millis = timer.ElapsedMillis();

        // Derive and report throughput figures
        float avg_millis     = elapsed_millis / g_timing_iterations;
        float giga_rate      = float(num_items) / avg_millis / 1000.0f / 1000.0f;
        float giga_bandwidth = giga_rate * sizeof(h_reference[0]);
        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% peak",
            avg_millis, giga_rate, giga_bandwidth, giga_bandwidth / g_device_giga_bandwidth * 100.0);
    }

    // Release device allocations before asserting
    if (d_temp_storage_bytes)
    {
        CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
    }
    if (d_cdp_error)
    {
        CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
    }
    if (d_temp_storage)
    {
        CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
    }

    // Correctness assert
    AssertEquals(0, mismatch);
}

/**
 * Compute the host reference for one problem and test the device result twice:
 * once writing to a discard iterator and once writing to real device memory.
 *
 * The reduction operator is first passed through unwrap_op(), which swaps
 * cub::Sum for ExtendedFloatSum when unwrap_it() changes the input iterator type.
 */
template <Backend BACKEND,
          typename OutputValueT,
          typename HostInputIteratorT,
          typename DeviceInputIteratorT,
          typename OffsetT,
          typename BeginOffsetIteratorT,
          typename EndOffsetIteratorT,
          typename ReductionOpT>
void SolveAndTest(HostInputIteratorT h_in,
                  DeviceInputIteratorT d_in,
                  OffsetT num_items,
                  OffsetT num_segments,
                  BeginOffsetIteratorT h_segment_begin_offsets,
                  EndOffsetIteratorT h_segment_end_offsets,
                  BeginOffsetIteratorT d_segment_begin_offsets,
                  EndOffsetIteratorT d_segment_end_offsets,
                  ReductionOpT wrapped_reduction_op)
{
    auto reduction_op = unwrap_op(reference_extended_fp(d_in), wrapped_reduction_op);

    using InputValueT = cub::detail::value_t<DeviceInputIteratorT>;
    using SolutionT   = Solution<decltype(reduction_op), InputValueT, OutputValueT>;
    using OutputT     = typename SolutionT::OutputT;

    // Announce the configuration under test
    printf("\n\n%s cub::DeviceReduce<%s> %d items (%s), %d segments\n",
           BackendToString(BACKEND),
           typeid(ReductionOpT).name(),
           num_items,
           typeid(HostInputIteratorT).name(),
           num_segments);
    fflush(stdout);

    // Host reference: one result per segment
    OutputT *h_reference = new OutputT[num_segments];
    SolutionT::Solve(h_in,
                     h_reference,
                     num_segments,
                     h_segment_begin_offsets,
                     h_segment_end_offsets,
                     reduction_op);

    // Pass 1: output goes to a discard iterator
    DiscardOutputIterator<OffsetT> discard_itr;
    Test(Int2Type<BACKEND>(),
         d_in,
         discard_itr,
         num_items,
         num_segments,
         d_segment_begin_offsets,
         d_segment_end_offsets,
         reduction_op,
         h_reference);

    // Pass 2: output goes to zero-filled device memory
    OutputT *d_out = nullptr;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(OutputT) * num_segments));
    CubDebugExit(cudaMemset(d_out, 0, sizeof(OutputT) * num_segments));
    Test(Int2Type<BACKEND>(),
         d_in,
         d_out,
         num_items,
         num_segments,
         d_segment_begin_offsets,
         d_segment_end_offsets,
         reduction_op,
         h_reference);

    // Cleanup
    if (d_out)
    {
        CubDebugExit(g_allocator.DeviceFree(d_out));
    }
    delete[] h_reference;
}

/**
 * Test one specific problem: generate input and segment offsets on the host,
 * mirror them on the device, and run SolveAndTest with the given operator.
 *
 * @param num_items     Number of input elements to generate
 * @param num_segments  Number of segments; num_segments + 1 offsets are generated
 * @param gen_mode      Input generation mode forwarded to Initialize
 * @param reduction_op  Reduction functor under test
 */
template <
    Backend         BACKEND,
    typename        InputT,
    typename        OutputT,
    typename        OffsetT,
    typename        ReductionOpT>
void TestProblem(
    OffsetT         num_items,
    OffsetT         num_segments,
    GenMode         gen_mode,
    ReductionOpT    reduction_op)
{
    // A single flush is enough here; the original issued two back-to-back
    printf("\n\nInitializing %d %s->%s (gen mode %d)... ", num_items, typeid(InputT).name(), typeid(OutputT).name(), gen_mode);
    fflush(stdout);

    // Initialize value data
    InputT* h_in = new InputT[num_items];
    Initialize(gen_mode, h_in, num_items);

    // Initialize segment data
    OffsetT *h_segment_offsets = new OffsetT[num_segments + 1];
    InitializeSegments(num_items, num_segments, h_segment_offsets, g_verbose_input);

    // Initialize device data
    OffsetT *d_segment_offsets      = NULL;
    InputT  *d_in                   = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in,              sizeof(InputT) * num_items));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_segment_offsets, sizeof(OffsetT) * (num_segments + 1)));
    CubDebugExit(cudaMemcpy(d_in,               h_in,                   sizeof(InputT) * num_items, cudaMemcpyHostToDevice));
    CubDebugExit(cudaMemcpy(d_segment_offsets,  h_segment_offsets,      sizeof(OffsetT) * (num_segments + 1), cudaMemcpyHostToDevice));

    // Begin offsets start at element 0, end offsets at element 1 of the same array
    SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments,
        h_segment_offsets, h_segment_offsets + 1, d_segment_offsets, d_segment_offsets + 1, reduction_op);

    // Cleanup (delete[] accepts null, so the host buffers need no guard)
    delete[] h_segment_offsets;
    if (d_segment_offsets)  CubDebugExit(g_allocator.DeviceFree(d_segment_offsets));
    delete[] h_in;
    if (d_in)               CubDebugExit(g_allocator.DeviceFree(d_in));
}


/// Run SolveAndTest on the same problem once per reduction operator, in the
/// order CustomMax, Sum, Min, ArgMin, Max, ArgMax.
template <
    Backend             BACKEND,
    typename            OutputT,
    typename            HostInputIteratorT,
    typename            DeviceInputIteratorT,
    typename            OffsetT,
    typename            BeginOffsetIteratorT,
    typename            EndOffsetIteratorT>
void TestByOp(
    HostInputIteratorT      h_in,
    DeviceInputIteratorT    d_in,
    OffsetT                 num_items,
    OffsetT                 num_segments,
    BeginOffsetIteratorT    h_segment_begin_offsets,
    EndOffsetIteratorT      h_segment_end_offsets,
    BeginOffsetIteratorT    d_segment_begin_offsets,
    EndOffsetIteratorT      d_segment_end_offsets)
{
    // User-defined max functor
    SolveAndTest<BACKEND, OutputT>(
        h_in, d_in, num_items, num_segments,
        h_segment_begin_offsets, h_segment_end_offsets,
        d_segment_begin_offsets, d_segment_end_offsets,
        CustomMax());

    // Sum
    SolveAndTest<BACKEND, OutputT>(
        h_in, d_in, num_items, num_segments,
        h_segment_begin_offsets, h_segment_end_offsets,
        d_segment_begin_offsets, d_segment_end_offsets,
        Sum());

    // Min
    SolveAndTest<BACKEND, OutputT>(
        h_in, d_in, num_items, num_segments,
        h_segment_begin_offsets, h_segment_end_offsets,
        d_segment_begin_offsets, d_segment_end_offsets,
        Min());

    // ArgMin
    SolveAndTest<BACKEND, OutputT>(
        h_in, d_in, num_items, num_segments,
        h_segment_begin_offsets, h_segment_end_offsets,
        d_segment_begin_offsets, d_segment_end_offsets,
        ArgMin());

    // Max
    SolveAndTest<BACKEND, OutputT>(
        h_in, d_in, num_items, num_segments,
        h_segment_begin_offsets, h_segment_end_offsets,
        d_segment_begin_offsets, d_segment_end_offsets,
        Max());

    // ArgMax
    SolveAndTest<BACKEND, OutputT>(
        h_in, d_in, num_items, num_segments,
        h_segment_begin_offsets, h_segment_end_offsets,
        d_segment_begin_offsets, d_segment_end_offsets,
        ArgMax());
}

/// Identity functor over offsets; a distinct type from TransformFunctor2 so that
/// begin and end offset iterators can be given different types.
template<typename OffsetT>
struct TransformFunctor1
{
    __host__ __device__ __forceinline__ OffsetT operator()(OffsetT value) const
    {
        const OffsetT unchanged = value;
        return unchanged;
    }
};

/// Identity functor over offsets; a distinct type from TransformFunctor1 so that
/// begin and end offset iterators can be given different types.
template<typename OffsetT>
struct TransformFunctor2
{
    __host__ __device__ __forceinline__ OffsetT operator()(OffsetT value) const
    {
        const OffsetT unchanged = value;
        return unchanged;
    }
};

/**
 * Exercise the non-segmented and segmented backends on one generated input.
 *
 * The non-segmented backend is tested on the full buffer and, when there is
 * more than one item, on the buffer shifted by one element. The segmented
 * backend is then tested for a growing number of segments with three kinds
 * of offset iterators: raw pointers, a single transform-iterator type, and
 * two different transform-iterator types for begin/end offsets.
 */
template <
    typename    InputT,
    typename    OutputT,
    typename    OffsetT>
void TestByBackend(
    OffsetT     num_items,
    OffsetT     max_segments,
    GenMode     gen_mode)
{
#if TEST_CDP == 0
  constexpr auto NonSegmentedBackend   = CUB;
  constexpr auto SegmentedBackend      = CUB_SEGMENTED;
#else  // TEST_CDP
  constexpr auto NonSegmentedBackend   = CDP;
  constexpr auto SegmentedBackend      = CDP_SEGMENTED;
#endif // TEST_CDP

    // Host-side input and offset storage
    printf("\n\nInitializing %d %s -> %s (gen mode %d)... ",
        num_items, typeid(InputT).name(), typeid(OutputT).name(), gen_mode);
    fflush(stdout);

    InputT  *h_in              = new InputT[num_items];
    OffsetT *h_segment_offsets = new OffsetT[max_segments + 1];
    Initialize(gen_mode, h_in, num_items);

    // Device-side mirrors (offsets are uploaded later, per segment count)
    InputT  *d_in              = nullptr;
    OffsetT *d_segment_offsets = nullptr;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(InputT) * num_items));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_segment_offsets, sizeof(OffsetT) * (max_segments + 1)));
    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(InputT) * num_items, cudaMemcpyHostToDevice));

    //
    // Single-segment implementations
    //

    // The non-segmented tests are handed null device offset pointers
    OffsetT *null_offsets = nullptr;

    InitializeSegments(num_items, 1, h_segment_offsets, g_verbose_input);

    // Page-aligned-input tests
    TestByOp<NonSegmentedBackend, OutputT>(
        h_in, d_in, num_items, 1,
        h_segment_offsets, h_segment_offsets + 1,
        null_offsets, null_offsets);

    // Non-page-aligned-input tests: same data, shifted by one element
    if (num_items > 1)
    {
        InitializeSegments(num_items - 1, 1, h_segment_offsets, g_verbose_input);
        TestByOp<NonSegmentedBackend, OutputT>(
            h_in + 1, d_in + 1, num_items - 1, 1,
            h_segment_offsets, h_segment_offsets + 1,
            null_offsets, null_offsets);
    }

    //
    // Segmented implementation
    //

    // Right now we assign a single thread block to each segment, so lets keep it to under 128K items per segment
    int max_items_per_segment = 128000;

    int num_segments = cub::DivideAndRoundUp(num_items, max_items_per_segment);
    while (num_segments < max_segments)
    {
        // Variant 1: raw offset pointers
        InitializeSegments(num_items, num_segments, h_segment_offsets, g_verbose_input);
        CubDebugExit(cudaMemcpy(d_segment_offsets,
                                h_segment_offsets,
                                sizeof(OffsetT) * (num_segments + 1),
                                cudaMemcpyHostToDevice));
        TestByOp<SegmentedBackend, OutputT>(
            h_in, d_in, num_items, num_segments,
            h_segment_offsets, h_segment_offsets + 1,
            d_segment_offsets, d_segment_offsets + 1);

        // Variant 2: one transform-iterator type for both begin and end offsets
        using IdentityOpT         = CastOp<OffsetT>;
        using IdentityOffsetsItrT = TransformInputIterator<OffsetT, IdentityOpT, OffsetT*, OffsetT>;

        IdentityOpT identity_op;
        IdentityOffsetsItrT h_segment_offsets_itr(h_segment_offsets, identity_op);
        IdentityOffsetsItrT d_segment_offsets_itr(d_segment_offsets, identity_op);

        TestByOp<SegmentedBackend, OutputT>(
            h_in, d_in, num_items, num_segments,
            h_segment_offsets_itr, h_segment_offsets_itr + 1,
            d_segment_offsets_itr, d_segment_offsets_itr + 1);

        // Variant 3: different transform-iterator types for begin and end offsets
        using BeginFunctorT    = TransformFunctor1<OffsetT>;
        using EndFunctorT      = TransformFunctor2<OffsetT>;
        using BeginOffsetsItrT = TransformInputIterator<OffsetT, BeginFunctorT, OffsetT*, OffsetT>;
        using EndOffsetsItrT   = TransformInputIterator<OffsetT, EndFunctorT, OffsetT*, OffsetT>;

        BeginOffsetsItrT h_segment_begin_offsets_itr(h_segment_offsets, BeginFunctorT());
        EndOffsetsItrT   h_segment_end_offsets_itr(h_segment_offsets + 1, EndFunctorT());

        BeginOffsetsItrT d_segment_begin_offsets_itr(d_segment_offsets, BeginFunctorT());
        EndOffsetsItrT   d_segment_end_offsets_itr(d_segment_offsets + 1, EndFunctorT());

        TestByOp<SegmentedBackend, OutputT>(
            h_in, d_in, num_items, num_segments,
            h_segment_begin_offsets_itr, h_segment_end_offsets_itr,
            d_segment_begin_offsets_itr, d_segment_end_offsets_itr);

        // Grow the segment count for the next round
        num_segments = (num_segments * 32) + 1;
    }

    // Cleanup
    delete[] h_in;
    delete[] h_segment_offsets;
    if (d_in)
    {
        CubDebugExit(g_allocator.DeviceFree(d_in));
    }
    if (d_segment_offsets)
    {
        CubDebugExit(g_allocator.DeviceFree(d_segment_offsets));
    }
}


/**
 * Run TestByBackend for the UNIFORM, INTEGER_SEED and RANDOM generation modes,
 * then check iterator support by summing a constant input iterator as a
 * single segment.
 */
template <
    typename InputT,
    typename OutputT,
    typename OffsetT>
void TestByGenMode(
    OffsetT num_items,
    OffsetT max_segments)
{
    //
    // Pointer-based input, one run per generation mode
    //

    TestByBackend<InputT, OutputT>(num_items, max_segments, UNIFORM);
    TestByBackend<InputT, OutputT>(num_items, max_segments, INTEGER_SEED);
    TestByBackend<InputT, OutputT>(num_items, max_segments, RANDOM);

    //
    // Iterator-based input: constant iterator reduced with SUM
    //

    InputT constant_value;
    InitValue(UNIFORM, constant_value, 0);
    ConstantInputIterator<InputT, OffsetT> constant_in(constant_value);

    // One segment needs two offsets (begin and end)
    OffsetT *h_segment_offsets = new OffsetT[1 + 1];
    InitializeSegments(num_items, 1, h_segment_offsets, g_verbose_input);

#if TEST_CDP == 0
    constexpr auto SingleSegmentBackend = CUB;
#else  // TEST_CDP
    constexpr auto SingleSegmentBackend = CDP;
#endif // TEST_CDP

    // The same iterator serves as host and device input; device offsets are null
    OffsetT *null_offsets = nullptr;
    SolveAndTest<SingleSegmentBackend, OutputT>(
        constant_in, constant_in, num_items, 1,
        h_segment_offsets, h_segment_offsets + 1,
        null_offsets, null_offsets,
        Sum());

    delete[] h_segment_offsets;
}

/**
 * Test a range of problem sizes.
 *
 * Covers the sizes 0, 1 and max_items, eight random sizes drawn from a
 * log-distribution, and single-segment Sum problems placed at, just above and
 * just below multiples of tile_size.
 *
 * @param max_items     Largest problem size to test
 * @param max_segments  Upper bound on the segment count, forwarded to TestByGenMode
 * @param tile_size     Tile size used to position the white-box boundary tests
 */
template <typename InputT, typename OutputT, typename OffsetT>
void TestBySize(OffsetT max_items, OffsetT max_segments, OffsetT tile_size)
{
  // Test 0, 1, many
  TestByGenMode<InputT, OutputT>(0, max_segments);
  TestByGenMode<InputT, OutputT>(1, max_segments);
  TestByGenMode<InputT, OutputT>(max_items, max_segments);

  // Test random problem sizes from a log-distribution [8, max_items-ish)
  int    num_iterations = 8;
  double max_exp        = log(double(max_items)) / log(double(2.0));
  for (int i = 0; i < num_iterations; ++i)
  {
    // Exponent is drawn via RandomValue and offset by 3, i.e. sizes start at 2^3
    OffsetT num_items = (OffsetT)pow(2.0, RandomValue(max_exp - 3.0) + 3.0);
    TestByGenMode<InputT, OutputT>(num_items, max_segments);
  }

  //
  // White-box testing of single-segment problems around specific sizes
  //

#if TEST_CDP == 0
  constexpr auto Backend   = CUB;
#else  // TEST_CDP
  constexpr auto Backend   = CDP;
#endif // TEST_CDP

  // Tile-boundaries: multiple blocks, one tile per block
  TestProblem<Backend, InputT, OutputT>(tile_size * 4, 1, RANDOM, Sum());
  TestProblem<Backend, InputT, OutputT>(tile_size * 4 + 1, 1, RANDOM, Sum());
  TestProblem<Backend, InputT, OutputT>(tile_size * 4 - 1, 1, RANDOM, Sum());

  // Tile-boundaries: multiple blocks, multiple tiles per block
  OffsetT sm_occupancy = 32;
  OffsetT occupancy    = tile_size * sm_occupancy * g_sm_count;
  TestProblem<Backend, InputT, OutputT>(occupancy, 1, RANDOM, Sum());
  TestProblem<Backend, InputT, OutputT>(occupancy + 1, 1, RANDOM, Sum());
  TestProblem<Backend, InputT, OutputT>(occupancy - 1, 1, RANDOM, Sum());
}

/// Minimal input type for the accumulator-type test: wraps a single char and
/// exposes it as an int. Construction from char is explicit.
class CustomInputT
{
  char m_val{};

public:
  __host__ __device__ explicit CustomInputT(char val)
      : m_val(val)
  {}

  /// Stored value widened to int.
  __host__ __device__ int get() const
  {
    const int widened = static_cast<int>(m_val);
    return widened;
  }
};

/// Accumulator type for the accumulator-type test.
///
/// Besides the running value it carries a "magic" marker: an object counts as
/// valid only while m_magic_value == 42, which is what every constructor of a
/// valid object sets. All operations check validity and zero or skip their
/// result when an operand is invalid.
/// NOTE(review): presumably this lets the test detect accumulators that were
/// read from raw/uninitialized memory instead of being constructed - confirm.
class CustomAccumulatorT
{
  int m_val{0};
  int m_magic_value{42};  // validity marker checked by is_valid()

  // Private: builds a valid accumulator (marker keeps its default of 42) holding val
  __host__ __device__ CustomAccumulatorT(int val)
      : m_val(val)
  {}

public:
  // Default: valid accumulator with value 0
  __host__ __device__ CustomAccumulatorT()
  {}

  // Copy: a valid source is copied as-is; an invalid source yields value 0 and
  // marker 0, i.e. the copy is invalid as well
  __host__ __device__ CustomAccumulatorT(const CustomAccumulatorT &in)
    : m_val(in.is_valid() * in.get())
    , m_magic_value(in.is_valid() * 42)
  {}

  // Assign from an input element; ignored when this object is invalid.
  // Returns void, so assignments cannot be chained.
  __host__ __device__ void operator=(const CustomInputT &in)
  {
    if (this->is_valid())
    {
      m_val = in.get();
    }
  }

  // Assign from another accumulator; ignored unless both sides are valid.
  // The marker itself is never copied.
  __host__ __device__ void operator=(const CustomAccumulatorT &in)
  {
    if (this->is_valid() && in.is_valid())
    {
      m_val = in.get();
    }
  }

  // Accumulator + input: the sum when this object is valid, otherwise 0.
  // The result is built through the private int constructor and is always valid.
  __host__ __device__ CustomAccumulatorT 
  operator+(const CustomInputT &in) const
  {
    const int multiplier = this->is_valid();
    return {(m_val + in.get()) * multiplier};
  }

  // Accumulator + accumulator: the sum when both are valid, otherwise 0.
  // The result is built through the private int constructor and is always valid.
  __host__ __device__ CustomAccumulatorT
  operator+(const CustomAccumulatorT &in) const
  {
    const int multiplier = this->is_valid() && in.is_valid();
    return {(m_val + in.get()) * multiplier};
  }

  // Current accumulated value
  __host__ __device__ int get() const { return m_val; }

  // True while the marker still holds the expected constant
  __host__ __device__ bool is_valid() const { return m_magic_value == 42; }
};

/// Output type for the accumulator-type test. Assigning an accumulator to it
/// does not store the value; it records through m_d_flag whether the
/// accumulator is valid and equals the expected result.
class CustomOutputT
{
  bool *m_d_flag{};
  int m_expected{};

public:
  __host__ __device__ CustomOutputT(bool *d_flag, int expected)
      : m_d_flag(d_flag)
      , m_expected(expected)
  {}

  /// Writes true to the flag iff accum is valid and holds the expected value.
  __host__ __device__ void operator=(const CustomAccumulatorT &accum) const
  {
    const bool matches = accum.is_valid() && (accum.get() == m_expected);
    *m_d_flag = matches;
  }
};

/// Kernel preparing the accumulator-type test data: every thread whose global
/// index is below num_items writes CustomInputT(1) to d_in, and the thread with
/// global index 0 constructs the single output object in *d_out.
__global__ void InitializeTestAccumulatorTypes(int num_items,
                                               int expected,
                                               bool *d_flag,
                                               CustomInputT *d_in,
                                               CustomOutputT *d_out)
{
  const int global_idx = static_cast<int>(threadIdx.x + blockIdx.x * blockDim.x);

  // One thread sets up the output object
  if (global_idx == 0)
  {
    *d_out = CustomOutputT{d_flag, expected};
  }

  // Guard against the grid overshooting the input
  if (global_idx < num_items)
  {
    d_in[global_idx] = CustomInputT(1);
  }
}

/// Sum num_items copies of T{1} from a constant iterator with
/// cub::DeviceReduce::Sum and assert that the result equals num_items.
template <typename T,
          typename OffsetT>
void TestBigIndicesHelper(OffsetT num_items)
{
  thrust::constant_iterator<T> ones(T{1});

  thrust::device_vector<std::size_t> d_result(1);
  std::size_t *d_out = thrust::raw_pointer_cast(d_result.data());

  // Size query: null temp storage makes the call report the required bytes
  std::uint8_t *d_temp_storage{};
  std::size_t temp_storage_bytes{};
  CubDebugExit(
    cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, ones, d_out, num_items));

  // Actual reduction with the requested amount of temp storage
  thrust::device_vector<std::uint8_t> temp_buffer(temp_storage_bytes);
  d_temp_storage = thrust::raw_pointer_cast(temp_buffer.data());
  CubDebugExit(
    cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, ones, d_out, num_items));

  // Read the result back to the host and verify it
  std::size_t result = d_result[0];
  AssertEquals(result, num_items);
}

/// Run TestBigIndicesHelper for item counts around the 32-bit limits:
/// 2^30, 2^31 and 2^32 - 1 with 32-bit offsets, and 2^33 with 64-bit offsets.
template <typename T>
void TestBigIndices()
{
  constexpr unsigned long long pow2_30       = 1ull << 30;
  constexpr unsigned long long pow2_31       = 1ull << 31;
  constexpr unsigned long long pow2_32_less1 = (1ull << 32) - 1;
  constexpr unsigned long long pow2_33       = 1ull << 33;

  TestBigIndicesHelper<T, std::uint32_t>(pow2_30);
  TestBigIndicesHelper<T, std::uint32_t>(pow2_31);
  TestBigIndicesHelper<T, std::uint32_t>(pow2_32_less1);
  TestBigIndicesHelper<T, std::uint64_t>(pow2_33);
}

#if TEST_TYPES == 3
/**
 * Check that DeviceReduce::Reduce works with distinct input, accumulator and
 * output types.
 *
 * The input holds num_items elements of CustomInputT(1) and is reduced with
 * cub::Sum and a CustomAccumulatorT initial value. The CustomOutputT object
 * writes true to d_flag only when it is assigned a valid accumulator equal to
 * num_items. Temp storage is filled with non-zero bytes before the reduction.
 */
void TestAccumulatorTypes()
{
  const int num_items  = 2 * 1024 * 1024;
  const int expected   = num_items;
  const int block_size = 256;
  const int grid_size  = (num_items + block_size - 1) / block_size;

  CustomInputT *d_in{};
  CustomOutputT *d_out{};
  CustomAccumulatorT init{};
  bool *d_flag{};

  CubDebugExit(
    g_allocator.DeviceAllocate((void **)&d_out, sizeof(CustomOutputT)));
  CubDebugExit(g_allocator.DeviceAllocate((void **)&d_flag, sizeof(bool)));
  CubDebugExit(g_allocator.DeviceAllocate((void **)&d_in,
                                          sizeof(CustomInputT) * num_items));

  // Start from "false" so that a reduction which never writes its output
  // cannot pass on whatever the freshly allocated byte happens to contain
  CubDebugExit(cudaMemset(d_flag, 0, sizeof(bool)));

  InitializeTestAccumulatorTypes<<<grid_size, block_size>>>(num_items,
                                                            expected,
                                                            d_flag,
                                                            d_in,
                                                            d_out);
  // Kernel launches report configuration errors only through the last-error state
  CubDebugExit(cudaPeekAtLastError());

  std::uint8_t *d_temp_storage{};
  std::size_t temp_storage_bytes{};

  // Size query: null temp storage makes the call report the required bytes
  CubDebugExit(cub::DeviceReduce::Reduce(d_temp_storage,
                                         temp_storage_bytes,
                                         d_in,
                                         d_out,
                                         num_items,
                                         cub::Sum{},
                                         init));

  CubDebugExit(
    g_allocator.DeviceAllocate((void **)&d_temp_storage, temp_storage_bytes));
  // Fill temp storage with 0x01 bytes so it does not look like valid accumulators
  CubDebugExit(cudaMemset(d_temp_storage, 1,  temp_storage_bytes));

  CubDebugExit(cub::DeviceReduce::Reduce(d_temp_storage,
                                         temp_storage_bytes,
                                         d_in,
                                         d_out,
                                         num_items,
                                         cub::Sum{},
                                         init));

  // Blocking copy: also waits for the reduction to finish
  bool ok{};
  CubDebugExit(cudaMemcpy(&ok, d_flag, sizeof(bool), cudaMemcpyDeviceToHost));

  AssertTrue(ok);

  // Release every allocation made above (the flag and temp storage used to leak)
  if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
  CubDebugExit(g_allocator.DeviceFree(d_flag));
  CubDebugExit(g_allocator.DeviceFree(d_out));
  CubDebugExit(g_allocator.DeviceFree(d_in));
}

/**
 * Regression test for ArgMin on inputs made entirely of +infinity.
 *
 * ArgMin should return the max value for empty input, which could interfere
 * with input data containing infinity values. Here ten +inf values are
 * reduced and the result must be key 0 with value +inf.
 */
void TestFloatInfInArgMin()
{
  using value_t  = float;
  using index_t  = int;
  using result_t = cub::KeyValuePair<index_t, value_t>;

  const int num_values         = 10;
  const float positive_infinity = ::cuda::std::numeric_limits<float>::infinity();

  thrust::device_vector<value_t> d_values(num_values, positive_infinity);
  thrust::device_vector<result_t> d_result(1);

  const value_t *d_in = thrust::raw_pointer_cast(d_values.data());
  result_t *d_out     = thrust::raw_pointer_cast(d_result.data());

  // Size query: null temp storage makes the call report the required bytes
  std::uint8_t *d_temp_storage{};
  std::size_t temp_storage_bytes{};
  CubDebugExit(cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, num_values));

  // Actual reduction with the requested amount of temp storage
  thrust::device_vector<std::uint8_t> temp_buffer(temp_storage_bytes);
  d_temp_storage = thrust::raw_pointer_cast(temp_buffer.data());
  CubDebugExit(cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, num_values));

  const result_t host_result = d_result[0];
  AssertEquals(host_result.key, 0);
  AssertEquals(host_result.value, positive_infinity);
}

/**
 * ArgMax should return lowest value for empty input. This interferes with
 * input data containing infinity values. This test checks that ArgMax
 * works correctly with infinity values.
 */
void TestFloatInfInArgMax()
{
  using in_t = float;
  using offset_t = int;
  using out_t = cub::KeyValuePair<offset_t, in_t>;

  const int n = 10;
  const float inf = ::cuda::std::numeric_limits<float>::infinity();
  
  thrust::device_vector<in_t> in(n, -inf);
  thrust::device_vector<out_t> out(1);

  const in_t *d_in = thrust::raw_pointer_cast(in.data());
  out_t *d_out = thrust::raw_pointer_cast(out.data());

  std::uint8_t *d_temp_storage{};
  std::size_t temp_storage_bytes{};  

  CubDebugExit(cub::DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, n));
  thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
  d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
  CubDebugExit(cub::DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, n));

  const out_t result = out[0];
  AssertEquals(result.key, 0);
  AssertEquals(result.value, -inf);
}

/**
 * Runs both infinity regression tests: the ArgMin variant first, then the
 * ArgMax variant.
 */
void TestFloatInfInArg()
{
  TestFloatInfInArgMin();
  TestFloatInfInArgMax();
}
#endif

/**
 * Functor handed to a policy chain's Invoke() in order to read back the tile
 * size (threads per block times items per thread) of the selected
 * ReducePolicy.
 *
 * The item and segment limits are stored for reference only; Invoke() does
 * not consult them.
 */
template <typename InputT, typename OutputT, typename OffsetT>
struct GetTileSize
{
  OffsetT max_items{};
  OffsetT max_segments{};
  OffsetT tile_size{}; ///< Written by Invoke(); value-initialized until then

  GetTileSize(OffsetT items, OffsetT segments)
      : max_items(items)
      , max_segments(segments)
  {}

  /// Records the tile size of ActivePolicyT's reduce policy; always succeeds.
  template <typename ActivePolicyT>
  CUB_RUNTIME_FUNCTION cudaError_t Invoke()
  {
    using ReducePolicyT = typename ActivePolicyT::ReducePolicy;

    tile_size = ReducePolicyT::BLOCK_THREADS * ReducePolicyT::ITEMS_PER_THREAD;
    return cudaSuccess;
  }
};

/// Test problem type
template <typename InputT, typename OutputT, typename OffsetT>
void TestType(OffsetT max_items, OffsetT max_segments)
{
  // Inspect the tuning policies to determine this arch's tile size:
  using MaxPolicyT =
    typename DeviceReducePolicy<InputT, OffsetT, cub::Sum>::MaxPolicy;
  GetTileSize<InputT, OutputT, OffsetT> dispatch(max_items, max_segments);
  CubDebugExit(MaxPolicyT::Invoke(g_ptx_version, dispatch));

  TestBySize<InputT, OutputT>(max_items, max_segments, dispatch.tile_size);
}

//---------------------------------------------------------------------
// Main
//---------------------------------------------------------------------


/**
 * Test driver entry point.
 *
 * Parses the command line, initializes the device and the global test state
 * (verbosity flags, timing iterations, bandwidth, PTX version, SM count), and
 * then runs the group of tests selected at compile time by TEST_TYPES.
 *
 * Recognized options:
 *   --n=<input items>        upper bound on the number of input items
 *   --s=<num segments>       upper bound on the number of segments
 *   --i=<timing iterations>  stored in g_timing_iterations
 *   --device=<device-id>     listed in the usage text; not parsed here
 *   --v                      sets g_verbose
 *   --v2                     sets g_verbose_input
 *   --help                   print usage and exit
 *
 * @return 0 once the selected tests have run; the process exits earlier
 *         through exit(0) when --help is given.
 */
int main(int argc, char** argv)
{
    typedef int OffsetT;

    // Defaults, overridable with --n and --s
    OffsetT max_items       = 27000000;
    OffsetT max_segments    = 34000;

    // Initialize command line
    CommandLineArgs args(argc, argv);
    g_verbose = args.CheckCmdLineFlag("v");
    g_verbose_input = args.CheckCmdLineFlag("v2");
    args.GetCmdLineArgument("n", max_items);
    args.GetCmdLineArgument("s", max_segments);
    args.GetCmdLineArgument("i", g_timing_iterations);

    // Print usage (every optional argument is individually bracketed, and
    // all flags parsed above are listed)
    if (args.CheckCmdLineFlag("help"))
    {
        printf("%s "
            "[--n=<input items>] "
            "[--s=<num segments>] "
            "[--i=<timing iterations>] "
            "[--device=<device-id>] "
            "[--v] "
            "[--v2] "
            "\n", argv[0]);
        exit(0);
    }

    // Initialize device
    CubDebugExit(args.DeviceInit());
    g_device_giga_bandwidth = args.device_giga_bandwidth;

    // Get ptx version
    CubDebugExit(PtxVersion(g_ptx_version));

    // Get SM count
    g_sm_count = args.deviceProp.multiProcessorCount;

    // NOTE(review): the two annotation comments below appear to be read by
    // the build tooling to generate test variants -- keep them verbatim.
    // %PARAM% TEST_CDP cdp 0:1
    // %PARAM% TEST_TYPES types 0:1:2:3:4

#if TEST_TYPES == 0
    // 8-bit integral inputs, including a widening signed char -> int case
    TestType<signed char, signed char>(max_items, max_segments);
    TestType<unsigned char, unsigned char>(max_items, max_segments);
    TestType<signed char, int>(max_items, max_segments);
#elif TEST_TYPES == 1
    // Wider built-in integral types
    TestType<short, short>(max_items, max_segments);
    TestType<int, int>(max_items, max_segments);
    TestType<long, long>(max_items, max_segments);
    TestType<long long, long long>(max_items, max_segments);
#elif TEST_TYPES == 2
    // CUDA vector types
    TestType<uchar2, uchar2>(max_items, max_segments);
    TestType<uint2, uint2>(max_items, max_segments);
    TestType<ulonglong2, ulonglong2>(max_items, max_segments);
    TestType<ulonglong4, ulonglong4>(max_items, max_segments);
#elif TEST_TYPES == 3
    // TestFoo plus the accumulator-type and infinity regression tests
    TestType<TestFoo, TestFoo>(max_items, max_segments);
    TestAccumulatorTypes();
    TestFloatInfInArg();

#if TEST_HALF_T
    TestType<half_t, half_t>(max_items, max_segments);
#endif
#else // TEST_TYPES == 4
    // TestBar plus the big-index test
    TestType<TestBar, TestBar>(max_items, max_segments);
    TestBigIndices<std::size_t>();

#if TEST_BF_T
    TestType<bfloat16_t, bfloat16_t>(max_items, max_segments);
#endif
#endif
    printf("\n");
    return 0;
}