thrust / dependencies /cub /test /test_device_reduce.cu

thanks to nvidia ❤

8ae5fc5 over 2 years ago

69.9 kB

	/******************************************************************************
	* Copyright (c) 2011, Duane Merrill. All rights reserved.
	* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions are met:
	* * Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* * Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	* * Neither the name of the NVIDIA CORPORATION nor the
	* names of its contributors may be used to endorse or promote products
	* derived from this software without specific prior written permission.
	*
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
	* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
	* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
	* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
	* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
	* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*
	******************************************************************************/

	/******************************************************************************
	* Test of DeviceReduce utilities
	******************************************************************************/

	// Ensure printing of CUDA runtime errors to console
	#define CUB_STDERR

	#include <cub/device/device_reduce.cuh>
	#include <cub/device/device_segmented_reduce.cuh>
	#include <cub/iterator/constant_input_iterator.cuh>
	#include <cub/iterator/discard_output_iterator.cuh>
	#include <cub/iterator/transform_input_iterator.cuh>
	#include <cub/thread/thread_operators.cuh>
	#include <cub/util_allocator.cuh>
	#include <cub/util_math.cuh>
	#include <cub/util_type.cuh>

	#include <thrust/device_vector.h>
	#include <thrust/iterator/constant_iterator.h>
	#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

	#include <cstdio>
	#include <limits>
	#include <type_traits>
	#include <typeinfo>

	#include "test_util.h"
	#include <nv/target>

	#define TEST_HALF_T !_NVHPC_CUDA

	#define TEST_BF_T !_NVHPC_CUDA

	#if TEST_HALF_T
	#include <cuda_fp16.h>

	// Half support is provided by SM53+. We currently test against a few older architectures.
	// The specializations below can be removed once we drop these architectures.
	namespace cub {

	template <>
	__host__ __device__ __forceinline__ //
	__half Min::operator()(__half& a, __half& b) const
	{
	NV_IF_TARGET(NV_PROVIDES_SM_53,
	(return CUB_MIN(a, b);),
	(return CUB_MIN(__half2float(a), __half2float(b));));
	}

	template <>
	__host__ __device__ __forceinline__ //
	KeyValuePair<int, __half>
	ArgMin::operator()(const KeyValuePair<int, __half> &a,
	const KeyValuePair<int, __half> &b) const
	{
	const float av = __half2float(a.value);
	const float bv = __half2float(b.value);

	if ((bv < av) \|\| ((av == bv) && (b.key < a.key)))
	{
	return b;
	}

	return a;
	}

	template <>
	__host__ __device__ __forceinline__ //
	__half Max::operator()(__half& a, __half& b) const
	{
	NV_IF_TARGET(NV_PROVIDES_SM_53,
	(return CUB_MAX(a, b);),
	(return CUB_MAX(__half2float(a), __half2float(b));));
	}

	template <>
	__host__ __device__ __forceinline__ //
	KeyValuePair<int, __half>
	ArgMax::operator()(const KeyValuePair<int, __half> &a,
	const KeyValuePair<int, __half> &b) const
	{
	const float av = __half2float(a.value);
	const float bv = __half2float(b.value);

	if ((bv > av) \|\| ((av == bv) && (b.key < a.key)))
	{
	return b;
	}

	return a;
	}

	} // namespace cub
	#endif

	#if TEST_BF_T
	#include <cuda_bf16.h>
	#endif

	using namespace cub;


	//---------------------------------------------------------------------
	// Globals, constants and typedefs
	//---------------------------------------------------------------------

	int g_ptx_version;
	int g_sm_count;
	double g_device_giga_bandwidth;
	bool g_verbose = false;
	bool g_verbose_input = false;
	int g_timing_iterations = 0;
	CachingDeviceAllocator g_allocator(true);


	// Dispatch types
	enum Backend
	{
	CUB, // CUB method
	CUB_SEGMENTED, // CUB segmented method
	CDP, // GPU-based (dynamic parallelism) dispatch to CUB method
	CDP_SEGMENTED, // GPU-based segmented method
	};

	inline const char* BackendToString(Backend b)
	{
	switch (b)
	{
	case CUB:
	return "CUB";
	case CUB_SEGMENTED:
	return "CUB_SEGMENTED";
	case CDP:
	return "CDP";
	case CDP_SEGMENTED:
	return "CDP_SEGMENTED";
	default:
	break;
	}

	return "";
	}

	// Custom max functor
	struct CustomMax
	{
	/// Boolean max operator, returns <tt>(a > b) ? a : b</tt>
	template <typename T, typename C>
	__host__ __device__ auto operator()(T&& a, C&& b)
	-> cub::detail::accumulator_t<cub::Max, T, C>
	{
	return CUB_MAX(a, b);
	}

	#if TEST_HALF_T
	__host__ __device__ __half operator()(__half& a, __half& b)
	{
	return cub::Max{}(a, b);
	}
	#endif
	};

	// Comparing results computed on CPU and GPU for extended floating point types is impossible.
	// For instance, when used with a constant iterator of two, the accumulator in sequential reference
	// computation (CPU) bumps into the 4096 limits, which will never change (`4096 + 2 = 4096`).
	// Meanwhile, per-thread aggregates (`2 * 16 = 32`) are accumulated within and among thread blocks,
	// yielding `inf` as a result. No reasonable epsilon can be selected to compare `inf` with `4096`.
	// To make `__half` and `__nv_bfloat16` arithmetic associative, the function object below raises
	// extended floating points to the area of unsigned short integers. This allows us to test large
	// inputs with few code-path differences in device algorithms.
	struct ExtendedFloatSum
	{
	template <class T>
	__host__ __device__ T operator()(T a, T b) const
	{
	T result{};
	result.__x = a.raw() + b.raw();
	return result;
	}

	#if TEST_HALF_T
	__host__ __device__ __half operator()(__half a, __half b) const
	{
	uint16_t result = this->operator()(half_t{a}, half_t(b)).raw();
	return reinterpret_cast<__half &>(result);
	}
	#endif

	#if TEST_BF_T
	__device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const
	{
	uint16_t result = this->operator()(bfloat16_t{a}, bfloat16_t(b)).raw();
	return reinterpret_cast<__nv_bfloat16 &>(result);
	}
	#endif
	};

	//---------------------------------------------------------------------
	// Dispatch to different CUB DeviceReduce entrypoints
	//---------------------------------------------------------------------

	/**
	* Dispatch to reduce entrypoint (custom-max)
	*/
	template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT, typename ReductionOpT>
	CUB_RUNTIME_FUNCTION
	cudaError_t Dispatch(
	Int2Type<CUB> /dispatch_to/,
	int timing_iterations,
	size_t /d_temp_storage_bytes*/,
	cudaError_t /d_cdp_error*/,

	void* d_temp_storage,
	size_t& temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	int num_items,
	int /max_segments/,
	BeginOffsetIteratorT /d_segment_begin_offsets/,
	EndOffsetIteratorT /d_segment_end_offsets/,
	ReductionOpT reduction_op)
	{
	using InputT = cub::detail::value_t<InputIteratorT>;

	// The output value type
	using OutputT = cub::detail::non_void_value_t<OutputIteratorT, InputT>;

	// Max-identity
	OutputT identity = Traits<InputT>::Lowest(); // replace with std::numeric_limits<OutputT>::lowest() when C++ support is more prevalent

	// Invoke kernel to device reduction directly
	cudaError_t error = cudaSuccess;
	for (int i = 0; i < timing_iterations; ++i)
	{
	error = DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes,
	d_in, d_out, num_items, reduction_op, identity);
	}

	return error;
	}

	/**
	* Dispatch to sum entrypoint
	*/
	template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
	CUB_RUNTIME_FUNCTION
	cudaError_t Dispatch(
	Int2Type<CUB> /dispatch_to/,
	int timing_iterations,
	size_t * /d_temp_storage_bytes/,
	cudaError_t * /d_cdp_error/,

	void* d_temp_storage,
	size_t& temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	int num_items,
	int /max_segments/,
	BeginOffsetIteratorT /d_segment_begin_offsets/,
	EndOffsetIteratorT /d_segment_end_offsets/,
	cub::Sum /reduction_op/)
	{
	// Invoke kernel to device reduction directly
	cudaError_t error = cudaSuccess;
	for (int i = 0; i < timing_iterations; ++i)
	{
	error = DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
	}

	return error;
	}

	/**
	* Dispatch to extended fp sum entrypoint
	*/
	template <typename InputIteratorT,
	typename OutputIteratorT,
	typename BeginOffsetIteratorT,
	typename EndOffsetIteratorT>
	CUB_RUNTIME_FUNCTION cudaError_t Dispatch(Int2Type<CUB> /dispatch_to/,
	int timing_iterations,
	size_t * /d_temp_storage_bytes/,
	cudaError_t * /d_cdp_error/,

	void *d_temp_storage,
	size_t &temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	int num_items,
	int /max_segments/,
	BeginOffsetIteratorT /d_segment_begin_offsets/,
	EndOffsetIteratorT /d_segment_end_offsets/,
	ExtendedFloatSum reduction_op)
	{
	using InputT = cub::detail::value_t<InputIteratorT>;
	using OutputT = cub::detail::non_void_value_t<OutputIteratorT, InputT>;

	OutputT identity{};

	// Invoke kernel to device reduction directly
	cudaError_t error = cudaSuccess;
	for (int i = 0; i < timing_iterations; ++i)
	{
	error = DeviceReduce::Reduce(d_temp_storage,
	temp_storage_bytes,
	d_in,
	d_out,
	num_items,
	reduction_op,
	identity);
	}

	return error;
	}

	/**
	* Dispatch to min entrypoint
	*/
	template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
	CUB_RUNTIME_FUNCTION
	cudaError_t Dispatch(
	Int2Type<CUB> /dispatch_to/,
	int timing_iterations,
	size_t /d_temp_storage_bytes*/,
	cudaError_t /d_cdp_error*/,

	void* d_temp_storage,
	size_t& temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	int num_items,
	int /max_segments/,
	BeginOffsetIteratorT /d_segment_begin_offsets/,
	EndOffsetIteratorT /d_segment_end_offsets/,
	cub::Min /reduction_op/)
	{
	// Invoke kernel to device reduction directly
	cudaError_t error = cudaSuccess;
	for (int i = 0; i < timing_iterations; ++i)
	{
	error = DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
	}

	return error;
	}

	/**
	* Dispatch to max entrypoint
	*/
	template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
	CUB_RUNTIME_FUNCTION
	cudaError_t Dispatch(
	Int2Type<CUB> /dispatch_to/,
	int timing_iterations,
	size_t /d_temp_storage_bytes*/,
	cudaError_t /d_cdp_error*/,

	void* d_temp_storage,
	size_t& temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	int num_items,
	int /max_segments/,
	BeginOffsetIteratorT /d_segment_begin_offsets/,
	EndOffsetIteratorT /d_segment_end_offsets/,
	cub::Max /reduction_op/)
	{
	// Invoke kernel to device reduction directly
	cudaError_t error = cudaSuccess;
	for (int i = 0; i < timing_iterations; ++i)
	{
	error = DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
	}

	return error;
	}

	/**
	* Dispatch to argmin entrypoint
	*/
	template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
	CUB_RUNTIME_FUNCTION
	cudaError_t Dispatch(
	Int2Type<CUB> /dispatch_to/,
	int timing_iterations,
	size_t /d_temp_storage_bytes*/,
	cudaError_t /d_cdp_error*/,

	void* d_temp_storage,
	size_t& temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	int num_items,
	int /max_segments/,
	BeginOffsetIteratorT /d_segment_begin_offsets/,
	EndOffsetIteratorT /d_segment_end_offsets/,
	cub::ArgMin /reduction_op/)
	{
	// Invoke kernel to device reduction directly
	cudaError_t error = cudaSuccess;
	for (int i = 0; i < timing_iterations; ++i)
	{
	error = DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
	}

	return error;
	}

	/**
	* Dispatch to argmax entrypoint
	*/
	template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
	CUB_RUNTIME_FUNCTION
	cudaError_t Dispatch(
	Int2Type<CUB> /dispatch_to/,
	int timing_iterations,
	size_t /d_temp_storage_bytes*/,
	cudaError_t /d_cdp_error*/,

	void* d_temp_storage,
	size_t& temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	int num_items,
	int /max_segments/,
	BeginOffsetIteratorT /d_segment_begin_offsets/,
	EndOffsetIteratorT /d_segment_end_offsets/,
	cub::ArgMax /reduction_op/)
	{
	// Invoke kernel to device reduction directly
	cudaError_t error = cudaSuccess;
	for (int i = 0; i < timing_iterations; ++i)
	{
	error = DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
	}

	return error;
	}


	//---------------------------------------------------------------------
	// Dispatch to different CUB DeviceSegmentedReduce entrypoints
	//---------------------------------------------------------------------

	/**
	* Dispatch to reduce entrypoint (custom-max)
	*/
	template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT, typename ReductionOpT>
	CUB_RUNTIME_FUNCTION
	cudaError_t Dispatch(
	Int2Type<CUB_SEGMENTED> /dispatch_to/,
	int timing_iterations,
	size_t /d_temp_storage_bytes*/,
	cudaError_t /d_cdp_error*/,

	void* d_temp_storage,
	size_t& temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	int /num_items/,
	int max_segments,
	BeginOffsetIteratorT d_segment_begin_offsets,
	EndOffsetIteratorT d_segment_end_offsets,
	ReductionOpT reduction_op)
	{
	// The input value type
	using InputT = cub::detail::value_t<InputIteratorT>;

	// The output value type
	using OutputT = cub::detail::non_void_value_t<OutputIteratorT, InputT>;

	// Max-identity
	OutputT identity = Traits<InputT>::Lowest(); // replace with std::numeric_limits<OutputT>::lowest() when C++ support is more prevalent

	// Invoke kernel to device reduction directly
	cudaError_t error = cudaSuccess;
	for (int i = 0; i < timing_iterations; ++i)
	{
	error = DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes,
	d_in, d_out, max_segments, d_segment_begin_offsets, d_segment_end_offsets, reduction_op, identity);
	}
	return error;
	}

	/**
	* Dispatch to sum entrypoint
	*/
	template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
	CUB_RUNTIME_FUNCTION
	cudaError_t Dispatch(
	Int2Type<CUB_SEGMENTED> /dispatch_to/,
	int timing_iterations,
	size_t /d_temp_storage_bytes*/,
	cudaError_t /d_cdp_error*/,

	void* d_temp_storage,
	size_t& temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	int /num_items/,
	int max_segments,
	BeginOffsetIteratorT d_segment_begin_offsets,
	EndOffsetIteratorT d_segment_end_offsets,
	cub::Sum /reduction_op/)
	{
	// Invoke kernel to device reduction directly
	cudaError_t error = cudaSuccess;
	for (int i = 0; i < timing_iterations; ++i)
	{
	error = DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes,
	d_in, d_out, max_segments, d_segment_begin_offsets, d_segment_end_offsets);
	}
	return error;
	}

	/**
	* Dispatch to extended fp sum entrypoint
	*/
	template <typename InputIteratorT,
	typename OutputIteratorT,
	typename BeginOffsetIteratorT,
	typename EndOffsetIteratorT>
	CUB_RUNTIME_FUNCTION cudaError_t Dispatch(Int2Type<CUB_SEGMENTED> /dispatch_to/,
	int timing_iterations,
	size_t * /d_temp_storage_bytes/,
	cudaError_t * /d_cdp_error/,

	void *d_temp_storage,
	size_t &temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	int /num_items/,
	int max_segments,
	BeginOffsetIteratorT d_segment_begin_offsets,
	EndOffsetIteratorT d_segment_end_offsets,
	ExtendedFloatSum reduction_op)
	{
	using InputT = cub::detail::value_t<InputIteratorT>;
	using OutputT = cub::detail::non_void_value_t<OutputIteratorT, InputT>;

	OutputT identity{};

	// Invoke kernel to device reduction directly
	cudaError_t error = cudaSuccess;
	for (int i = 0; i < timing_iterations; ++i)
	{
	error = DeviceSegmentedReduce::Reduce(d_temp_storage,
	temp_storage_bytes,
	d_in,
	d_out,
	max_segments,
	d_segment_begin_offsets,
	d_segment_end_offsets,
	reduction_op,
	identity);
	}

	return error;
	}

	/**
	* Dispatch to min entrypoint
	*/
	template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
	CUB_RUNTIME_FUNCTION
	cudaError_t Dispatch(
	Int2Type<CUB_SEGMENTED> /dispatch_to/,
	int timing_iterations,
	size_t /d_temp_storage_bytes*/,
	cudaError_t /d_cdp_error*/,

	void* d_temp_storage,
	size_t& temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	int /num_items/,
	int max_segments,
	BeginOffsetIteratorT d_segment_begin_offsets,
	EndOffsetIteratorT d_segment_end_offsets,
	cub::Min /reduction_op/)
	{
	// Invoke kernel to device reduction directly
	cudaError_t error = cudaSuccess;
	for (int i = 0; i < timing_iterations; ++i)
	{
	error = DeviceSegmentedReduce::Min(d_temp_storage, temp_storage_bytes,
	d_in, d_out, max_segments, d_segment_begin_offsets, d_segment_end_offsets);
	}
	return error;
	}

	/**
	* Dispatch to max entrypoint
	*/
	template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
	CUB_RUNTIME_FUNCTION
	cudaError_t Dispatch(
	Int2Type<CUB_SEGMENTED> /dispatch_to/,
	int timing_iterations,
	size_t /d_temp_storage_bytes*/,
	cudaError_t /d_cdp_error*/,

	void* d_temp_storage,
	size_t& temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	int /num_items/,
	int max_segments,
	BeginOffsetIteratorT d_segment_begin_offsets,
	EndOffsetIteratorT d_segment_end_offsets,
	cub::Max /reduction_op/)
	{
	// Invoke kernel to device reduction directly
	cudaError_t error = cudaSuccess;
	for (int i = 0; i < timing_iterations; ++i)
	{
	error = DeviceSegmentedReduce::Max(d_temp_storage, temp_storage_bytes,
	d_in, d_out, max_segments, d_segment_begin_offsets, d_segment_end_offsets);
	}
	return error;
	}

	/**
	* Dispatch to argmin entrypoint
	*/
	template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
	CUB_RUNTIME_FUNCTION
	cudaError_t Dispatch(
	Int2Type<CUB_SEGMENTED> /dispatch_to/,
	int timing_iterations,
	size_t /d_temp_storage_bytes*/,
	cudaError_t /d_cdp_error*/,

	void* d_temp_storage,
	size_t& temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	int /num_items/,
	int max_segments,
	BeginOffsetIteratorT d_segment_begin_offsets,
	EndOffsetIteratorT d_segment_end_offsets,
	cub::ArgMin /reduction_op/)
	{
	// Invoke kernel to device reduction directly
	cudaError_t error = cudaSuccess;
	for (int i = 0; i < timing_iterations; ++i)
	{
	error = DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes,
	d_in, d_out, max_segments, d_segment_begin_offsets, d_segment_end_offsets);
	}
	return error;
	}

	/**
	* Dispatch to argmax entrypoint
	*/
	template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
	CUB_RUNTIME_FUNCTION
	cudaError_t Dispatch(
	Int2Type<CUB_SEGMENTED> /dispatch_to/,
	int timing_iterations,
	size_t /d_temp_storage_bytes*/,
	cudaError_t /d_cdp_error*/,

	void* d_temp_storage,
	size_t& temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	int /num_items/,
	int max_segments,
	BeginOffsetIteratorT d_segment_begin_offsets,
	EndOffsetIteratorT d_segment_end_offsets,
	cub::ArgMax /reduction_op/)
	{
	// Invoke kernel to device reduction directly
	cudaError_t error = cudaSuccess;
	for (int i = 0; i < timing_iterations; ++i)
	{
	error = DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes,
	d_in, d_out, max_segments, d_segment_begin_offsets, d_segment_end_offsets);
	}
	return error;
	}


	//---------------------------------------------------------------------
	// CUDA nested-parallelism test kernel
	//---------------------------------------------------------------------

	#if TEST_CDP == 1

	/**
	* Simple wrapper kernel to invoke DeviceReduce
	*/
	template <int CubBackend,
	typename InputIteratorT,
	typename OutputIteratorT,
	typename BeginOffsetIteratorT,
	typename EndOffsetIteratorT,
	typename ReductionOpT>
	__global__ void CDPDispatchKernel(Int2Type<CubBackend> cub_backend,
	int timing_iterations,
	size_t *d_temp_storage_bytes,
	cudaError_t *d_cdp_error,

	void *d_temp_storage,
	size_t temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	int num_items,
	int max_segments,
	BeginOffsetIteratorT d_segment_begin_offsets,
	EndOffsetIteratorT d_segment_end_offsets,
	ReductionOpT reduction_op)
	{
	*d_cdp_error = Dispatch(cub_backend,
	timing_iterations,
	d_temp_storage_bytes,
	d_cdp_error,
	d_temp_storage,
	temp_storage_bytes,
	d_in,
	d_out,
	num_items,
	max_segments,
	d_segment_begin_offsets,
	d_segment_end_offsets,
	reduction_op);

	*d_temp_storage_bytes = temp_storage_bytes;
	}

	/**
	* Launch kernel and dispatch on device. Should only be called from host code.
	* The CubBackend should be one of the non-CDP CUB backends to invoke from the
	* device.
	*/
	template <int CubBackend,
	typename InputIteratorT,
	typename OutputIteratorT,
	typename BeginOffsetIteratorT,
	typename EndOffsetIteratorT,
	typename ReductionOpT>
	cudaError_t LaunchCDPKernel(Int2Type<CubBackend> cub_backend,
	int timing_iterations,
	size_t *d_temp_storage_bytes,
	cudaError_t *d_cdp_error,

	void *d_temp_storage,
	size_t &temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	int num_items,
	int max_segments,
	BeginOffsetIteratorT d_segment_begin_offsets,
	EndOffsetIteratorT d_segment_end_offsets,
	ReductionOpT reduction_op)
	{
	cudaError_t retval =
	thrust::cuda_cub::launcher::triple_chevron(1, 1, 0, 0)
	.doit(CDPDispatchKernel<CubBackend,
	InputIteratorT,
	OutputIteratorT,
	BeginOffsetIteratorT,
	EndOffsetIteratorT,
	ReductionOpT>,
	cub_backend,
	timing_iterations,
	d_temp_storage_bytes,
	d_cdp_error,
	d_temp_storage,
	temp_storage_bytes,
	d_in,
	d_out,
	num_items,
	max_segments,
	d_segment_begin_offsets,
	d_segment_end_offsets,
	reduction_op);
	CubDebugExit(retval);
	CubDebugExit(cub::detail::device_synchronize());

	// Copy out temp_storage_bytes
	CubDebugExit(cudaMemcpy(&temp_storage_bytes,
	d_temp_storage_bytes,
	sizeof(size_t) * 1,
	cudaMemcpyDeviceToHost));

	// Copy out error
	CubDebugExit(cudaMemcpy(&retval,
	d_cdp_error,
	sizeof(cudaError_t) * 1,
	cudaMemcpyDeviceToHost));

	return retval;
	}

	// Specializations of Dispatch that translate the CDP backend to the appropriate
	// CUB backend, and uses the CUB backend to launch the CDP kernel.
	#define DEFINE_CDP_DISPATCHER(CdpBackend, CubBackend) \
	template <typename InputIteratorT, \
	typename OutputIteratorT, \
	typename BeginOffsetIteratorT, \
	typename EndOffsetIteratorT, \
	typename ReductionOpT> \
	cudaError_t Dispatch(Int2Type<CdpBackend>, \
	int timing_iterations, \
	size_t *d_temp_storage_bytes, \
	cudaError_t *d_cdp_error, \
	\
	void *d_temp_storage, \
	size_t &temp_storage_bytes, \
	InputIteratorT d_in, \
	OutputIteratorT d_out, \
	int num_items, \
	int max_segments, \
	BeginOffsetIteratorT d_segment_begin_offsets, \
	EndOffsetIteratorT d_segment_end_offsets, \
	ReductionOpT reduction_op) \
	{ \
	Int2Type<CubBackend> cub_backend{}; \
	return LaunchCDPKernel(cub_backend, \
	timing_iterations, \
	d_temp_storage_bytes, \
	d_cdp_error, \
	d_temp_storage, \
	temp_storage_bytes, \
	d_in, \
	d_out, \
	num_items, \
	max_segments, \
	d_segment_begin_offsets, \
	d_segment_end_offsets, \
	reduction_op); \
	}

	DEFINE_CDP_DISPATCHER(CDP, CUB)
	DEFINE_CDP_DISPATCHER(CDP_SEGMENTED, CUB_SEGMENTED)

	#undef DEFINE_CDP_DISPATCHER

	#endif // TEST_CDP

	//---------------------------------------------------------------------
	// Problem generation
	//---------------------------------------------------------------------
	/// Initialize problem
	template <typename InputT>
	void Initialize(
	GenMode gen_mode,
	InputT *h_in,
	int num_items)
	{
	for (int i = 0; i < num_items; ++i)
	{
	InitValue(gen_mode, h_in[i], i);
	}

	if (g_verbose_input)
	{
	printf("Input:\n");
	DisplayResults(h_in, num_items);
	printf("\n\n");
	}
	}


	/// Solve problem (max/custom-max functor)
	template <typename ReductionOpT, typename InputT, typename _OutputT>
	struct Solution
	{
	using OutputT = _OutputT;
	using InitT = OutputT;
	using AccumT = cub::detail::accumulator_t<ReductionOpT, InitT, InputT>;

	template <typename HostInputIteratorT, typename OffsetT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
	static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments,
	BeginOffsetIteratorT h_segment_begin_offsets, EndOffsetIteratorT h_segment_end_offsets, ReductionOpT reduction_op)
	{
	for (int i = 0; i < num_segments; ++i)
	{
	AccumT aggregate = Traits<InputT>::Lowest(); // replace with std::numeric_limits<OutputT>::lowest() when C++ support is more prevalent
	for (int j = h_segment_begin_offsets[i]; j < h_segment_end_offsets[i]; ++j)
	aggregate = reduction_op(aggregate, OutputT(h_in[j]));
	h_reference[i] = aggregate;
	}
	}
	};

	/// Solve problem (min functor)
	template <typename InputT, typename _OutputT>
	struct Solution<cub::Min, InputT, _OutputT>
	{
	using OutputT = _OutputT;
	using InitT = OutputT;
	using AccumT = cub::detail::accumulator_t<cub::Min, InitT, InputT>;

	template <typename HostInputIteratorT, typename OffsetT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
	static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments,
	BeginOffsetIteratorT h_segment_begin_offsets, EndOffsetIteratorT h_segment_end_offsets, cub::Min reduction_op)
	{
	for (int i = 0; i < num_segments; ++i)
	{
	AccumT aggregate = Traits<InputT>::Max(); // replace with std::numeric_limits<OutputT>::max() when C++ support is more prevalent
	for (int j = h_segment_begin_offsets[i]; j < h_segment_end_offsets[i]; ++j)
	aggregate = reduction_op(aggregate, OutputT(h_in[j]));
	h_reference[i] = aggregate;
	}
	}
	};


	/// Solve problem (sum functor)
	template <typename InputT, typename _OutputT>
	struct Solution<cub::Sum, InputT, _OutputT>
	{
	using OutputT = _OutputT;
	using InitT = OutputT;
	using AccumT = cub::detail::accumulator_t<cub::Sum, InitT, InputT>;

	template <typename HostInputIteratorT, typename OffsetT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT, typename ReductionOpT>
	static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments,
	BeginOffsetIteratorT h_segment_begin_offsets, EndOffsetIteratorT h_segment_end_offsets, ReductionOpT reduction_op)
	{
	for (int i = 0; i < num_segments; ++i)
	{
	AccumT aggregate{};
	for (int j = h_segment_begin_offsets[i]; j < h_segment_end_offsets[i]; ++j)
	aggregate = reduction_op(aggregate, h_in[j]);
	h_reference[i] = static_cast<OutputT>(aggregate);
	}
	}
	};

	template <typename InputT, typename _OutputT>
	struct Solution<ExtendedFloatSum, InputT, _OutputT>
	{
	using OutputT = _OutputT;
	using InitT = OutputT;
	using AccumT = cub::detail::accumulator_t<cub::Sum, InitT, InputT>;

	template <typename HostInputIteratorT,
	typename OffsetT,
	typename BeginOffsetIteratorT,
	typename EndOffsetIteratorT,
	typename ReductionOpT>
	static void Solve(HostInputIteratorT h_in,
	OutputT *h_reference,
	OffsetT num_segments,
	BeginOffsetIteratorT h_segment_begin_offsets,
	EndOffsetIteratorT h_segment_end_offsets,
	ReductionOpT reduction_op)
	{
	for (int i = 0; i < num_segments; ++i)
	{
	AccumT aggregate{};
	for (int j = h_segment_begin_offsets[i]; j < h_segment_end_offsets[i]; ++j)
	aggregate = reduction_op(aggregate, h_in[j]);
	h_reference[i] = static_cast<OutputT>(aggregate);
	}
	}
	};

	/// Solve problem (argmin functor)
	template <typename InputValueT, typename OutputValueT>
	struct Solution<cub::ArgMin, InputValueT, OutputValueT>
	{
	typedef KeyValuePair<int, OutputValueT> OutputT;

	template <typename HostInputIteratorT, typename OffsetT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
	static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments,
	BeginOffsetIteratorT h_segment_begin_offsets, EndOffsetIteratorT h_segment_end_offsets, cub::ArgMin reduction_op)
	{
	for (int i = 0; i < num_segments; ++i)
	{
	const auto segment_begin = h_segment_begin_offsets[i];
	const auto segment_end = h_segment_end_offsets[i];

	if (segment_begin < segment_end)
	{
	OutputT aggregate(0, OutputValueT(h_in[segment_begin]));
	for (int j = segment_begin + 1; j < segment_end; ++j)
	{
	OutputT item(j - segment_begin, OutputValueT(h_in[j]));
	aggregate = reduction_op(aggregate, item);
	}
	h_reference[i] = aggregate;
	}
	else
	{
	// Guaranteed output for empty segments
	OutputT aggregate(1, Traits<InputValueT>::Max()); // replace with std::numeric_limits<OutputT>::max() when C++ support is more prevalent
	h_reference[i] = aggregate;
	}
	}
	}
	};


	/// Solve problem (argmax functor)
	template <typename InputValueT, typename OutputValueT>
	struct Solution<cub::ArgMax, InputValueT, OutputValueT>
	{
	typedef KeyValuePair<int, OutputValueT> OutputT;

	template <typename HostInputIteratorT, typename OffsetT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
	static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments,
	BeginOffsetIteratorT h_segment_begin_offsets, EndOffsetIteratorT h_segment_end_offsets, cub::ArgMax reduction_op)
	{
	for (int i = 0; i < num_segments; ++i)
	{
	const auto segment_begin = h_segment_begin_offsets[i];
	const auto segment_end = h_segment_end_offsets[i];

	if (segment_begin < segment_end)
	{
	OutputT aggregate(0, OutputValueT(h_in[segment_begin]));
	for (int j = segment_begin + 1; j < segment_end; ++j)
	{
	OutputT item(j - segment_begin, OutputValueT(h_in[j]));
	aggregate = reduction_op(aggregate, item);
	}
	h_reference[i] = aggregate;
	}
	else
	{
	OutputT aggregate(1, Traits<InputValueT>::Lowest());
	h_reference[i] = aggregate;
	}
	}
	}
	};


	//---------------------------------------------------------------------
	// Problem generation
	//---------------------------------------------------------------------

	template <class It>
	It unwrap_it(It it) {
	return it;
	}

	#if TEST_HALF_T
	__half* unwrap_it(half_t *it) {
	return reinterpret_cast<__half*>(it);
	}

	template <class OffsetT>
	ConstantInputIterator<__half, OffsetT> unwrap_it(ConstantInputIterator<half_t, OffsetT> it) {
	half_t wrapped_val = *it;
	__half val = wrapped_val.operator __half();
	return ConstantInputIterator<__half, OffsetT>(val);
	}
	#endif

	#if TEST_BF_T
	__nv_bfloat16* unwrap_it(bfloat16_t* it) {
	return reinterpret_cast<__nv_bfloat16*>(it);
	}

	template <class OffsetT>
	ConstantInputIterator<__nv_bfloat16, OffsetT> unwrap_it(ConstantInputIterator<bfloat16_t, OffsetT> it) {
	bfloat16_t wrapped_val = *it;
	__nv_bfloat16 val = wrapped_val.operator __nv_bfloat16();
	return ConstantInputIterator<__nv_bfloat16, OffsetT>(val);
	}
	#endif

	template <class WrappedItT, //
	class ItT = decltype(unwrap_it(std::declval<WrappedItT>()))>
	std::integral_constant<bool, !std::is_same<WrappedItT, ItT>::value> //
	reference_extended_fp(WrappedItT)
	{
	return {};
	}

	ExtendedFloatSum unwrap_op(std::true_type /* extended float */, cub::Sum) //
	{
	return {};
	}

	template <bool V, class OpT>
	OpT unwrap_op(std::integral_constant<bool, V> /* base case */, OpT op)
	{
	return op;
	}

	/// Test DeviceReduce for a given problem input
	template <
	typename BackendT,
	typename DeviceWrappedInputIteratorT,
	typename DeviceWrappedOutputIteratorT,
	typename HostReferenceIteratorT,
	typename OffsetT,
	typename BeginOffsetIteratorT,
	typename EndOffsetIteratorT,
	typename ReductionOpT>
	void Test(
	BackendT backend,
	DeviceWrappedInputIteratorT d_wrapped_in,
	DeviceWrappedOutputIteratorT d_wrapped_out,
	OffsetT num_items,
	OffsetT num_segments,
	BeginOffsetIteratorT d_segment_begin_offsets,
	EndOffsetIteratorT d_segment_end_offsets,
	ReductionOpT reduction_op,
	HostReferenceIteratorT h_reference)
	{
	// Input data types
	auto d_in = unwrap_it(d_wrapped_in);
	auto d_out = unwrap_it(d_wrapped_out);

	// Allocate CDP device arrays for temp storage size and error
	size_t *d_temp_storage_bytes = NULL;
	cudaError_t *d_cdp_error = NULL;
	CubDebugExit(g_allocator.DeviceAllocate((void*)&d_temp_storage_bytes, sizeof(size_t) 1));
	CubDebugExit(g_allocator.DeviceAllocate((void*)&d_cdp_error, sizeof(cudaError_t) 1));

	// Inquire temp device storage
	void *d_temp_storage = NULL;
	size_t temp_storage_bytes = 0;
	CubDebugExit(Dispatch(backend, 1,
	d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
	d_in, d_out, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets,
	reduction_op));

	// Allocate temp device storage
	CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));

	// Run warmup/correctness iteration
	CubDebugExit(Dispatch(backend, 1,
	d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
	d_in, d_out, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets,
	reduction_op));

	// Check for correctness (and display results, if specified)
	int compare = CompareDeviceResults(h_reference, d_wrapped_out, num_segments, g_verbose, g_verbose);

	printf("\t%s", compare ? "FAIL" : "PASS");

	// Flush any stdout/stderr
	fflush(stdout);
	fflush(stderr);

	// Performance
	if (g_timing_iterations > 0)
	{
	GpuTimer gpu_timer;
	gpu_timer.Start();

	CubDebugExit(Dispatch(backend, g_timing_iterations,
	d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
	d_in, d_out, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets,
	reduction_op));

	gpu_timer.Stop();
	float elapsed_millis = gpu_timer.ElapsedMillis();

	// Display performance
	float avg_millis = elapsed_millis / g_timing_iterations;
	float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
	float giga_bandwidth = giga_rate * sizeof(h_reference[0]);
	printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% peak",
	avg_millis, giga_rate, giga_bandwidth, giga_bandwidth / g_device_giga_bandwidth * 100.0);

	}

	if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
	if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
	if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));

	// Correctness asserts
	AssertEquals(0, compare);
	}

	template <Backend BACKEND,
	typename OutputValueT,
	typename HostInputIteratorT,
	typename DeviceInputIteratorT,
	typename OffsetT,
	typename BeginOffsetIteratorT,
	typename EndOffsetIteratorT,
	typename ReductionOpT>
	void SolveAndTest(HostInputIteratorT h_in,
	DeviceInputIteratorT d_in,
	OffsetT num_items,
	OffsetT num_segments,
	BeginOffsetIteratorT h_segment_begin_offsets,
	EndOffsetIteratorT h_segment_end_offsets,
	BeginOffsetIteratorT d_segment_begin_offsets,
	EndOffsetIteratorT d_segment_end_offsets,
	ReductionOpT wrapped_reduction_op)
	{
	auto reduction_op = unwrap_op(reference_extended_fp(d_in), wrapped_reduction_op);

	using InputValueT = cub::detail::value_t<DeviceInputIteratorT>;
	using SolutionT = Solution<decltype(reduction_op), InputValueT, OutputValueT>;
	using OutputT = typename SolutionT::OutputT;

	printf("\n\n%s cub::DeviceReduce<%s> %d items (%s), %d segments\n",
	BackendToString(BACKEND),
	typeid(ReductionOpT).name(),
	num_items,
	typeid(HostInputIteratorT).name(),
	num_segments);
	fflush(stdout);

	// Allocate and solve solution
	OutputT *h_reference = new OutputT[num_segments];
	SolutionT::Solve(h_in, h_reference, num_segments, h_segment_begin_offsets, h_segment_end_offsets, reduction_op);

	// Run with discard iterator
	DiscardOutputIterator<OffsetT> discard_itr;
	Test(Int2Type<BACKEND>(), d_in, discard_itr, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, reduction_op, h_reference);

	// Run with output data
	OutputT *d_out = NULL;
	CubDebugExit(g_allocator.DeviceAllocate((void*)&d_out, sizeof(OutputT) num_segments));
	CubDebugExit(cudaMemset(d_out, 0, sizeof(OutputT) * num_segments));
	Test(Int2Type<BACKEND>(), d_in, d_out, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, reduction_op, h_reference);

	// Cleanup
	if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
	if (h_reference) delete[] h_reference;
	}

	/// Test specific problem type
	template <
	Backend BACKEND,
	typename InputT,
	typename OutputT,
	typename OffsetT,
	typename ReductionOpT>
	void TestProblem(
	OffsetT num_items,
	OffsetT num_segments,
	GenMode gen_mode,
	ReductionOpT reduction_op)
	{
	printf("\n\nInitializing %d %s->%s (gen mode %d)... ", num_items, typeid(InputT).name(), typeid(OutputT).name(), gen_mode); fflush(stdout);
	fflush(stdout);

	// Initialize value data
	InputT* h_in = new InputT[num_items];
	Initialize(gen_mode, h_in, num_items);

	// Initialize segment data
	OffsetT *h_segment_offsets = new OffsetT[num_segments + 1];
	InitializeSegments(num_items, num_segments, h_segment_offsets, g_verbose_input);

	// Initialize device data
	OffsetT *d_segment_offsets = NULL;
	InputT *d_in = NULL;
	CubDebugExit(g_allocator.DeviceAllocate((void*)&d_in, sizeof(InputT) num_items));
	CubDebugExit(g_allocator.DeviceAllocate((void*)&d_segment_offsets, sizeof(OffsetT) (num_segments + 1)));
	CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(InputT) * num_items, cudaMemcpyHostToDevice));
	CubDebugExit(cudaMemcpy(d_segment_offsets, h_segment_offsets, sizeof(OffsetT) * (num_segments + 1), cudaMemcpyHostToDevice));

	SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments,
	h_segment_offsets, h_segment_offsets + 1, d_segment_offsets, d_segment_offsets + 1, reduction_op);

	if (h_segment_offsets) delete[] h_segment_offsets;
	if (d_segment_offsets) CubDebugExit(g_allocator.DeviceFree(d_segment_offsets));
	if (h_in) delete[] h_in;
	if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
	}


	/// Test different operators
	template <
	Backend BACKEND,
	typename OutputT,
	typename HostInputIteratorT,
	typename DeviceInputIteratorT,
	typename OffsetT,
	typename BeginOffsetIteratorT,
	typename EndOffsetIteratorT>
	void TestByOp(
	HostInputIteratorT h_in,
	DeviceInputIteratorT d_in,
	OffsetT num_items,
	OffsetT num_segments,
	BeginOffsetIteratorT h_segment_begin_offsets,
	EndOffsetIteratorT h_segment_end_offsets,
	BeginOffsetIteratorT d_segment_begin_offsets,
	EndOffsetIteratorT d_segment_end_offsets)
	{
	SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments,
	h_segment_begin_offsets, h_segment_end_offsets, d_segment_begin_offsets, d_segment_end_offsets, CustomMax());
	SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments,
	h_segment_begin_offsets, h_segment_end_offsets, d_segment_begin_offsets, d_segment_end_offsets, Sum());
	SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments,
	h_segment_begin_offsets, h_segment_end_offsets, d_segment_begin_offsets, d_segment_end_offsets, Min());
	SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments,
	h_segment_begin_offsets, h_segment_end_offsets, d_segment_begin_offsets, d_segment_end_offsets, ArgMin());
	SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments,
	h_segment_begin_offsets, h_segment_end_offsets, d_segment_begin_offsets, d_segment_end_offsets, Max());
	SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments,
	h_segment_begin_offsets, h_segment_end_offsets, d_segment_begin_offsets, d_segment_end_offsets, ArgMax());
	}

	template<typename OffsetT>
	struct TransformFunctor1
	{
	__host__ __device__ __forceinline__ OffsetT operator()(OffsetT offset) const
	{
	return offset;
	}
	};

	template<typename OffsetT>
	struct TransformFunctor2
	{
	__host__ __device__ __forceinline__ OffsetT operator()(OffsetT offset) const
	{
	return offset;
	}
	};

	/// Test different backends
	template <
	typename InputT,
	typename OutputT,
	typename OffsetT>
	void TestByBackend(
	OffsetT num_items,
	OffsetT max_segments,
	GenMode gen_mode)
	{
	#if TEST_CDP == 0
	constexpr auto NonSegmentedBackend = CUB;
	constexpr auto SegmentedBackend = CUB_SEGMENTED;
	#else // TEST_CDP
	constexpr auto NonSegmentedBackend = CDP;
	constexpr auto SegmentedBackend = CDP_SEGMENTED;
	#endif // TEST_CDP

	// Initialize host data
	printf("\n\nInitializing %d %s -> %s (gen mode %d)... ",
	num_items, typeid(InputT).name(), typeid(OutputT).name(), gen_mode); fflush(stdout);

	InputT *h_in = new InputT[num_items];
	OffsetT *h_segment_offsets = new OffsetT[max_segments + 1];
	Initialize(gen_mode, h_in, num_items);

	// Initialize device data
	InputT *d_in = NULL;
	OffsetT *d_segment_offsets = NULL;
	CubDebugExit(g_allocator.DeviceAllocate((void*)&d_in, sizeof(InputT) num_items));
	CubDebugExit(g_allocator.DeviceAllocate((void*)&d_segment_offsets, sizeof(OffsetT) (max_segments + 1)));
	CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(InputT) * num_items, cudaMemcpyHostToDevice));

	//
	// Test single-segment implementations
	//

	InitializeSegments(num_items, 1, h_segment_offsets, g_verbose_input);

	// Page-aligned-input tests
	TestByOp<NonSegmentedBackend, OutputT>(h_in, d_in, num_items, 1,
	h_segment_offsets, h_segment_offsets + 1, (OffsetT) NULL, (OffsetT)NULL);

	// Non-page-aligned-input tests
	if (num_items > 1)
	{
	InitializeSegments(num_items - 1, 1, h_segment_offsets, g_verbose_input);
	TestByOp<NonSegmentedBackend, OutputT>(h_in + 1, d_in + 1, num_items - 1, 1,
	h_segment_offsets, h_segment_offsets + 1, (OffsetT) NULL, (OffsetT)NULL);
	}

	//
	// Test segmented implementation
	//

	// Right now we assign a single thread block to each segment, so lets keep it to under 128K items per segment
	int max_items_per_segment = 128000;

	for (int num_segments = cub::DivideAndRoundUp(num_items, max_items_per_segment);
	num_segments < max_segments;
	num_segments = (num_segments * 32) + 1)
	{
	// Test with segment pointer
	InitializeSegments(num_items, num_segments, h_segment_offsets, g_verbose_input);
	CubDebugExit(cudaMemcpy(d_segment_offsets, h_segment_offsets, sizeof(OffsetT) * (num_segments + 1), cudaMemcpyHostToDevice));
	TestByOp<SegmentedBackend, OutputT>(h_in, d_in, num_items, num_segments,
	h_segment_offsets, h_segment_offsets + 1, d_segment_offsets, d_segment_offsets + 1);

	// Test with segment iterator
	typedef CastOp<OffsetT> IdentityOpT;
	IdentityOpT identity_op;
	TransformInputIterator<OffsetT, IdentityOpT, OffsetT*, OffsetT> h_segment_offsets_itr(
	h_segment_offsets,
	identity_op);
	TransformInputIterator<OffsetT, IdentityOpT, OffsetT*, OffsetT> d_segment_offsets_itr(
	d_segment_offsets,
	identity_op);

	TestByOp<SegmentedBackend, OutputT>(h_in, d_in, num_items, num_segments,
	h_segment_offsets_itr, h_segment_offsets_itr + 1, d_segment_offsets_itr, d_segment_offsets_itr + 1);

	// Test with transform iterators of different types

	typedef TransformFunctor1<OffsetT> TransformFunctor1T;
	typedef TransformFunctor2<OffsetT> TransformFunctor2T;

	TransformInputIterator<OffsetT, TransformFunctor1T, OffsetT*, OffsetT> h_segment_begin_offsets_itr(h_segment_offsets, TransformFunctor1T());
	TransformInputIterator<OffsetT, TransformFunctor2T, OffsetT*, OffsetT> h_segment_end_offsets_itr(h_segment_offsets + 1, TransformFunctor2T());

	TransformInputIterator<OffsetT, TransformFunctor1T, OffsetT*, OffsetT> d_segment_begin_offsets_itr(d_segment_offsets, TransformFunctor1T());
	TransformInputIterator<OffsetT, TransformFunctor2T, OffsetT*, OffsetT> d_segment_end_offsets_itr(d_segment_offsets + 1, TransformFunctor2T());

	TestByOp<SegmentedBackend, OutputT>(h_in, d_in, num_items, num_segments,
	h_segment_begin_offsets_itr, h_segment_end_offsets_itr,
	d_segment_begin_offsets_itr, d_segment_end_offsets_itr);
	}

	if (h_in) delete[] h_in;
	if (h_segment_offsets) delete[] h_segment_offsets;
	if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
	if (d_segment_offsets) CubDebugExit(g_allocator.DeviceFree(d_segment_offsets));
	}


	/// Test different input-generation modes
	template <
	typename InputT,
	typename OutputT,
	typename OffsetT>
	void TestByGenMode(
	OffsetT num_items,
	OffsetT max_segments)
	{
	//
	// Test pointer support using different input-generation modes
	//

	TestByBackend<InputT, OutputT>(num_items, max_segments, UNIFORM);
	TestByBackend<InputT, OutputT>(num_items, max_segments, INTEGER_SEED);
	TestByBackend<InputT, OutputT>(num_items, max_segments, RANDOM);

	//
	// Test iterator support using a constant-iterator and SUM
	//

	InputT val;
	InitValue(UNIFORM, val, 0);
	ConstantInputIterator<InputT, OffsetT> in(val);

	OffsetT *h_segment_offsets = new OffsetT[1 + 1];
	InitializeSegments(num_items, 1, h_segment_offsets, g_verbose_input);

	#if TEST_CDP == 0
	constexpr auto Backend = CUB;
	#else // TEST_CDP
	constexpr auto Backend = CDP;
	#endif // TEST_CDP

	SolveAndTest<Backend, OutputT>(in, in, num_items, 1,
	h_segment_offsets, h_segment_offsets + 1, (OffsetT) NULL, (OffsetT)NULL, Sum());

	if (h_segment_offsets) delete[] h_segment_offsets;
	}

	/// Test different problem sizes
	template <typename InputT, typename OutputT, typename OffsetT>
	void TestBySize(OffsetT max_items, OffsetT max_segments, OffsetT tile_size)
	{
	// Test 0, 1, many
	TestByGenMode<InputT, OutputT>(0, max_segments);
	TestByGenMode<InputT, OutputT>(1, max_segments);
	TestByGenMode<InputT, OutputT>(max_items, max_segments);

	// Test random problem sizes from a log-distribution [8, max_items-ish)
	int num_iterations = 8;
	double max_exp = log(double(max_items)) / log(double(2.0));
	for (int i = 0; i < num_iterations; ++i)
	{
	OffsetT num_items = (OffsetT)pow(2.0, RandomValue(max_exp - 3.0) + 3.0);
	TestByGenMode<InputT, OutputT>(num_items, max_segments);
	}

	//
	// White-box testing of single-segment problems around specific sizes
	//

	#if TEST_CDP == 0
	constexpr auto Backend = CUB;
	#else // TEST_CDP
	constexpr auto Backend = CDP;
	#endif // TEST_CDP

	// Tile-boundaries: multiple blocks, one tile per block
	TestProblem<Backend, InputT, OutputT>(tile_size * 4, 1, RANDOM, Sum());
	TestProblem<Backend, InputT, OutputT>(tile_size * 4 + 1, 1, RANDOM, Sum());
	TestProblem<Backend, InputT, OutputT>(tile_size * 4 - 1, 1, RANDOM, Sum());

	// Tile-boundaries: multiple blocks, multiple tiles per block
	OffsetT sm_occupancy = 32;
	OffsetT occupancy = tile_size * sm_occupancy * g_sm_count;
	TestProblem<Backend, InputT, OutputT>(occupancy, 1, RANDOM, Sum());
	TestProblem<Backend, InputT, OutputT>(occupancy + 1, 1, RANDOM, Sum());
	TestProblem<Backend, InputT, OutputT>(occupancy - 1, 1, RANDOM, Sum());
	};

	class CustomInputT
	{
	char m_val{};

	public:
	__host__ __device__ explicit CustomInputT(char val)
	: m_val(val)
	{}

	__host__ __device__ int get() const { return static_cast<int>(m_val); }
	};

	class CustomAccumulatorT
	{
	int m_val{0};
	int m_magic_value{42};

	__host__ __device__ CustomAccumulatorT(int val)
	: m_val(val)
	{}

	public:
	__host__ __device__ CustomAccumulatorT()
	{}

	__host__ __device__ CustomAccumulatorT(const CustomAccumulatorT &in)
	: m_val(in.is_valid() * in.get())
	, m_magic_value(in.is_valid() * 42)
	{}

	__host__ __device__ void operator=(const CustomInputT &in)
	{
	if (this->is_valid())
	{
	m_val = in.get();
	}
	}

	__host__ __device__ void operator=(const CustomAccumulatorT &in)
	{
	if (this->is_valid() && in.is_valid())
	{
	m_val = in.get();
	}
	}

	__host__ __device__ CustomAccumulatorT
	operator+(const CustomInputT &in) const
	{
	const int multiplier = this->is_valid();
	return {(m_val + in.get()) * multiplier};
	}

	__host__ __device__ CustomAccumulatorT
	operator+(const CustomAccumulatorT &in) const
	{
	const int multiplier = this->is_valid() && in.is_valid();
	return {(m_val + in.get()) * multiplier};
	}

	__host__ __device__ int get() const { return m_val; }

	__host__ __device__ bool is_valid() const { return m_magic_value == 42; }
	};

	class CustomOutputT
	{
	bool *m_d_flag{};
	int m_expected{};

	public:
	__host__ __device__ CustomOutputT(bool *d_flag, int expected)
	: m_d_flag(d_flag)
	, m_expected(expected)
	{}

	__host__ __device__ void operator=(const CustomAccumulatorT &accum) const
	{
	*m_d_flag = accum.is_valid() && (accum.get() == m_expected);
	}
	};

	__global__ void InitializeTestAccumulatorTypes(int num_items,
	int expected,
	bool *d_flag,
	CustomInputT *d_in,
	CustomOutputT *d_out)
	{
	const int idx = static_cast<int>(threadIdx.x + blockIdx.x * blockDim.x);

	if (idx < num_items)
	{
	d_in[idx] = CustomInputT(1);
	}

	if (idx == 0)
	{
	*d_out = CustomOutputT{d_flag, expected};
	}
	}

	template <typename T,
	typename OffsetT>
	void TestBigIndicesHelper(OffsetT num_items)
	{
	thrust::constant_iterator<T> const_iter(T{1});
	thrust::device_vector<std::size_t> out(1);
	std::size_t* d_out = thrust::raw_pointer_cast(out.data());

	std::uint8_t *d_temp_storage{};
	std::size_t temp_storage_bytes{};

	CubDebugExit(
	cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, const_iter, d_out, num_items));

	thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
	d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

	CubDebugExit(
	cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, const_iter, d_out, num_items));
	std::size_t result = out[0];

	AssertEquals(result, num_items);
	}

	template <typename T>
	void TestBigIndices()
	{
	TestBigIndicesHelper<T, std::uint32_t>(1ull << 30);
	TestBigIndicesHelper<T, std::uint32_t>(1ull << 31);
	TestBigIndicesHelper<T, std::uint32_t>((1ull << 32) - 1);
	TestBigIndicesHelper<T, std::uint64_t>(1ull << 33);
	}

	#if TEST_TYPES == 3
	void TestAccumulatorTypes()
	{
	const int num_items = 2 * 1024 * 1024;
	const int expected = num_items;
	const int block_size = 256;
	const int grid_size = (num_items + block_size - 1) / block_size;

	CustomInputT *d_in{};
	CustomOutputT *d_out{};
	CustomAccumulatorT init{};
	bool *d_flag{};

	CubDebugExit(
	g_allocator.DeviceAllocate((void **)&d_out, sizeof(CustomOutputT)));
	CubDebugExit(g_allocator.DeviceAllocate((void **)&d_flag, sizeof(bool)));
	CubDebugExit(g_allocator.DeviceAllocate((void **)&d_in,
	sizeof(CustomInputT) * num_items));

	InitializeTestAccumulatorTypes<<<grid_size, block_size>>>(num_items,
	expected,
	d_flag,
	d_in,
	d_out);

	std::uint8_t *d_temp_storage{};
	std::size_t temp_storage_bytes{};

	CubDebugExit(cub::DeviceReduce::Reduce(d_temp_storage,
	temp_storage_bytes,
	d_in,
	d_out,
	num_items,
	cub::Sum{},
	init));

	CubDebugExit(
	g_allocator.DeviceAllocate((void **)&d_temp_storage, temp_storage_bytes));
	CubDebugExit(cudaMemset(d_temp_storage, 1, temp_storage_bytes));

	CubDebugExit(cub::DeviceReduce::Reduce(d_temp_storage,
	temp_storage_bytes,
	d_in,
	d_out,
	num_items,
	cub::Sum{},
	init));

	bool ok{};
	CubDebugExit(cudaMemcpy(&ok, d_flag, sizeof(bool), cudaMemcpyDeviceToHost));

	AssertTrue(ok);

	CubDebugExit(g_allocator.DeviceFree(d_out));
	CubDebugExit(g_allocator.DeviceFree(d_in));
	}

	/**
	* ArgMin should return max value for empty input. This interferes with
	* input data containing infinity values. This test checks that ArgMin
	* works correctly with infinity values.
	*/
	void TestFloatInfInArgMin()
	{
	using in_t = float;
	using offset_t = int;
	using out_t = cub::KeyValuePair<offset_t, in_t>;

	const int n = 10;
	const float inf = ::cuda::std::numeric_limits<float>::infinity();

	thrust::device_vector<in_t> in(n, inf);
	thrust::device_vector<out_t> out(1);

	const in_t *d_in = thrust::raw_pointer_cast(in.data());
	out_t *d_out = thrust::raw_pointer_cast(out.data());

	std::uint8_t *d_temp_storage{};
	std::size_t temp_storage_bytes{};

	CubDebugExit(cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, n));
	thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
	d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
	CubDebugExit(cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, n));

	const out_t result = out[0];
	AssertEquals(result.key, 0);
	AssertEquals(result.value, inf);
	}

	/**
	* ArgMax should return lowest value for empty input. This interferes with
	* input data containing infinity values. This test checks that ArgMax
	* works correctly with infinity values.
	*/
	void TestFloatInfInArgMax()
	{
	using in_t = float;
	using offset_t = int;
	using out_t = cub::KeyValuePair<offset_t, in_t>;

	const int n = 10;
	const float inf = ::cuda::std::numeric_limits<float>::infinity();

	thrust::device_vector<in_t> in(n, -inf);
	thrust::device_vector<out_t> out(1);

	const in_t *d_in = thrust::raw_pointer_cast(in.data());
	out_t *d_out = thrust::raw_pointer_cast(out.data());

	std::uint8_t *d_temp_storage{};
	std::size_t temp_storage_bytes{};

	CubDebugExit(cub::DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, n));
	thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
	d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
	CubDebugExit(cub::DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, n));

	const out_t result = out[0];
	AssertEquals(result.key, 0);
	AssertEquals(result.value, -inf);
	}

	void TestFloatInfInArg()
	{
	TestFloatInfInArgMin();
	TestFloatInfInArgMax();
	}
	#endif

	template <typename InputT, typename OutputT, typename OffsetT>
	struct GetTileSize
	{
	OffsetT max_items{};
	OffsetT max_segments{};
	OffsetT tile_size{};

	GetTileSize(OffsetT max_items, OffsetT max_segments)
	: max_items(max_items)
	, max_segments(max_segments)
	{}

	template <typename ActivePolicyT>
	CUB_RUNTIME_FUNCTION cudaError_t Invoke()
	{
	this->tile_size = ActivePolicyT::ReducePolicy::BLOCK_THREADS *
	ActivePolicyT::ReducePolicy::ITEMS_PER_THREAD;
	return cudaSuccess;
	}
	};

	/// Test problem type
	template <typename InputT, typename OutputT, typename OffsetT>
	void TestType(OffsetT max_items, OffsetT max_segments)
	{
	// Inspect the tuning policies to determine this arch's tile size:
	using MaxPolicyT =
	typename DeviceReducePolicy<InputT, OffsetT, cub::Sum>::MaxPolicy;
	GetTileSize<InputT, OutputT, OffsetT> dispatch(max_items, max_segments);
	CubDebugExit(MaxPolicyT::Invoke(g_ptx_version, dispatch));

	TestBySize<InputT, OutputT>(max_items, max_segments, dispatch.tile_size);
	}

	//---------------------------------------------------------------------
	// Main
	//---------------------------------------------------------------------


	/**
	* Main
	*/
	int main(int argc, char** argv)
	{
	typedef int OffsetT;

	OffsetT max_items = 27000000;
	OffsetT max_segments = 34000;

	// Initialize command line
	CommandLineArgs args(argc, argv);
	g_verbose = args.CheckCmdLineFlag("v");
	g_verbose_input = args.CheckCmdLineFlag("v2");
	args.GetCmdLineArgument("n", max_items);
	args.GetCmdLineArgument("s", max_segments);
	args.GetCmdLineArgument("i", g_timing_iterations);

	// Print usage
	if (args.CheckCmdLineFlag("help"))
	{
	printf("%s "
	"[--n=<input items> "
	"[--s=<num segments> "
	"[--i=<timing iterations> "
	"[--device=<device-id>] "
	"[--v] "
	"\n", argv[0]);
	exit(0);
	}

	// Initialize device
	CubDebugExit(args.DeviceInit());
	g_device_giga_bandwidth = args.device_giga_bandwidth;

	// Get ptx version
	CubDebugExit(PtxVersion(g_ptx_version));

	// Get SM count
	g_sm_count = args.deviceProp.multiProcessorCount;

	// %PARAM% TEST_CDP cdp 0:1
	// %PARAM% TEST_TYPES types 0:1:2:3:4

	#if TEST_TYPES == 0
	TestType<signed char, signed char>(max_items, max_segments);
	TestType<unsigned char, unsigned char>(max_items, max_segments);
	TestType<signed char, int>(max_items, max_segments);
	#elif TEST_TYPES == 1
	TestType<short, short>(max_items, max_segments);
	TestType<int, int>(max_items, max_segments);
	TestType<long, long>(max_items, max_segments);
	TestType<long long, long long>(max_items, max_segments);
	#elif TEST_TYPES == 2
	TestType<uchar2, uchar2>(max_items, max_segments);
	TestType<uint2, uint2>(max_items, max_segments);
	TestType<ulonglong2, ulonglong2>(max_items, max_segments);
	TestType<ulonglong4, ulonglong4>(max_items, max_segments);
	#elif TEST_TYPES == 3
	TestType<TestFoo, TestFoo>(max_items, max_segments);
	TestAccumulatorTypes();
	TestFloatInfInArg();

	#if TEST_HALF_T
	TestType<half_t, half_t>(max_items, max_segments);
	#endif
	#else // TEST_TYPES == 4
	TestType<TestBar, TestBar>(max_items, max_segments);
	TestBigIndices<std::size_t>();

	#if TEST_BF_T
	TestType<bfloat16_t, bfloat16_t>(max_items, max_segments);
	#endif
	#endif
	printf("\n");
	return 0;
	}