thrust / dependencies /cub /test /test_device_radix_sort.cu

thanks to nvidia ❤

8ae5fc5 over 2 years ago

83.3 kB

	/******************************************************************************
	* Copyright (c) 2011, Duane Merrill. All rights reserved.
	* Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions are met:
	* * Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* * Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	* * Neither the name of the NVIDIA CORPORATION nor the
	* names of its contributors may be used to endorse or promote products
	* derived from this software without specific prior written permission.
	*
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
	* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
	* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
	* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
	* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
	* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*
	******************************************************************************/

	/******************************************************************************
	* Test of DeviceRadixSort utilities
	******************************************************************************/

	// Ensure printing of CUDA runtime errors to console
	#define CUB_STDERR

	#include <algorithm>
	#include <climits>
	#include <cstdio>
	#include <limits>
	#include <memory>
	#include <random>
	#include <type_traits>
	#include <typeinfo>
	#include <vector>

	#if !_NVHPC_CUDA
	#include <cuda_fp16.h>
	#endif

	#if !_NVHPC_CUDA
	#include <cuda_bf16.h>
	#endif

	#include <cub/detail/device_synchronize.cuh>
	#include <cub/device/device_radix_sort.cuh>
	#include <cub/device/device_segmented_radix_sort.cuh>
	#include <cub/iterator/transform_input_iterator.cuh>
	#include <cub/util_allocator.cuh>
	#include <cub/util_math.cuh>

	#include <thrust/device_vector.h>
	#include <thrust/execution_policy.h>
	#include <thrust/host_vector.h>
	#include <thrust/mismatch.h>
	#include <thrust/sequence.h>
	#include <thrust/sort.h>
	#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

	#include "test_util.h"

	using namespace cub;


	//---------------------------------------------------------------------
	// Globals, constants and typedefs
	//---------------------------------------------------------------------

	bool g_verbose = false;
	int g_timing_iterations = 0;
	std::size_t g_smallest_pre_sorted_num_items = (std::size_t(1) << 32) - 42;
	CachingDeviceAllocator g_allocator(true);

	// Dispatch types
	enum Backend
	{
	CUB, // CUB method (allows overwriting of input)
	CUB_NO_OVERWRITE, // CUB method (disallows overwriting of input)

	CUB_SEGMENTED, // CUB method (allows overwriting of input)
	CUB_SEGMENTED_NO_OVERWRITE, // CUB method (disallows overwriting of input)

	// Same as above, but launches kernels from device using CDP.
	CDP,
	CDP_NO_OVERWRITE,
	CDP_SEGMENTED,
	CDP_SEGMENTED_NO_OVERWRITE,
	};

	static const char* BackendToString(Backend b)
	{
	switch (b)
	{
	case CUB:
	return "CUB";
	case CUB_NO_OVERWRITE:
	return "CUB_NO_OVERWRITE";
	case CUB_SEGMENTED:
	return "CUB_SEGMENTED";
	case CUB_SEGMENTED_NO_OVERWRITE:
	return "CUB_SEGMENTED_NO_OVERWRITE";
	case CDP:
	return "CDP";
	case CDP_NO_OVERWRITE:
	return "CDP_NO_OVERWRITE";
	case CDP_SEGMENTED:
	return "CDP_SEGMENTED";
	case CDP_SEGMENTED_NO_OVERWRITE:
	return "CDP_SEGMENTED_NO_OVERWRITE";
	default:
	break;
	}

	return "";
	}

	//---------------------------------------------------------------------
	// Dispatch to different DeviceRadixSort entrypoints
	//---------------------------------------------------------------------

	/**
	* Dispatch to CUB sorting entrypoint (specialized for ascending)
	*/
	template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT,
	typename NumItemsT>
	CUB_RUNTIME_FUNCTION
	cudaError_t Dispatch(
	Int2Type<false> /is_descending/,
	Int2Type<CUB> /dispatch_to/,
	int /d_selector*/,
	size_t /d_temp_storage_bytes*/,
	cudaError_t /d_cdp_error*/,

	void* d_temp_storage,
	size_t& temp_storage_bytes,
	DoubleBuffer<KeyT> &d_keys,
	DoubleBuffer<ValueT> &d_values,
	NumItemsT num_items,
	int /num_segments/,
	BeginOffsetIteratorT /d_segment_begin_offsets/,
	EndOffsetIteratorT /d_segment_end_offsets/,
	int begin_bit,
	int end_bit)
	{
	return DeviceRadixSort::SortPairs(
	d_temp_storage, temp_storage_bytes,
	d_keys, d_values,
	num_items, begin_bit, end_bit);
	}

	/**
	* Dispatch to CUB_NO_OVERWRITE sorting entrypoint (specialized for ascending)
	*/
	template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT,
	typename NumItemsT>
	CUB_RUNTIME_FUNCTION
	cudaError_t Dispatch(
	Int2Type<false> /is_descending/,
	Int2Type<CUB_NO_OVERWRITE> /dispatch_to/,
	int /d_selector*/,
	size_t /d_temp_storage_bytes*/,
	cudaError_t /d_cdp_error*/,

	void* d_temp_storage,
	size_t& temp_storage_bytes,
	DoubleBuffer<KeyT> &d_keys,
	DoubleBuffer<ValueT> &d_values,
	NumItemsT num_items,
	int /num_segments/,
	BeginOffsetIteratorT /d_segment_begin_offsets/,
	EndOffsetIteratorT /d_segment_end_offsets/,
	int begin_bit,
	int end_bit)
	{
	KeyT const *const_keys_itr = d_keys.Current();
	ValueT const *const_values_itr = d_values.Current();

	cudaError_t retval = DeviceRadixSort::SortPairs(
	d_temp_storage, temp_storage_bytes,
	const_keys_itr, d_keys.Alternate(), const_values_itr, d_values.Alternate(),
	num_items, begin_bit, end_bit);

	d_keys.selector ^= 1;
	d_values.selector ^= 1;
	return retval;
	}

	/**
	* Dispatch to CUB sorting entrypoint (specialized for descending)
	*/
	template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT,
	typename NumItemsT>
	CUB_RUNTIME_FUNCTION
	cudaError_t Dispatch(
	Int2Type<true> /is_descending/,
	Int2Type<CUB> /dispatch_to/,
	int /d_selector*/,
	size_t /d_temp_storage_bytes*/,
	cudaError_t /d_cdp_error*/,

	void* d_temp_storage,
	size_t& temp_storage_bytes,
	DoubleBuffer<KeyT> &d_keys,
	DoubleBuffer<ValueT> &d_values,
	NumItemsT num_items,
	int /num_segments/,
	BeginOffsetIteratorT /d_segment_begin_offsets/,
	EndOffsetIteratorT /d_segment_end_offsets/,
	int begin_bit,
	int end_bit)
	{
	return DeviceRadixSort::SortPairsDescending(
	d_temp_storage, temp_storage_bytes,
	d_keys, d_values, num_items,
	begin_bit, end_bit);
	}


	/**
	* Dispatch to CUB_NO_OVERWRITE sorting entrypoint (specialized for descending)
	*/
	template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT,
	typename NumItemsT>
	CUB_RUNTIME_FUNCTION
	cudaError_t Dispatch(
	Int2Type<true> /is_descending/,
	Int2Type<CUB_NO_OVERWRITE> /dispatch_to/,
	int /d_selector*/,
	size_t /d_temp_storage_bytes*/,
	cudaError_t /d_cdp_error*/,

	void* d_temp_storage,
	size_t& temp_storage_bytes,
	DoubleBuffer<KeyT> &d_keys,
	DoubleBuffer<ValueT> &d_values,
	NumItemsT num_items,
	int /num_segments/,
	BeginOffsetIteratorT /d_segment_begin_offsets/,
	EndOffsetIteratorT /d_segment_end_offsets/,
	int begin_bit,
	int end_bit)
	{
	KeyT const *const_keys_itr = d_keys.Current();
	ValueT const *const_values_itr = d_values.Current();

	cudaError_t retval = DeviceRadixSort::SortPairsDescending(
	d_temp_storage, temp_storage_bytes,
	const_keys_itr, d_keys.Alternate(), const_values_itr, d_values.Alternate(),
	num_items, begin_bit, end_bit);

	d_keys.selector ^= 1;
	d_values.selector ^= 1;
	return retval;
	}

	//---------------------------------------------------------------------
	// Dispatch to different DeviceRadixSort entrypoints
	//---------------------------------------------------------------------

	// Validates that `num_items` fits into `int`
	// TODO(canonizer): remove this check once num_items is templated for segmented sort.
	template <typename NumItemsT>
	__host__ __device__ bool ValidateNumItemsForSegmentedSort(NumItemsT num_items)
	{
	if (static_cast<long long int>(num_items) <
	static_cast<long long int>(INT_MAX))
	{
	return true;
	}
	else
	{
	printf("cub::DeviceSegmentedRadixSort is currently limited by %d items but "
	"%lld were provided\n",
	INT_MAX,
	static_cast<long long int>(num_items));
	}

	return false;
	}

	/**
	* Dispatch to CUB_SEGMENTED sorting entrypoint (specialized for ascending)
	*/
	template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT,
	typename NumItemsT>
	CUB_RUNTIME_FUNCTION
	cudaError_t Dispatch(
	Int2Type<false> /is_descending/,
	Int2Type<CUB_SEGMENTED> /dispatch_to/,
	int /d_selector*/,
	size_t /d_temp_storage_bytes*/,
	cudaError_t /d_cdp_error*/,

	void* d_temp_storage,
	size_t& temp_storage_bytes,
	DoubleBuffer<KeyT> &d_keys,
	DoubleBuffer<ValueT> &d_values,
	NumItemsT num_items,
	int num_segments,
	BeginOffsetIteratorT d_segment_begin_offsets,
	EndOffsetIteratorT d_segment_end_offsets,
	int begin_bit,
	int end_bit)
	{
	if (ValidateNumItemsForSegmentedSort(num_items))
	{
	return DeviceSegmentedRadixSort::SortPairs(d_temp_storage,
	temp_storage_bytes,
	d_keys,
	d_values,
	static_cast<int>(num_items),
	num_segments,
	d_segment_begin_offsets,
	d_segment_end_offsets,
	begin_bit,
	end_bit);
	}

	return cudaErrorInvalidValue;
	}

	/**
	* Dispatch to CUB_SEGMENTED_NO_OVERWRITE sorting entrypoint (specialized for ascending)
	*/
	template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT,
	typename NumItemsT>
	CUB_RUNTIME_FUNCTION
	cudaError_t Dispatch(
	Int2Type<false> /is_descending/,
	Int2Type<CUB_SEGMENTED_NO_OVERWRITE> /dispatch_to/,
	int /d_selector*/,
	size_t /d_temp_storage_bytes*/,
	cudaError_t /d_cdp_error*/,

	void* d_temp_storage,
	size_t& temp_storage_bytes,
	DoubleBuffer<KeyT> &d_keys,
	DoubleBuffer<ValueT> &d_values,
	NumItemsT num_items,
	int num_segments,
	BeginOffsetIteratorT d_segment_begin_offsets,
	EndOffsetIteratorT d_segment_end_offsets,
	int begin_bit,
	int end_bit)
	{
	if (ValidateNumItemsForSegmentedSort(num_items))
	{
	KeyT const *const_keys_itr = d_keys.Current();
	ValueT const *const_values_itr = d_values.Current();

	cudaError_t retval =
	DeviceSegmentedRadixSort::SortPairs(d_temp_storage,
	temp_storage_bytes,
	const_keys_itr,
	d_keys.Alternate(),
	const_values_itr,
	d_values.Alternate(),
	static_cast<int>(num_items),
	num_segments,
	d_segment_begin_offsets,
	d_segment_end_offsets,
	begin_bit,
	end_bit);

	d_keys.selector ^= 1;
	d_values.selector ^= 1;
	return retval;
	}

	return cudaErrorInvalidValue;
	}


	/**
	* Dispatch to CUB_SEGMENTED sorting entrypoint (specialized for descending)
	*/
	template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT,
	typename NumItemsT>
	CUB_RUNTIME_FUNCTION
	cudaError_t Dispatch(
	Int2Type<true> /is_descending/,
	Int2Type<CUB_SEGMENTED> /dispatch_to/,
	int /d_selector*/,
	size_t /d_temp_storage_bytes*/,
	cudaError_t /d_cdp_error*/,

	void* d_temp_storage,
	size_t& temp_storage_bytes,
	DoubleBuffer<KeyT> &d_keys,
	DoubleBuffer<ValueT> &d_values,
	NumItemsT num_items,
	int num_segments,
	BeginOffsetIteratorT d_segment_begin_offsets,
	EndOffsetIteratorT d_segment_end_offsets,
	int begin_bit,
	int end_bit)
	{
	if (ValidateNumItemsForSegmentedSort(num_items))
	{
	return DeviceSegmentedRadixSort::SortPairsDescending(
	d_temp_storage,
	temp_storage_bytes,
	d_keys,
	d_values,
	static_cast<int>(num_items),
	num_segments,
	d_segment_begin_offsets,
	d_segment_end_offsets,
	begin_bit,
	end_bit);
	}

	return cudaErrorInvalidValue;
	}

	/**
	* Dispatch to CUB_SEGMENTED_NO_OVERWRITE sorting entrypoint (specialized for descending)
	*/
	template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT,
	typename NumItemsT>
	CUB_RUNTIME_FUNCTION
	cudaError_t Dispatch(
	Int2Type<true> /is_descending/,
	Int2Type<CUB_SEGMENTED_NO_OVERWRITE> /dispatch_to/,
	int /d_selector*/,
	size_t /d_temp_storage_bytes*/,
	cudaError_t /d_cdp_error*/,

	void* d_temp_storage,
	size_t& temp_storage_bytes,
	DoubleBuffer<KeyT> &d_keys,
	DoubleBuffer<ValueT> &d_values,
	NumItemsT num_items,
	int num_segments,
	BeginOffsetIteratorT d_segment_begin_offsets,
	EndOffsetIteratorT d_segment_end_offsets,
	int begin_bit,
	int end_bit)
	{
	if (ValidateNumItemsForSegmentedSort(num_items))
	{
	KeyT const *const_keys_itr = d_keys.Current();
	ValueT const *const_values_itr = d_values.Current();

	cudaError_t retval =
	DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage,
	temp_storage_bytes,
	const_keys_itr,
	d_keys.Alternate(),
	const_values_itr,
	d_values.Alternate(),
	static_cast<int>(num_items),
	num_segments,
	d_segment_begin_offsets,
	d_segment_end_offsets,
	begin_bit,
	end_bit);

	d_keys.selector ^= 1;
	d_values.selector ^= 1;
	return retval;
	}

	return cudaErrorInvalidValue;
	}

	//---------------------------------------------------------------------
	// CUDA Nested Parallelism Test Kernel
	//---------------------------------------------------------------------

	#if TEST_CDP == 1

	/**
	* Simple wrapper kernel to invoke DeviceRadixSort
	*/
	template <int IsDescending,
	int CubBackend,
	typename KeyT,
	typename ValueT,
	typename BeginOffsetIteratorT,
	typename EndOffsetIteratorT,
	typename NumItemsT>
	__global__ void CDPDispatchKernel(Int2Type<IsDescending> is_descending,
	Int2Type<CubBackend> cub_backend,
	int *d_selector,
	size_t *d_temp_storage_bytes,
	cudaError_t *d_cdp_error,
	void *d_temp_storage,
	size_t temp_storage_bytes,
	DoubleBuffer<KeyT> d_keys,
	DoubleBuffer<ValueT> d_values,
	NumItemsT num_items,
	int num_segments,
	BeginOffsetIteratorT d_segment_begin_offsets,
	EndOffsetIteratorT d_segment_end_offsets,
	int begin_bit,
	int end_bit)
	{
	*d_cdp_error = Dispatch(is_descending,
	cub_backend,
	d_selector,
	d_temp_storage_bytes,
	d_cdp_error,
	d_temp_storage,
	temp_storage_bytes,
	d_keys,
	d_values,
	num_items,
	num_segments,
	d_segment_begin_offsets,
	d_segment_end_offsets,
	begin_bit,
	end_bit);

	*d_temp_storage_bytes = temp_storage_bytes;
	*d_selector = d_keys.selector;
	}

	/**
	* Launch kernel and dispatch on device. Should only be called from host code.
	* The CubBackend should be one of the non-CDP CUB backends to invoke from the
	* device.
	*/
	template <int IsDescending,
	int CubBackend,
	typename KeyT,
	typename ValueT,
	typename BeginOffsetIteratorT,
	typename EndOffsetIteratorT,
	typename NumItemsT>
	cudaError_t LaunchCDPKernel(Int2Type<IsDescending> is_descending,
	Int2Type<CubBackend> cub_backend,
	int *d_selector,
	size_t *d_temp_storage_bytes,
	cudaError_t *d_cdp_error,

	void *d_temp_storage,
	size_t &temp_storage_bytes,
	DoubleBuffer<KeyT> &d_keys,
	DoubleBuffer<ValueT> &d_values,
	NumItemsT num_items,
	int num_segments,
	BeginOffsetIteratorT d_segment_begin_offsets,
	EndOffsetIteratorT d_segment_end_offsets,
	int begin_bit,
	int end_bit)
	{
	// Invoke kernel to invoke device-side dispatch:
	cudaError_t retval =
	thrust::cuda_cub::launcher::triple_chevron(1, 1, 0, 0)
	.doit(CDPDispatchKernel<IsDescending,
	CubBackend,
	KeyT,
	ValueT,
	BeginOffsetIteratorT,
	EndOffsetIteratorT,
	NumItemsT>,
	is_descending,
	cub_backend,
	d_selector,
	d_temp_storage_bytes,
	d_cdp_error,
	d_temp_storage,
	temp_storage_bytes,
	d_keys,
	d_values,
	num_items,
	num_segments,
	d_segment_begin_offsets,
	d_segment_end_offsets,
	begin_bit,
	end_bit);
	CubDebugExit(retval);
	CubDebugExit(cub::detail::device_synchronize());

	// Copy out selector
	CubDebugExit(cudaMemcpy(&d_keys.selector,
	d_selector,
	sizeof(int) * 1,
	cudaMemcpyDeviceToHost));
	d_values.selector = d_keys.selector;

	// Copy out temp_storage_bytes
	CubDebugExit(cudaMemcpy(&temp_storage_bytes,
	d_temp_storage_bytes,
	sizeof(size_t) * 1,
	cudaMemcpyDeviceToHost));

	// Copy out error
	CubDebugExit(cudaMemcpy(&retval,
	d_cdp_error,
	sizeof(cudaError_t) * 1,
	cudaMemcpyDeviceToHost));

	return retval;
	}

	// Specializations of Dispatch that translate the CDP backend to the appropriate
	// CUB backend, and uses the CUB backend to launch the CDP kernel.
	#define DEFINE_CDP_DISPATCHER(CdpBackend, CubBackend) \
	template <int IsDescending, \
	typename KeyT, \
	typename ValueT, \
	typename BeginOffsetIteratorT, \
	typename EndOffsetIteratorT, \
	typename NumItemsT> \
	cudaError_t Dispatch(Int2Type<IsDescending> is_descending, \
	Int2Type<CdpBackend> /dispatch_to/, \
	int *d_selector, \
	size_t *d_temp_storage_bytes, \
	cudaError_t *d_cdp_error, \
	\
	void *d_temp_storage, \
	size_t &temp_storage_bytes, \
	DoubleBuffer<KeyT> &d_keys, \
	DoubleBuffer<ValueT> &d_values, \
	NumItemsT num_items, \
	int num_segments, \
	BeginOffsetIteratorT d_segment_begin_offsets, \
	EndOffsetIteratorT d_segment_end_offsets, \
	int begin_bit, \
	int end_bit) \
	{ \
	Int2Type<CubBackend> cub_backend{}; \
	return LaunchCDPKernel(is_descending, \
	cub_backend, \
	d_selector, \
	d_temp_storage_bytes, \
	d_cdp_error, \
	d_temp_storage, \
	temp_storage_bytes, \
	d_keys, \
	d_values, \
	num_items, \
	num_segments, \
	d_segment_begin_offsets, \
	d_segment_end_offsets, \
	begin_bit, \
	end_bit); \
	}

	DEFINE_CDP_DISPATCHER(CDP, CUB)
	DEFINE_CDP_DISPATCHER(CDP_NO_OVERWRITE, CUB_NO_OVERWRITE)
	DEFINE_CDP_DISPATCHER(CDP_SEGMENTED, CUB_SEGMENTED)
	DEFINE_CDP_DISPATCHER(CDP_SEGMENTED_NO_OVERWRITE, CUB_SEGMENTED_NO_OVERWRITE)

	#undef DEFINE_CDP_DISPATCHER

	#endif // TEST_CDP

	//---------------------------------------------------------------------
	// Problem generation
	//---------------------------------------------------------------------


	/**
	* Simple key-value pairing
	*/
	template <
	typename KeyT,
	typename ValueT>
	struct Pair
	{
	KeyT key;
	ValueT value;

	bool operator<(const Pair &b) const
	{
	return (key < b.key);
	}
	};


	/**
	* Simple key-value pairing (specialized for bool types)
	*/
	template <typename ValueT>
	struct Pair<bool, ValueT>
	{
	bool key;
	ValueT value;

	bool operator<(const Pair &b) const
	{
	return (!key && b.key);
	}
	};


	/**
	* Initialize key data
	*/
	template <typename KeyT, typename NumItemsT>
	void InitializeKeyBits(
	GenMode gen_mode,
	KeyT *h_keys,
	NumItemsT num_items,
	int /entropy_reduction/)
	{
	for (NumItemsT i = 0; i < num_items; ++i)
	InitValue(gen_mode, h_keys[i], i);
	}

	template <typename KeyT,
	typename UnsignedBits = typename cub::Traits<KeyT>::UnsignedBits>
	UnsignedBits KeyBits(KeyT key)
	{
	UnsignedBits bits;
	memcpy(&bits, &key, sizeof(KeyT));
	return bits;
	}

	/** Initialize the reference array monotonically. */
	template <typename KeyT, typename NumItemsT>
	void InitializeKeysSorted(
	KeyT *h_keys,
	NumItemsT num_items)
	{
	using TraitsT = cub::Traits<KeyT>;
	using UnsignedBits = typename TraitsT::UnsignedBits;

	// Numbers to generate random runs.
	UnsignedBits max_inc = 1 << (sizeof(UnsignedBits) < 4 ? 3 :
	(sizeof(UnsignedBits) < 8 ? 14 : 24));
	UnsignedBits min_bits = TraitsT::TwiddleIn(KeyBits(TraitsT::Lowest()));
	UnsignedBits max_bits = TraitsT::TwiddleIn(KeyBits(TraitsT::Max()));
	NumItemsT max_run = std::max(
	NumItemsT(double(num_items) * (max_inc + 1) / max_bits),
	NumItemsT(1 << 14));

	UnsignedBits h_key_bits = reinterpret_cast<UnsignedBits>(h_keys);
	NumItemsT i = 0;
	// Start with the minimum twiddled key.
	UnsignedBits twiddled_key = min_bits;
	while (i < num_items)
	{
	// Generate random increment (avoid overflow).
	UnsignedBits inc_bits = 0;
	RandomBits(inc_bits);
	// twiddled_key < max_bits at this point.
	UnsignedBits inc = static_cast<UnsignedBits>(std::min(1 + inc_bits % max_inc, max_bits - twiddled_key));
	twiddled_key += inc;

	// Generate random run length (ensure there are enough values to fill the rest).
	NumItemsT run_bits = 0;
	RandomBits(run_bits);
	NumItemsT run_length = std::min(1 + run_bits % max_run, num_items - i);
	if (twiddled_key == max_bits) run_length = num_items - i;
	NumItemsT run_end = i + run_length;

	// Fill the array.
	UnsignedBits key = TraitsT::TwiddleOut(twiddled_key);
	// Avoid -0.0 for floating-point keys.
	UnsignedBits negative_zero = UnsignedBits(1) << UnsignedBits(sizeof(UnsignedBits) * 8 - 1);
	if (TraitsT::CATEGORY == cub::FLOATING_POINT && key == negative_zero)
	{
	key = 0;
	}

	for (; i < run_end; ++i)
	{
	h_key_bits[i] = key;
	}
	}
	}


	/**
	* Initialize solution
	*/
	template <bool IS_DESCENDING, bool WANT_RANKS, typename KeyT, typename NumItemsT>
	void InitializeSolution(
	KeyT *h_keys,
	NumItemsT num_items,
	int num_segments,
	bool pre_sorted,
	NumItemsT *h_segment_offsets,
	int begin_bit,
	int end_bit,
	NumItemsT *&h_reference_ranks,
	KeyT *&h_reference_keys)
	{
	if (num_items == 0)
	{
	h_reference_ranks = nullptr;
	h_reference_keys = nullptr;
	return;
	}

	if (pre_sorted)
	{
	printf("Shuffling reference solution on CPU\n");
	// Note: begin_bit and end_bit are ignored here, and assumed to have the
	// default values (begin_bit == 0, end_bit == 8 * sizeof(KeyT)).
	// Otherwise, pre-sorting won't work, as it doesn't necessarily
	// correspond to the order of keys sorted by a subrange of bits.
	// num_segments is also ignored as assumed to be 1, as pre-sorted tests
	// are currently not supported for multiple segments.
	//
	// Pre-sorted tests with non-default begin_bit, end_bit or num_segments
	// != 1 are skipped in TestBits() and TestSegments(), respectively.
	AssertEquals(begin_bit, 0);
	AssertEquals(end_bit, static_cast<int>(8 * sizeof(KeyT)));
	AssertEquals(num_segments, 1);

	// Copy to the reference solution.
	h_reference_keys = new KeyT[num_items];
	if (IS_DESCENDING)
	{
	// Copy in reverse.
	for (NumItemsT i = 0; i < num_items; ++i)
	{
	h_reference_keys[i] = h_keys[num_items - 1 - i];
	}
	// Copy back.
	memcpy(h_keys, h_reference_keys, num_items * sizeof(KeyT));
	}
	else
	{
	memcpy(h_reference_keys, h_keys, num_items * sizeof(KeyT));
	}

	// Summarize the pre-sorted array (element, 1st position, count).
	struct Element
	{
	KeyT key;
	NumItemsT num;
	NumItemsT index;
	};

	std::vector<Element> summary;
	KeyT cur_key = h_reference_keys[0];
	summary.push_back(Element{cur_key, 1, 0});
	for (NumItemsT i = 1; i < num_items; ++i)
	{
	KeyT key = h_reference_keys[i];
	if (key == cur_key)
	{
	// Same key.
	summary.back().num++;
	continue;
	}

	// Different key.
	cur_key = key;
	summary.push_back(Element{cur_key, 1, i});
	}

	// Generate a random permutation from the summary. Such a complicated
	// approach is used to permute the array and compute ranks in a
	// cache-friendly way and in a short time.
	if (WANT_RANKS)
	{
	h_reference_ranks = new NumItemsT[num_items];
	}
	NumItemsT max_run = 32;
	NumItemsT run = 0;
	NumItemsT i = 0;
	while (summary.size() > 0)
	{
	// Pick up a random element and a run.
	NumItemsT bits = 0;
	RandomBits(bits);
	NumItemsT summary_id = bits % summary.size();
	Element& element = summary[summary_id];
	run = std::min(1 + bits % (max_run - 1), element.num);
	for (NumItemsT j = 0; j < run; ++j)
	{
	h_keys[i + j] = element.key;
	if (WANT_RANKS)
	{
	h_reference_ranks[element.index + j] = i + j;
	}
	}
	i += run;
	element.index += run;
	element.num -= run;
	if (element.num == 0)
	{
	// Remove the empty entry.
	std::swap(summary[summary_id], summary.back());
	summary.pop_back();
	}
	}
	printf(" Done.\n");
	}
	else
	{
	typedef Pair<KeyT, NumItemsT> PairT;

	PairT *h_pairs = new PairT[num_items];

	int num_bits = end_bit - begin_bit;
	for (NumItemsT i = 0; i < num_items; ++i)
	{

	// Mask off unwanted portions
	if (num_bits < static_cast<int>(sizeof(KeyT) * 8))
	{
	using UnsignedBits = typename cub::Traits<KeyT>::UnsignedBits;

	UnsignedBits base = 0;
	memcpy(&base, &h_keys[i], sizeof(KeyT));
	base &= ((UnsignedBits{1} << num_bits) - 1) << begin_bit;
	memcpy(&h_pairs[i].key, &base, sizeof(KeyT));
	}
	else
	{
	h_pairs[i].key = h_keys[i];
	}

	h_pairs[i].value = i;
	}

	printf("\nSorting reference solution on CPU "
	"(%zd items, %d segments, %zd items/seg)...",
	static_cast<std::size_t>(num_items),
	num_segments,
	static_cast<std::size_t>(num_items / num_segments));
	fflush(stdout);

	for (int i = 0; i < num_segments; ++i)
	{
	if (IS_DESCENDING) std::reverse(h_pairs + h_segment_offsets[i], h_pairs + h_segment_offsets[i + 1]);
	std::stable_sort( h_pairs + h_segment_offsets[i], h_pairs + h_segment_offsets[i + 1]);
	if (IS_DESCENDING) std::reverse(h_pairs + h_segment_offsets[i], h_pairs + h_segment_offsets[i + 1]);
	}

	printf(" Done.\n"); fflush(stdout);

	if (WANT_RANKS)
	{
	h_reference_ranks = new NumItemsT[num_items];
	}
	h_reference_keys = new KeyT[num_items];

	for (NumItemsT i = 0; i < num_items; ++i)
	{
	if (WANT_RANKS)
	{
	h_reference_ranks[i] = h_pairs[i].value;
	}
	h_reference_keys[i] = h_keys[h_pairs[i].value];
	}

	if (h_pairs) delete[] h_pairs;
	}
	}

	template <bool IS_DESCENDING, typename KeyT, typename NumItemsT>
	void ResetKeys(KeyT h_keys, NumItemsT num_items, bool pre_sorted, KeyT reference_keys)
	{
	if (!pre_sorted) return;

	// Copy the reference keys back.
	if (IS_DESCENDING)
	{
	// Keys need to be copied in reverse.
	for (NumItemsT i = 0; i < num_items; ++i)
	{
	h_keys[i] = reference_keys[num_items - 1 - i];
	}
	}
	else
	{
	memcpy(h_keys, reference_keys, num_items * sizeof(KeyT));
	}
	}


	//---------------------------------------------------------------------
	// Test generation
	//---------------------------------------------------------------------

	template <typename T>
	struct UnwrapHalfAndBfloat16 {
	using Type = T;
	};

	#if !_NVHPC_CUDA
	template <>
	struct UnwrapHalfAndBfloat16<half_t> {
	using Type = __half;
	};
	#endif

	#if !_NVHPC_CUDA
	template <>
	struct UnwrapHalfAndBfloat16<bfloat16_t> {
	using Type = __nv_bfloat16;
	};
	#endif

	template <class T, class NumItemsT>
	int compare_device_arrays(T* host_reference, T* d_tmp_buffer, T* d_data, NumItemsT num_items)
	{
	CubDebugExit(cudaMemcpy(d_tmp_buffer, host_reference, sizeof(T) * num_items, cudaMemcpyHostToDevice));

	auto d_reference_begin = d_tmp_buffer;
	auto d_reference_end = d_tmp_buffer + num_items;
	auto err = thrust::mismatch(thrust::device, d_reference_begin, d_reference_end, d_data);

	if (err.first != d_reference_end)
	{
	const auto index = thrust::distance(d_reference_begin, err.first);
	return CompareDeviceResults(host_reference + index, d_data + index, 1, true, g_verbose);
	}

	return 0;
	}

	template <class NumItemsT>
	int compare_device_arrays(
	CUB_NS_QUALIFIER::NullType / h_reference */,
	CUB_NS_QUALIFIER::NullType / d_tmp_buffer */,
	CUB_NS_QUALIFIER::NullType / d_data */,
	NumItemsT /* num_items */)
	{
	return 0;
	}

	/**
	* Test DeviceRadixSort
	*/
	template <
	Backend BACKEND,
	bool IS_DESCENDING,
	typename KeyT,
	typename ValueT,
	typename BeginOffsetIteratorT,
	typename EndOffsetIteratorT,
	typename NumItemsT>
	void Test(
	KeyT *h_keys,
	ValueT *h_values,
	NumItemsT num_items,
	int num_segments,
	BeginOffsetIteratorT d_segment_begin_offsets,
	EndOffsetIteratorT d_segment_end_offsets,
	int begin_bit,
	int end_bit,
	KeyT *h_reference_keys,
	ValueT *h_reference_values)
	{
	// Key alias type
	using KeyAliasT = typename UnwrapHalfAndBfloat16<KeyT>::Type;

	const bool KEYS_ONLY = std::is_same<ValueT, NullType>::value;

	printf("%s %s cub::DeviceRadixSort %zd items, %d segments, "
	"%d-byte keys (%s) %d-byte values (%s), %d-byte num_items (%s), "
	"descending %d, begin_bit %d, end_bit %d\n",
	BackendToString(BACKEND),
	(KEYS_ONLY) ? "keys-only" : "key-value",
	static_cast<std::size_t>(num_items),
	num_segments,
	static_cast<int>(sizeof(KeyT)),
	typeid(KeyT).name(),
	(KEYS_ONLY) ? 0 : static_cast<int>(sizeof(ValueT)),
	typeid(ValueT).name(),
	static_cast<int>(sizeof(NumItemsT)),
	typeid(NumItemsT).name(),
	IS_DESCENDING,
	begin_bit,
	end_bit);

	if (g_verbose)
	{
	printf("Input keys:\n");
	DisplayResults(h_keys, num_items);
	printf("\n\n");
	}

	// Allocate device arrays
	DoubleBuffer<KeyAliasT> d_keys;
	DoubleBuffer<ValueT> d_values;
	int *d_selector;
	size_t *d_temp_storage_bytes;
	cudaError_t *d_cdp_error;
	CubDebugExit(g_allocator.DeviceAllocate((void*)&d_keys.d_buffers[0], sizeof(KeyT) num_items));
	CubDebugExit(g_allocator.DeviceAllocate((void*)&d_keys.d_buffers[1], sizeof(KeyT) num_items));
	CubDebugExit(g_allocator.DeviceAllocate((void*)&d_selector, sizeof(int) 1));
	CubDebugExit(g_allocator.DeviceAllocate((void*)&d_temp_storage_bytes, sizeof(size_t) 1));
	CubDebugExit(g_allocator.DeviceAllocate((void*)&d_cdp_error, sizeof(cudaError_t) 1));
	if (!KEYS_ONLY)
	{
	CubDebugExit(g_allocator.DeviceAllocate((void*)&d_values.d_buffers[0], sizeof(ValueT) num_items));
	CubDebugExit(g_allocator.DeviceAllocate((void*)&d_values.d_buffers[1], sizeof(ValueT) num_items));
	}

	// Allocate temporary storage (and make it un-aligned)
	size_t temp_storage_bytes = 0;
	void *d_temp_storage = NULL;
	CubDebugExit(Dispatch(
	Int2Type<IS_DESCENDING>(), Int2Type<BACKEND>(), d_selector, d_temp_storage_bytes, d_cdp_error,
	d_temp_storage, temp_storage_bytes, d_keys, d_values,
	num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets,
	begin_bit, end_bit));

	CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes + 1));
	void* mis_aligned_temp = static_cast<char*>(d_temp_storage) + 1;

	// Initialize/clear device arrays
	d_keys.selector = 0;
	CubDebugExit(cudaMemcpy(d_keys.d_buffers[0], h_keys, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice));
	CubDebugExit(cudaMemset(d_keys.d_buffers[1], 0, sizeof(KeyT) * num_items));
	if (!KEYS_ONLY)
	{
	d_values.selector = 0;
	CubDebugExit(cudaMemcpy(d_values.d_buffers[0], h_values, sizeof(ValueT) * num_items, cudaMemcpyHostToDevice));
	CubDebugExit(cudaMemset(d_values.d_buffers[1], 0, sizeof(ValueT) * num_items));
	}

	// Run warmup/correctness iteration
	CubDebugExit(Dispatch(
	Int2Type<IS_DESCENDING>(), Int2Type<BACKEND>(), d_selector, d_temp_storage_bytes, d_cdp_error,
	mis_aligned_temp, temp_storage_bytes, d_keys, d_values,
	num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets,
	begin_bit, end_bit));

	// Flush any stdout/stderr
	fflush(stdout);
	fflush(stderr);

	// Check for correctness (and display results, if specified)
	printf("Warmup done. Checking results:\n"); fflush(stdout);

	int compare = 0;

	// If in/out API is used, we are not allowed to overwrite the input.
	// Let's check that the input buffer is not overwritten by the algorithm.
	if (BACKEND == CUB_NO_OVERWRITE)
	{
	KeyT d_input_keys = reinterpret_cast<KeyT>(d_keys.d_buffers[0]);

	if (temp_storage_bytes < sizeof(KeyT) * num_items)
	{
	// For small input sizes, temporary storage is not large enough to fit keys.
	compare = CompareDeviceResults(h_keys, d_input_keys, num_items, true, g_verbose);
	}
	else
	{
	// If overwrite is not allowed, temporary storage is large enough to fit keys.
	KeyT* temp_keys = reinterpret_cast<KeyT*>(d_temp_storage);

	compare = compare_device_arrays(h_keys, temp_keys, d_input_keys, num_items);
	printf("\t Compare input keys: %s ", compare ? "FAIL" : "PASS"); fflush(stdout);
	}
	}

	// After the previous check is done, we can safely reuse alternative buffer to store
	// the reference results and compare current output.
	compare \|= compare_device_arrays(h_reference_keys,
	reinterpret_cast<KeyT *>(d_keys.Alternate()),
	reinterpret_cast<KeyT *>(d_keys.Current()),
	num_items);

	printf("\t Compare keys (selector %d): %s ", d_keys.selector, compare ? "FAIL" : "PASS"); fflush(stdout);

	if (!KEYS_ONLY)
	{
	int values_compare = compare_device_arrays(h_reference_values,
	d_values.Alternate(),
	d_values.Current(),
	num_items);

	compare \|= values_compare;
	printf("\t Compare values (selector %d): %s ", d_values.selector, values_compare ? "FAIL" : "PASS"); fflush(stdout);
	}

	// Performance
	if (g_timing_iterations)
	printf("\nPerforming timing iterations:\n"); fflush(stdout);

	GpuTimer gpu_timer;
	float elapsed_millis = 0.0f;
	for (int i = 0; i < g_timing_iterations; ++i)
	{
	// Initialize/clear device arrays
	CubDebugExit(cudaMemcpy(d_keys.d_buffers[d_keys.selector], h_keys, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice));
	CubDebugExit(cudaMemset(d_keys.d_buffers[d_keys.selector ^ 1], 0, sizeof(KeyT) * num_items));
	if (!KEYS_ONLY)
	{
	CubDebugExit(cudaMemcpy(d_values.d_buffers[d_values.selector], h_values, sizeof(ValueT) * num_items, cudaMemcpyHostToDevice));
	CubDebugExit(cudaMemset(d_values.d_buffers[d_values.selector ^ 1], 0, sizeof(ValueT) * num_items));
	}

	gpu_timer.Start();
	CubDebugExit(Dispatch(
	Int2Type<IS_DESCENDING>(), Int2Type<BACKEND>(), d_selector, d_temp_storage_bytes, d_cdp_error,
	mis_aligned_temp, temp_storage_bytes, d_keys, d_values,
	num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets,
	begin_bit, end_bit));
	gpu_timer.Stop();
	elapsed_millis += gpu_timer.ElapsedMillis();
	}

	// Display performance
	if (g_timing_iterations > 0)
	{
	float avg_millis = elapsed_millis / g_timing_iterations;
	float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
	float giga_bandwidth = (KEYS_ONLY) ?
	giga_rate * sizeof(KeyT) * 2 :
	giga_rate * (sizeof(KeyT) + sizeof(ValueT)) * 2;
	printf("\n%.3f elapsed ms, %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", elapsed_millis, avg_millis, giga_rate, giga_bandwidth);
	}

	printf("\n\n");

	// Cleanup
	if (d_keys.d_buffers[0]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[0]));
	if (d_keys.d_buffers[1]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[1]));
	if (d_values.d_buffers[0]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[0]));
	if (d_values.d_buffers[1]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[1]));
	if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
	if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
	if (d_selector) CubDebugExit(g_allocator.DeviceFree(d_selector));
	if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));

	// Correctness asserts
	AssertEquals(0, compare);
	}

	// Returns whether there is enough memory for the test.
	template <typename KeyT, typename ValueT>
	bool HasEnoughMemory(std::size_t num_items, bool overwrite)
	{
	std::size_t total_mem = TotalGlobalMem();
	std::size_t value_size = std::is_same<ValueT, NullType>::value
	? 0
	: sizeof(ValueT);
	// A conservative estimate of the amount of memory required.
	double factor = overwrite ? 2.25 : 3.25;
	std::size_t test_mem = static_cast<std::size_t>
	(num_items * (sizeof(KeyT) + value_size) * factor);
	return test_mem < total_mem;
	}


	/**
	* Test backend
	*/
	template <bool IS_DESCENDING,
	typename KeyT,
	typename ValueT,
	typename BeginOffsetIteratorT,
	typename EndOffsetIteratorT,
	typename NumItemsT>
	void TestBackend(KeyT *h_keys,
	NumItemsT num_items,
	int num_segments,
	BeginOffsetIteratorT d_segment_begin_offsets,
	EndOffsetIteratorT d_segment_end_offsets,
	int begin_bit,
	int end_bit,
	KeyT *h_reference_keys,
	NumItemsT *h_reference_ranks)
	{
	#if TEST_CDP == 0
	constexpr auto NonSegmentedOverwrite = CUB;
	constexpr auto NonSegmentedNoOverwrite = CUB_NO_OVERWRITE;
	constexpr auto SegmentedOverwrite = CUB_SEGMENTED;
	constexpr auto SegmentedNoOverwrite = CUB_SEGMENTED_NO_OVERWRITE;
	#else // TEST_CDP
	constexpr auto NonSegmentedOverwrite = CDP;
	constexpr auto NonSegmentedNoOverwrite = CDP_NO_OVERWRITE;
	constexpr auto SegmentedOverwrite = CDP_SEGMENTED;
	constexpr auto SegmentedNoOverwrite = CDP_SEGMENTED_NO_OVERWRITE;
	#endif // TEST_CDP

	const bool KEYS_ONLY = std::is_same<ValueT, NullType>::value;

	// A conservative check assuming overwrite is allowed.
	if (!HasEnoughMemory<KeyT, ValueT>(static_cast<std::size_t>(num_items), true))
	{
	printf("Skipping the test due to insufficient device memory\n");
	return;
	}

	std::unique_ptr<ValueT[]> h_value_data{};

	ValueT *h_values = nullptr;
	ValueT *h_reference_values = nullptr;

	if (!KEYS_ONLY)
	{
	h_value_data.reset(new ValueT[2 * static_cast<std::size_t>(num_items)]);
	h_values = h_value_data.get();
	h_reference_values = h_value_data.get() + num_items;

	for (NumItemsT i = 0; i < num_items; ++i)
	{
	InitValue(INTEGER_SEED, h_values[i], i);
	InitValue(INTEGER_SEED, h_reference_values[i], h_reference_ranks[i]);
	}
	}

	// Skip segmented sort if num_items isn't int.
	// TODO(64bit-seg-sort): re-enable these tests once num_items is templated for
	// segmented sort.
	if (std::is_same<NumItemsT, int>::value)
	{
	printf("Testing segmented sort with overwrite\n");
	Test<SegmentedOverwrite, IS_DESCENDING>(h_keys,
	h_values,
	num_items,
	num_segments,
	d_segment_begin_offsets,
	d_segment_end_offsets,
	begin_bit,
	end_bit,
	h_reference_keys,
	h_reference_values);
	printf("Testing segmented sort with no overwrite\n");
	Test<SegmentedNoOverwrite, IS_DESCENDING>(h_keys,
	h_values,
	num_items,
	num_segments,
	d_segment_begin_offsets,
	d_segment_end_offsets,
	begin_bit,
	end_bit,
	h_reference_keys,
	h_reference_values);
	}
	else
	{
	printf("Skipping segmented sort tests (NumItemsT != int)\n");
	}

	if (num_segments == 1)
	{
	printf("Testing non-segmented sort with overwrite\n");
	Test<NonSegmentedOverwrite, IS_DESCENDING>(h_keys,
	h_values,
	num_items,
	num_segments,
	d_segment_begin_offsets,
	d_segment_end_offsets,
	begin_bit,
	end_bit,
	h_reference_keys,
	h_reference_values);
	if (HasEnoughMemory<KeyT, ValueT>(static_cast<std::size_t>(num_items),
	false))
	{
	printf("Testing non-segmented sort with no overwrite\n");
	Test<NonSegmentedNoOverwrite, IS_DESCENDING>(h_keys,
	h_values,
	num_items,
	num_segments,
	d_segment_begin_offsets,
	d_segment_end_offsets,
	begin_bit,
	end_bit,
	h_reference_keys,
	h_reference_values);
	}
	else
	{
	printf("Skipping no-overwrite tests with %zd items due to "
	"insufficient memory\n",
	static_cast<std::size_t>(num_items));
	}
	}
	}


	// Smallest value type for TEST_VALUE_TYPE.
	// Unless TEST_VALUE_TYPE == 3, this is the only value type tested.
	#if TEST_VALUE_TYPE == 0
	// Test keys-only
	using SmallestValueT = NullType;
	#elif TEST_VALUE_TYPE == 1
	// Test with 8b value
	using SmallestValueT = unsigned char;
	#elif TEST_VALUE_TYPE == 2
	// Test with 32b value
	using SmallestValueT = unsigned int;
	// Test with 64b value
	#elif TEST_VALUE_TYPE == 3
	using SmallestValueT = unsigned long long;
	#endif


	/**
	* Test value type
	*/
	template <bool IS_DESCENDING, typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT, typename NumItemsT>
	void TestValueTypes(
	KeyT *h_keys,
	NumItemsT num_items,
	int num_segments,
	bool pre_sorted,
	NumItemsT *h_segment_offsets,
	BeginOffsetIteratorT d_segment_begin_offsets,
	EndOffsetIteratorT d_segment_end_offsets,
	int begin_bit,
	int end_bit)
	{
	// Initialize the solution
	NumItemsT *h_reference_ranks = NULL;
	KeyT *h_reference_keys = NULL;
	// If TEST_VALUE_TYPE == 0, no values are sorted, only keys.
	// Since ranks are only necessary when checking for values,
	// they are not computed in this case.
	InitializeSolution<IS_DESCENDING, TEST_VALUE_TYPE != 0>(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets, begin_bit, end_bit, h_reference_ranks, h_reference_keys);

	TestBackend<IS_DESCENDING, KeyT, SmallestValueT> (h_keys, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit, h_reference_keys, h_reference_ranks);

	#if TEST_VALUE_TYPE == 3
	// Test with non-trivially-constructable value
	// These are cheap to build, so lump them in with the 64b value tests.
	TestBackend<IS_DESCENDING, KeyT, TestBar> (h_keys, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit, h_reference_keys, h_reference_ranks);
	#endif

	// Cleanup
	ResetKeys<IS_DESCENDING>(h_keys, num_items, pre_sorted, h_reference_keys);
	if (h_reference_ranks) delete[] h_reference_ranks;
	if (h_reference_keys) delete[] h_reference_keys;
	}



	/**
	* Test ascending/descending
	*/
	template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT,
	typename NumItemsT>
	void TestDirection(
	KeyT *h_keys,
	NumItemsT num_items,
	int num_segments,
	bool pre_sorted,
	NumItemsT *h_segment_offsets,
	BeginOffsetIteratorT d_segment_begin_offsets,
	EndOffsetIteratorT d_segment_end_offsets,
	int begin_bit,
	int end_bit)
	{
	TestValueTypes<true>(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit);
	TestValueTypes<false>(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit);
	}


	/**
	* Test different bit ranges
	*/
	template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT,
	typename NumItemsT>
	void TestBits(
	KeyT *h_keys,
	NumItemsT num_items,
	int num_segments,
	bool pre_sorted,
	NumItemsT *h_segment_offsets,
	BeginOffsetIteratorT d_segment_begin_offsets,
	EndOffsetIteratorT d_segment_end_offsets)
	{
	// Don't test partial-word sorting for boolean, fp, or signed types (the bit-flipping techniques get in the way) or pre-sorted keys
	if ((Traits<KeyT>::CATEGORY == UNSIGNED_INTEGER)
	&& (!std::is_same<KeyT, bool>::value)
	&& !pre_sorted)
	{
	// Partial bits
	int begin_bit = 1;
	int end_bit = (sizeof(KeyT) * 8) - 1;
	printf("Testing key bits [%d,%d)\n", begin_bit, end_bit); fflush(stdout);
	TestDirection(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit);

	// Equal bits
	begin_bit = end_bit = 0;
	printf("Testing key bits [%d,%d)\n", begin_bit, end_bit); fflush(stdout);
	TestDirection(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit);

	// Across subword boundaries
	int mid_bit = sizeof(KeyT) * 4;
	printf("Testing key bits [%d,%d)\n", mid_bit - 1, mid_bit + 1); fflush(stdout);
	TestDirection(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets, d_segment_begin_offsets, d_segment_end_offsets, mid_bit - 1, mid_bit + 1);
	}

	printf("Testing key bits [%d,%d)\n", 0, int(sizeof(KeyT)) * 8); fflush(stdout);
	TestDirection(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets, d_segment_begin_offsets, d_segment_end_offsets, 0, sizeof(KeyT) * 8);
	}


	template<typename OffsetT>
	struct TransformFunctor1
	{
	__host__ __device__ __forceinline__ OffsetT operator()(OffsetT offset) const
	{
	return offset;
	}
	};

	template<typename OffsetT>
	struct TransformFunctor2
	{
	__host__ __device__ __forceinline__ OffsetT operator()(OffsetT offset) const
	{
	return offset;
	}
	};


	/**
	* Test different segment iterators
	*/
	template <typename KeyT, typename NumItemsT>
	void TestSegmentIterators(
	KeyT *h_keys,
	NumItemsT num_items,
	int num_segments,
	bool pre_sorted,
	NumItemsT *h_segment_offsets,
	NumItemsT *d_segment_offsets)
	{
	InitializeSegments(num_items, num_segments, h_segment_offsets);
	CubDebugExit(cudaMemcpy(d_segment_offsets, h_segment_offsets, sizeof(NumItemsT) * (num_segments + 1), cudaMemcpyHostToDevice));

	// Test with segment pointer.
	// This is also used to test non-segmented sort.
	TestBits(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets, d_segment_offsets, d_segment_offsets + 1);

	if (num_segments > 1)
	{
	// Test with transform iterators of different types
	typedef TransformFunctor1<NumItemsT> TransformFunctor1T;
	typedef TransformFunctor2<NumItemsT> TransformFunctor2T;

	TransformInputIterator<NumItemsT, TransformFunctor1T, NumItemsT*, NumItemsT> d_segment_begin_offsets_itr(d_segment_offsets, TransformFunctor1T());
	TransformInputIterator<NumItemsT, TransformFunctor2T, NumItemsT*, NumItemsT> d_segment_end_offsets_itr(d_segment_offsets + 1, TransformFunctor2T());

	TestBits(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets, d_segment_begin_offsets_itr, d_segment_end_offsets_itr);
	}
	}


	/**
	* Test different segment compositions
	*/
	template <typename KeyT, typename NumItemsT>
	void TestSegments(
	KeyT *h_keys,
	NumItemsT num_items,
	int max_segments,
	bool pre_sorted)
	{
	max_segments = static_cast<int>(CUB_MIN(num_items, static_cast<NumItemsT>(max_segments)));
	NumItemsT *h_segment_offsets = new NumItemsT[max_segments + 1];

	NumItemsT *d_segment_offsets = nullptr;
	CubDebugExit(g_allocator.DeviceAllocate((void*)&d_segment_offsets, sizeof(NumItemsT) (max_segments + 1)));

	for (int num_segments = max_segments; num_segments > 1; num_segments = cub::DivideAndRoundUp(num_segments, 64))
	{
	// Pre-sorted tests are not supported for segmented sort
	if (num_items / num_segments < 128 * 1000 && !pre_sorted) {
	// Right now we assign a single thread block to each segment, so lets keep it to under 128K items per segment
	TestSegmentIterators(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets, d_segment_offsets);
	}
	}

	// Test single segment
	if (num_items > 0)
	{
	if (num_items < 128 * 1000 \|\| pre_sorted)
	{
	// Right now we assign a single thread block to each segment, so lets
	// keep it to under 128K items per segment
	TestSegmentIterators(h_keys,
	num_items,
	1,
	pre_sorted,
	h_segment_offsets,
	d_segment_offsets);
	}
	}

	if (h_segment_offsets) delete[] h_segment_offsets;
	if (d_segment_offsets) CubDebugExit(g_allocator.DeviceFree(d_segment_offsets));
	}

	/**
	* Test different NumItemsT, i.e. types of num_items
	*/
	template <typename KeyT>
	void TestNumItems(KeyT *h_keys,
	std::size_t num_items,
	int max_segments,
	bool pre_sorted)
	{
	if (!pre_sorted && num_items <= std::size_t(std::numeric_limits<int>::max()))
	{
	TestSegments<KeyT, int>(h_keys, static_cast<int>(num_items), max_segments, pre_sorted);
	}
	if (pre_sorted && num_items <= std::size_t(std::numeric_limits<std::uint32_t>::max()))
	{
	TestSegments<KeyT, std::uint32_t>(h_keys, static_cast<std::uint32_t>(num_items), max_segments, pre_sorted);
	}
	TestSegments<KeyT, std::size_t>(h_keys, num_items, max_segments, pre_sorted);
	}


	/**
	* Test different (sub)lengths and number of segments
	*/
	template <typename KeyT>
	void TestSizes(KeyT* h_keys,
	std::size_t max_items,
	int max_segments,
	bool pre_sorted)
	{
	if (pre_sorted)
	{
	// run a specific list of sizes, up to max_items
	std::size_t sizes[] = {g_smallest_pre_sorted_num_items, 4350000007ull};
	for (std::size_t num_items : sizes)
	{
	if (num_items > max_items) break;
	TestNumItems(h_keys, num_items, max_segments, pre_sorted);
	}
	}
	else
	{
	for (std::size_t num_items = max_items;
	num_items > 1;
	num_items = cub::DivideAndRoundUp(num_items, 64))
	{
	TestNumItems(h_keys, num_items, max_segments, pre_sorted);
	}
	}
	}

	/**
	* Test key sampling distributions
	*/
	template <typename KeyT, bool WITH_PRE_SORTED>
	void TestGen(
	std::size_t max_items,
	int max_segments)
	{
	if (max_items == ~std::size_t(0))
	{
	max_items = 8000003;
	}

	if (max_segments < 0)
	{
	max_segments = 5003;
	}

	std::unique_ptr<KeyT[]> h_keys(new KeyT[max_items]);

	// Test trivial problems sizes
	h_keys[0] = static_cast<KeyT>(42);
	TestNumItems(h_keys.get(), 0, 0, false);
	TestNumItems(h_keys.get(), 1, 1, false);

	for (int entropy_reduction = 0; entropy_reduction <= 6; entropy_reduction += 6)
	{
	printf("\nTesting random %s keys with entropy reduction factor %d\n", typeid(KeyT).name(), entropy_reduction); fflush(stdout);
	InitializeKeyBits(RANDOM, h_keys.get(), max_items, entropy_reduction);
	TestSizes(h_keys.get(), max_items, max_segments, false);
	}

	if (cub::Traits<KeyT>::CATEGORY == cub::FLOATING_POINT)
	{
	printf("\nTesting random %s keys with some replaced with -0.0 or +0.0 \n", typeid(KeyT).name());
	fflush(stdout);
	InitializeKeyBits(RANDOM_MINUS_PLUS_ZERO, h_keys.get(), max_items, 0);
	// This just tests +/- 0 handling -- don't need to test multiple sizes
	TestNumItems(h_keys.get(), max_items, max_segments, false);
	}

	printf("\nTesting uniform %s keys\n", typeid(KeyT).name()); fflush(stdout);
	InitializeKeyBits(UNIFORM, h_keys.get(), max_items, 0);
	TestSizes(h_keys.get(), max_items, max_segments, false);

	printf("\nTesting natural number %s keys\n", typeid(KeyT).name()); fflush(stdout);
	InitializeKeyBits(INTEGER_SEED, h_keys.get(), max_items, 0);
	TestSizes(h_keys.get(), max_items, max_segments, false);

	if (WITH_PRE_SORTED)
	{
	// Presorting is only used for testing large input arrays.
	const std::size_t large_num_items = std::size_t(4350000007ull);

	// A conservative check for memory, as we don't know ValueT or whether
	// the overwrite is allowed until later.
	// For ValueT, the check is actually exact unless TEST_VALUE_TYPE == 3.
	if (!HasEnoughMemory<KeyT, SmallestValueT>(large_num_items, true))
	{
	printf("Skipping the permutation-based test due to insufficient device memory\n");
	return;
	}

	h_keys.reset(nullptr); // Explicitly free old buffer before allocating.
	h_keys.reset(new KeyT[large_num_items]);

	printf("\nTesting pre-sorted and randomly permuted %s keys\n", typeid(KeyT).name());
	fflush(stdout);
	InitializeKeysSorted(h_keys.get(), large_num_items);
	fflush(stdout);
	TestSizes(h_keys.get(), large_num_items, max_segments, true);
	fflush(stdout);
	}
	}


	//---------------------------------------------------------------------
	// Simple test
	//---------------------------------------------------------------------

	template <
	Backend BACKEND,
	typename KeyT,
	typename ValueT,
	bool IS_DESCENDING>
	void Test(
	std::size_t num_items,
	int num_segments,
	GenMode gen_mode,
	int entropy_reduction,
	int begin_bit,
	int end_bit)
	{
	const bool KEYS_ONLY = std::is_same<ValueT, NullType>::value;

	KeyT *h_keys = new KeyT[num_items];
	std::size_t *h_reference_ranks = NULL;
	KeyT *h_reference_keys = NULL;
	ValueT *h_values = NULL;
	ValueT *h_reference_values = NULL;
	size_t *h_segment_offsets = new std::size_t[num_segments + 1];

	std::size_t* d_segment_offsets = nullptr;
	CubDebugExit(g_allocator.DeviceAllocate((void*)&d_segment_offsets, sizeof(std::size_t) (num_segments + 1)));

	if (end_bit < 0)
	end_bit = sizeof(KeyT) * 8;

	InitializeKeyBits(gen_mode, h_keys, num_items, entropy_reduction);
	InitializeSegments(num_items, num_segments, h_segment_offsets);
	CubDebugExit(cudaMemcpy(d_segment_offsets, h_segment_offsets, sizeof(std::size_t) * (num_segments + 1), cudaMemcpyHostToDevice));
	InitializeSolution<IS_DESCENDING, !KEYS_ONLY>(
	h_keys, num_items, num_segments, false, h_segment_offsets,
	begin_bit, end_bit, h_reference_ranks, h_reference_keys);

	if (!KEYS_ONLY)
	{
	h_values = new ValueT[num_items];
	h_reference_values = new ValueT[num_items];

	for (std::size_t i = 0; i < num_items; ++i)
	{
	InitValue(INTEGER_SEED, h_values[i], i);
	InitValue(INTEGER_SEED, h_reference_values[i], h_reference_ranks[i]);
	}
	}
	if (h_reference_ranks) delete[] h_reference_ranks;

	printf("\nTesting bits [%d,%d) of %s keys with gen-mode %d\n", begin_bit, end_bit, typeid(KeyT).name(), gen_mode); fflush(stdout);
	Test<BACKEND, IS_DESCENDING>(
	h_keys, h_values,
	num_items, num_segments, d_segment_offsets, d_segment_offsets + 1,
	begin_bit, end_bit, h_reference_keys, h_reference_values);

	if (h_keys) delete[] h_keys;
	if (h_reference_keys) delete[] h_reference_keys;
	if (h_values) delete[] h_values;
	if (h_reference_values) delete[] h_reference_values;
	if (h_segment_offsets) delete[] h_segment_offsets;
	if (d_segment_offsets) CubDebugExit(g_allocator.DeviceFree(d_segment_offsets));
	}

	#if TEST_VALUE_TYPE == 0
	void TestUnspecifiedRanges()
	{
	const std::size_t num_items = 1024 * 1024;
	const std::size_t max_segments = 42;
	const std::size_t avg_segment_size = num_items / max_segments;

	for (int iteration = 0; iteration < 4; iteration++)
	{
	thrust::host_vector<int> h_offsets_begin;
	thrust::host_vector<int> h_offsets_end;

	h_offsets_begin.reserve(max_segments + 1);
	h_offsets_end.reserve(max_segments + 1);

	{
	int offset = 0;

	for (std::size_t sid = 0; sid < max_segments; sid++)
	{
	const int segment_size =
	static_cast<int>(RandomValue(avg_segment_size));

	const bool segment_is_utilized = segment_size > 0
	&& RandomValue(100) > 60;

	if (segment_is_utilized)
	{
	h_offsets_begin.push_back(offset);
	h_offsets_end.push_back(offset + segment_size);
	}

	offset += segment_size;
	}

	if (h_offsets_begin.empty())
	{
	h_offsets_begin.push_back(avg_segment_size);
	h_offsets_end.push_back(num_items);
	}
	}

	thrust::device_vector<int> keys(num_items);
	thrust::device_vector<int> values(num_items);

	thrust::sequence(keys.rbegin(), keys.rend());
	thrust::sequence(values.rbegin(), values.rend());

	thrust::device_vector<int> d_offsets_begin = h_offsets_begin;
	thrust::device_vector<int> d_offsets_end = h_offsets_end;

	thrust::device_vector<int> expected_keys = keys;
	thrust::device_vector<int> expected_values = values;

	const int num_segments = static_cast<int>(h_offsets_begin.size());

	thrust::device_vector<int> result_keys = keys;
	thrust::device_vector<int> result_values = values;

	for (int sid = 0; sid < num_segments; sid++)
	{
	const int segment_begin = h_offsets_begin[sid];
	const int segment_end = h_offsets_end[sid];

	thrust::sort_by_key(expected_keys.begin() + segment_begin,
	expected_keys.begin() + segment_end,
	expected_values.begin() + segment_begin);
	}

	{
	cub::DoubleBuffer<int> keys_buffer(
	thrust::raw_pointer_cast(keys.data()),
	thrust::raw_pointer_cast(result_keys.data()));

	cub::DoubleBuffer<int> values_buffer(
	thrust::raw_pointer_cast(values.data()),
	thrust::raw_pointer_cast(result_values.data()));

	std::size_t temp_storage_bytes{};
	std::uint8_t *d_temp_storage{nullptr};

	CubDebugExit(cub::DeviceSegmentedRadixSort::SortPairs(
	d_temp_storage, temp_storage_bytes,
	keys_buffer, values_buffer,
	num_items, num_segments,
	thrust::raw_pointer_cast(d_offsets_begin.data()),
	thrust::raw_pointer_cast(d_offsets_end.data()),
	0, sizeof(int) * 8));

	thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
	d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

	CubDebugExit(cub::DeviceSegmentedRadixSort::SortPairs(
	d_temp_storage, temp_storage_bytes,
	keys_buffer, values_buffer,
	num_items, num_segments,
	thrust::raw_pointer_cast(d_offsets_begin.data()),
	thrust::raw_pointer_cast(d_offsets_end.data()),
	0, sizeof(int) * 8));

	for (int sid = 0; sid < num_segments; sid++)
	{
	const int segment_begin = h_offsets_begin[sid];
	const int segment_end = h_offsets_end[sid];

	if (keys_buffer.selector == 0)
	{
	thrust::copy(
	keys.begin() + segment_begin,
	keys.begin() + segment_end,
	result_keys.begin() + segment_begin);
	}

	if (values_buffer.selector == 0)
	{
	thrust::copy(
	values.begin() + segment_begin,
	values.begin() + segment_end,
	result_values.begin() + segment_begin);
	}
	}
	}

	AssertEquals(result_keys, expected_keys);
	AssertEquals(result_values, expected_values);

	thrust::sequence(keys.rbegin(), keys.rend());
	thrust::sequence(values.rbegin(), values.rend());

	result_keys = keys;
	result_values = values;

	{
	std::size_t temp_storage_bytes{};
	std::uint8_t *d_temp_storage{};

	CubDebugExit(cub::DeviceSegmentedRadixSort::SortPairs(
	d_temp_storage, temp_storage_bytes,
	thrust::raw_pointer_cast(keys.data()),
	thrust::raw_pointer_cast(result_keys.data()),
	thrust::raw_pointer_cast(values.data()),
	thrust::raw_pointer_cast(result_values.data()),
	num_items, num_segments,
	thrust::raw_pointer_cast(d_offsets_begin.data()),
	thrust::raw_pointer_cast(d_offsets_end.data()),
	0, sizeof(int) * 8));

	thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
	d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

	CubDebugExit(cub::DeviceSegmentedRadixSort::SortPairs(
	d_temp_storage, temp_storage_bytes,
	thrust::raw_pointer_cast(keys.data()),
	thrust::raw_pointer_cast(result_keys.data()),
	thrust::raw_pointer_cast(values.data()),
	thrust::raw_pointer_cast(result_values.data()),
	num_items, num_segments,
	thrust::raw_pointer_cast(d_offsets_begin.data()),
	thrust::raw_pointer_cast(d_offsets_end.data()),
	0, sizeof(int) * 8));
	}

	AssertEquals(result_values, expected_values);
	AssertEquals(result_keys, expected_keys);
	}
	}
	#endif

	#if TEST_KEY_BYTES == 4
	// Following tests check that new decomposer API doesn't break old API.
	// It's disabled because some compilers don't like implicit conversions, which
	// is required for the test. Once we figure out how to temporarily enable conversion, we can
	// re-enable the test.
	#define ENABLING_CONVERSION_IS_FIGURED_OUT 0
	#if ENABLING_CONVERSION_IS_FIGURED_OUT
	struct bit_selector
	{
	int bit;

	operator int() const
	{
	return bit;
	}
	};

	template <class BeginBitT, class EndBitT, class... Ts>
	void device_radix_sort_keys_allows_implicit_conversions_for_bits_helper(BeginBitT begin_bit, EndBitT end_bit, Ts... args)
	{
	const int num_items = 0;

	{
	std::size_t temp_storage_bytes = 0;
	std::uint8_t *d_temp_storage = nullptr;

	cub::DeviceRadixSort::SortKeys(d_temp_storage,
	temp_storage_bytes,
	args...,
	num_items,
	begin_bit,
	end_bit);
	}

	{
	std::size_t temp_storage_bytes = 0;
	std::uint8_t *d_temp_storage = nullptr;

	cub::DeviceRadixSort::SortKeys(d_temp_storage,
	temp_storage_bytes,
	args...,
	num_items,
	begin_bit);
	}

	{
	std::size_t temp_storage_bytes = 0;
	std::uint8_t *d_temp_storage = nullptr;

	cub::DeviceRadixSort::SortKeysDescending(d_temp_storage,
	temp_storage_bytes,
	args...,
	num_items,
	begin_bit,
	end_bit);
	}

	{
	std::size_t temp_storage_bytes = 0;
	std::uint8_t *d_temp_storage = nullptr;

	cub::DeviceRadixSort::SortKeysDescending(d_temp_storage,
	temp_storage_bytes,
	args...,
	num_items,
	begin_bit);
	}
	}

	template <class BeginBitT, class EndBitT, class... Ts>
	void device_radix_sort_pairs_allows_implicit_conversions_for_bits_helper(BeginBitT begin_bit,
	EndBitT end_bit,
	Ts... args)
	{
	const int num_items = 0;

	{
	std::size_t temp_storage_bytes = 0;
	std::uint8_t *d_temp_storage = nullptr;

	cub::DeviceRadixSort::SortPairs(d_temp_storage,
	temp_storage_bytes,
	args...,
	num_items,
	begin_bit,
	end_bit);
	}

	{
	std::size_t temp_storage_bytes = 0;
	std::uint8_t *d_temp_storage = nullptr;

	cub::DeviceRadixSort::SortPairs(d_temp_storage,
	temp_storage_bytes,
	args...,
	num_items,
	begin_bit);
	}

	{
	std::size_t temp_storage_bytes = 0;
	std::uint8_t *d_temp_storage = nullptr;

	cub::DeviceRadixSort::SortPairsDescending(d_temp_storage,
	temp_storage_bytes,
	args...,
	num_items,
	begin_bit,
	end_bit);
	}

	{
	std::size_t temp_storage_bytes = 0;
	std::uint8_t *d_temp_storage = nullptr;

	cub::DeviceRadixSort::SortPairsDescending(d_temp_storage,
	temp_storage_bytes,
	args...,
	num_items,
	begin_bit);
	}
	}

	template <class BeginBitT, class EndBitT>
	void device_radix_sort_allows_implicit_conversions_for_bits(BeginBitT begin_bit, EndBitT end_bit)
	{
	int *d_i_ptr = nullptr;

	device_radix_sort_keys_allows_implicit_conversions_for_bits_helper(begin_bit,
	end_bit,
	d_i_ptr,
	d_i_ptr);

	cub::DoubleBuffer<int> keys(d_i_ptr, d_i_ptr);
	device_radix_sort_keys_allows_implicit_conversions_for_bits_helper(begin_bit, end_bit, keys);

	#if TEST_VALUE_TYPE == 2
	unsigned int *d_u_ptr = nullptr;

	device_radix_sort_pairs_allows_implicit_conversions_for_bits_helper(begin_bit,
	end_bit,
	d_i_ptr,
	d_i_ptr,
	d_u_ptr,
	d_u_ptr);

	cub::DoubleBuffer<unsigned int> pairs(d_u_ptr, d_u_ptr);
	device_radix_sort_pairs_allows_implicit_conversions_for_bits_helper(begin_bit,
	end_bit,
	keys,
	pairs);
	#endif
	}

	void device_radix_sort_allows_implicit_conversions_for_bits()
	{
	int begin_i = 0;
	long long int begin_lli = 0;
	bit_selector begin_bs{0};

	int end_i = 2;
	long long int end_lli = 2;
	bit_selector end_bs{2};

	device_radix_sort_allows_implicit_conversions_for_bits(begin_i, end_i);
	device_radix_sort_allows_implicit_conversions_for_bits(begin_i, end_lli);
	device_radix_sort_allows_implicit_conversions_for_bits(begin_i, end_bs);
	device_radix_sort_allows_implicit_conversions_for_bits(begin_lli, end_i);
	device_radix_sort_allows_implicit_conversions_for_bits(begin_lli, end_lli);
	device_radix_sort_allows_implicit_conversions_for_bits(begin_lli, end_bs);
	device_radix_sort_allows_implicit_conversions_for_bits(begin_bs, end_i);
	device_radix_sort_allows_implicit_conversions_for_bits(begin_bs, end_lli);
	device_radix_sort_allows_implicit_conversions_for_bits(begin_bs, end_bs);
	}
	#endif // ENABLING_CONVERSION_IS_FIGURED_OUT
	#endif // TEST_KEY_BYTES == 4

	//---------------------------------------------------------------------
	// Main
	//---------------------------------------------------------------------

	/**
	* Main
	*/
	int main(int argc, char** argv)
	{
	std::size_t num_items = ~std::size_t(0);
	int num_segments = -1;

	// Initialize command line
	CommandLineArgs args(argc, argv);
	g_verbose = args.CheckCmdLineFlag("v");
	args.GetCmdLineArgument("n", num_items);
	args.GetCmdLineArgument("s", num_segments);
	args.GetCmdLineArgument("i", g_timing_iterations);

	// Print usage
	if (args.CheckCmdLineFlag("help"))
	{
	printf("%s "
	"[--n=<input items> "
	"[--s=<num segments> "
	"[--i=<timing iterations> "
	"[--device=<device-id>] "
	"[--v] "
	"\n", argv[0]);
	exit(0);
	}

	// Initialize device
	CubDebugExit(args.DeviceInit());

	// %PARAM% TEST_CDP cdp 0:1
	// %PARAM% TEST_KEY_BYTES bytes 1:2:4:8:16
	// %PARAM% TEST_VALUE_TYPE pairs 0:1:2:3
	// 0->Keys only
	// 1->uchar
	// 2->uint
	// 3->[ull,TestBar] (TestBar is cheap to build, included here to
	// reduce total number of targets)

	// To reduce testing time, some key types are only tested when not
	// testing pairs:
	#if TEST_VALUE_TYPE == 0
	#define TEST_EXTENDED_KEY_TYPES
	#endif

	// Compile/run thorough tests
	#if TEST_KEY_BYTES == 1

	TestGen<signed char, true> (num_items, num_segments);

	#ifdef TEST_EXTENDED_KEY_TYPES
	TestGen<bool, false> (num_items, num_segments);
	TestGen<signed char, false> (num_items, num_segments);
	TestGen<unsigned char, false> (num_items, num_segments);
	#endif // TEST_EXTENDED_KEY_TYPES

	#elif TEST_KEY_BYTES == 2
	TestGen<unsigned short, true> (num_items, num_segments);

	#ifdef TEST_EXTENDED_KEY_TYPES
	TestGen<short, false> (num_items, num_segments);

	#if !_NVHPC_CUDA
	TestGen<half_t, false> (num_items, num_segments);
	#endif // CTK >= 9

	#if !_NVHPC_CUDA
	#if !defined(__ICC)
	// Fails with `-0 != 0` with ICC for unknown reasons. See #333.
	TestGen<bfloat16_t, false> (num_items, num_segments);
	#endif // !ICC
	#endif // CTK >= 11

	#endif // TEST_EXTENDED_KEY_TYPES

	#elif TEST_KEY_BYTES == 4

	TestGen<int, true> (num_items, num_segments);

	#if TEST_VALUE_TYPE == 0
	TestUnspecifiedRanges();
	#endif

	#if ENABLING_CONVERSION_IS_FIGURED_OUT
	device_radix_sort_allows_implicit_conversions_for_bits();
	#endif

	#ifdef TEST_EXTENDED_KEY_TYPES
	TestGen<float, false> (num_items, num_segments);
	TestGen<unsigned int, false> (num_items, num_segments);
	#endif // TEST_EXTENDED_KEY_TYPES

	#elif TEST_KEY_BYTES == 8

	TestGen<double, true> (num_items, num_segments);

	#ifdef TEST_EXTENDED_KEY_TYPES
	TestGen<long long, false> (num_items, num_segments);
	TestGen<unsigned long long, false>(num_items, num_segments);
	#endif // TEST_EXTENDED_KEY_TYPES

	#elif TEST_KEY_BYTES == 16

	#if CUB_IS_INT128_ENABLED
	TestGen<__int128_t, false>(num_items, num_segments);
	TestGen<__uint128_t, false>(num_items, num_segments);
	#else
	// Fix unused static function for MSVC
	BackendToString(CUB);
	#endif

	#endif // TEST_KEY_BYTES switch

	return 0;
	}