thrust / dependencies /cub /test /test_device_radix_sort.cu
camenduru's picture
thanks to nvidia ❤
8ae5fc5
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
* Test of DeviceRadixSort utilities
******************************************************************************/
// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR
#include <algorithm>
#include <climits>
#include <cstdio>
#include <limits>
#include <memory>
#include <random>
#include <type_traits>
#include <typeinfo>
#include <vector>
#if !_NVHPC_CUDA
#include <cuda_fp16.h>
#endif
#if !_NVHPC_CUDA
#include <cuda_bf16.h>
#endif
#include <cub/detail/device_synchronize.cuh>
#include <cub/device/device_radix_sort.cuh>
#include <cub/device/device_segmented_radix_sort.cuh>
#include <cub/iterator/transform_input_iterator.cuh>
#include <cub/util_allocator.cuh>
#include <cub/util_math.cuh>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/host_vector.h>
#include <thrust/mismatch.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
#include "test_util.h"
using namespace cub;
//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------
bool g_verbose = false;
int g_timing_iterations = 0;
std::size_t g_smallest_pre_sorted_num_items = (std::size_t(1) << 32) - 42;
CachingDeviceAllocator g_allocator(true);
// Dispatch types
enum Backend
{
CUB, // CUB method (allows overwriting of input)
CUB_NO_OVERWRITE, // CUB method (disallows overwriting of input)
CUB_SEGMENTED, // CUB method (allows overwriting of input)
CUB_SEGMENTED_NO_OVERWRITE, // CUB method (disallows overwriting of input)
// Same as above, but launches kernels from device using CDP.
CDP,
CDP_NO_OVERWRITE,
CDP_SEGMENTED,
CDP_SEGMENTED_NO_OVERWRITE,
};
static const char* BackendToString(Backend b)
{
switch (b)
{
case CUB:
return "CUB";
case CUB_NO_OVERWRITE:
return "CUB_NO_OVERWRITE";
case CUB_SEGMENTED:
return "CUB_SEGMENTED";
case CUB_SEGMENTED_NO_OVERWRITE:
return "CUB_SEGMENTED_NO_OVERWRITE";
case CDP:
return "CDP";
case CDP_NO_OVERWRITE:
return "CDP_NO_OVERWRITE";
case CDP_SEGMENTED:
return "CDP_SEGMENTED";
case CDP_SEGMENTED_NO_OVERWRITE:
return "CDP_SEGMENTED_NO_OVERWRITE";
default:
break;
}
return "";
}
//---------------------------------------------------------------------
// Dispatch to different DeviceRadixSort entrypoints
//---------------------------------------------------------------------
/**
* Dispatch to CUB sorting entrypoint (specialized for ascending)
*/
template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT,
typename NumItemsT>
CUB_RUNTIME_FUNCTION
cudaError_t Dispatch(
Int2Type<false> /*is_descending*/,
Int2Type<CUB> /*dispatch_to*/,
int */*d_selector*/,
size_t */*d_temp_storage_bytes*/,
cudaError_t */*d_cdp_error*/,
void* d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT> &d_keys,
DoubleBuffer<ValueT> &d_values,
NumItemsT num_items,
int /*num_segments*/,
BeginOffsetIteratorT /*d_segment_begin_offsets*/,
EndOffsetIteratorT /*d_segment_end_offsets*/,
int begin_bit,
int end_bit)
{
return DeviceRadixSort::SortPairs(
d_temp_storage, temp_storage_bytes,
d_keys, d_values,
num_items, begin_bit, end_bit);
}
/**
* Dispatch to CUB_NO_OVERWRITE sorting entrypoint (specialized for ascending)
*/
template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT,
typename NumItemsT>
CUB_RUNTIME_FUNCTION
cudaError_t Dispatch(
Int2Type<false> /*is_descending*/,
Int2Type<CUB_NO_OVERWRITE> /*dispatch_to*/,
int */*d_selector*/,
size_t */*d_temp_storage_bytes*/,
cudaError_t */*d_cdp_error*/,
void* d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT> &d_keys,
DoubleBuffer<ValueT> &d_values,
NumItemsT num_items,
int /*num_segments*/,
BeginOffsetIteratorT /*d_segment_begin_offsets*/,
EndOffsetIteratorT /*d_segment_end_offsets*/,
int begin_bit,
int end_bit)
{
KeyT const *const_keys_itr = d_keys.Current();
ValueT const *const_values_itr = d_values.Current();
cudaError_t retval = DeviceRadixSort::SortPairs(
d_temp_storage, temp_storage_bytes,
const_keys_itr, d_keys.Alternate(), const_values_itr, d_values.Alternate(),
num_items, begin_bit, end_bit);
d_keys.selector ^= 1;
d_values.selector ^= 1;
return retval;
}
/**
* Dispatch to CUB sorting entrypoint (specialized for descending)
*/
template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT,
typename NumItemsT>
CUB_RUNTIME_FUNCTION
cudaError_t Dispatch(
Int2Type<true> /*is_descending*/,
Int2Type<CUB> /*dispatch_to*/,
int */*d_selector*/,
size_t */*d_temp_storage_bytes*/,
cudaError_t */*d_cdp_error*/,
void* d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT> &d_keys,
DoubleBuffer<ValueT> &d_values,
NumItemsT num_items,
int /*num_segments*/,
BeginOffsetIteratorT /*d_segment_begin_offsets*/,
EndOffsetIteratorT /*d_segment_end_offsets*/,
int begin_bit,
int end_bit)
{
return DeviceRadixSort::SortPairsDescending(
d_temp_storage, temp_storage_bytes,
d_keys, d_values, num_items,
begin_bit, end_bit);
}
/**
* Dispatch to CUB_NO_OVERWRITE sorting entrypoint (specialized for descending)
*/
template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT,
typename NumItemsT>
CUB_RUNTIME_FUNCTION
cudaError_t Dispatch(
Int2Type<true> /*is_descending*/,
Int2Type<CUB_NO_OVERWRITE> /*dispatch_to*/,
int */*d_selector*/,
size_t */*d_temp_storage_bytes*/,
cudaError_t */*d_cdp_error*/,
void* d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT> &d_keys,
DoubleBuffer<ValueT> &d_values,
NumItemsT num_items,
int /*num_segments*/,
BeginOffsetIteratorT /*d_segment_begin_offsets*/,
EndOffsetIteratorT /*d_segment_end_offsets*/,
int begin_bit,
int end_bit)
{
KeyT const *const_keys_itr = d_keys.Current();
ValueT const *const_values_itr = d_values.Current();
cudaError_t retval = DeviceRadixSort::SortPairsDescending(
d_temp_storage, temp_storage_bytes,
const_keys_itr, d_keys.Alternate(), const_values_itr, d_values.Alternate(),
num_items, begin_bit, end_bit);
d_keys.selector ^= 1;
d_values.selector ^= 1;
return retval;
}
//---------------------------------------------------------------------
// Dispatch to different DeviceRadixSort entrypoints
//---------------------------------------------------------------------
// Validates that `num_items` fits into `int`
// TODO(canonizer): remove this check once num_items is templated for segmented sort.
template <typename NumItemsT>
__host__ __device__ bool ValidateNumItemsForSegmentedSort(NumItemsT num_items)
{
if (static_cast<long long int>(num_items) <
static_cast<long long int>(INT_MAX))
{
return true;
}
else
{
printf("cub::DeviceSegmentedRadixSort is currently limited by %d items but "
"%lld were provided\n",
INT_MAX,
static_cast<long long int>(num_items));
}
return false;
}
/**
* Dispatch to CUB_SEGMENTED sorting entrypoint (specialized for ascending)
*/
template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT,
typename NumItemsT>
CUB_RUNTIME_FUNCTION
cudaError_t Dispatch(
Int2Type<false> /*is_descending*/,
Int2Type<CUB_SEGMENTED> /*dispatch_to*/,
int */*d_selector*/,
size_t */*d_temp_storage_bytes*/,
cudaError_t */*d_cdp_error*/,
void* d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT> &d_keys,
DoubleBuffer<ValueT> &d_values,
NumItemsT num_items,
int num_segments,
BeginOffsetIteratorT d_segment_begin_offsets,
EndOffsetIteratorT d_segment_end_offsets,
int begin_bit,
int end_bit)
{
if (ValidateNumItemsForSegmentedSort(num_items))
{
return DeviceSegmentedRadixSort::SortPairs(d_temp_storage,
temp_storage_bytes,
d_keys,
d_values,
static_cast<int>(num_items),
num_segments,
d_segment_begin_offsets,
d_segment_end_offsets,
begin_bit,
end_bit);
}
return cudaErrorInvalidValue;
}
/**
* Dispatch to CUB_SEGMENTED_NO_OVERWRITE sorting entrypoint (specialized for ascending)
*/
template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT,
typename NumItemsT>
CUB_RUNTIME_FUNCTION
cudaError_t Dispatch(
Int2Type<false> /*is_descending*/,
Int2Type<CUB_SEGMENTED_NO_OVERWRITE> /*dispatch_to*/,
int */*d_selector*/,
size_t */*d_temp_storage_bytes*/,
cudaError_t */*d_cdp_error*/,
void* d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT> &d_keys,
DoubleBuffer<ValueT> &d_values,
NumItemsT num_items,
int num_segments,
BeginOffsetIteratorT d_segment_begin_offsets,
EndOffsetIteratorT d_segment_end_offsets,
int begin_bit,
int end_bit)
{
if (ValidateNumItemsForSegmentedSort(num_items))
{
KeyT const *const_keys_itr = d_keys.Current();
ValueT const *const_values_itr = d_values.Current();
cudaError_t retval =
DeviceSegmentedRadixSort::SortPairs(d_temp_storage,
temp_storage_bytes,
const_keys_itr,
d_keys.Alternate(),
const_values_itr,
d_values.Alternate(),
static_cast<int>(num_items),
num_segments,
d_segment_begin_offsets,
d_segment_end_offsets,
begin_bit,
end_bit);
d_keys.selector ^= 1;
d_values.selector ^= 1;
return retval;
}
return cudaErrorInvalidValue;
}
/**
* Dispatch to CUB_SEGMENTED sorting entrypoint (specialized for descending)
*/
template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT,
typename NumItemsT>
CUB_RUNTIME_FUNCTION
cudaError_t Dispatch(
Int2Type<true> /*is_descending*/,
Int2Type<CUB_SEGMENTED> /*dispatch_to*/,
int */*d_selector*/,
size_t */*d_temp_storage_bytes*/,
cudaError_t */*d_cdp_error*/,
void* d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT> &d_keys,
DoubleBuffer<ValueT> &d_values,
NumItemsT num_items,
int num_segments,
BeginOffsetIteratorT d_segment_begin_offsets,
EndOffsetIteratorT d_segment_end_offsets,
int begin_bit,
int end_bit)
{
if (ValidateNumItemsForSegmentedSort(num_items))
{
return DeviceSegmentedRadixSort::SortPairsDescending(
d_temp_storage,
temp_storage_bytes,
d_keys,
d_values,
static_cast<int>(num_items),
num_segments,
d_segment_begin_offsets,
d_segment_end_offsets,
begin_bit,
end_bit);
}
return cudaErrorInvalidValue;
}
/**
* Dispatch to CUB_SEGMENTED_NO_OVERWRITE sorting entrypoint (specialized for descending)
*/
template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT,
typename NumItemsT>
CUB_RUNTIME_FUNCTION
cudaError_t Dispatch(
Int2Type<true> /*is_descending*/,
Int2Type<CUB_SEGMENTED_NO_OVERWRITE> /*dispatch_to*/,
int */*d_selector*/,
size_t */*d_temp_storage_bytes*/,
cudaError_t */*d_cdp_error*/,
void* d_temp_storage,
size_t& temp_storage_bytes,
DoubleBuffer<KeyT> &d_keys,
DoubleBuffer<ValueT> &d_values,
NumItemsT num_items,
int num_segments,
BeginOffsetIteratorT d_segment_begin_offsets,
EndOffsetIteratorT d_segment_end_offsets,
int begin_bit,
int end_bit)
{
if (ValidateNumItemsForSegmentedSort(num_items))
{
KeyT const *const_keys_itr = d_keys.Current();
ValueT const *const_values_itr = d_values.Current();
cudaError_t retval =
DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage,
temp_storage_bytes,
const_keys_itr,
d_keys.Alternate(),
const_values_itr,
d_values.Alternate(),
static_cast<int>(num_items),
num_segments,
d_segment_begin_offsets,
d_segment_end_offsets,
begin_bit,
end_bit);
d_keys.selector ^= 1;
d_values.selector ^= 1;
return retval;
}
return cudaErrorInvalidValue;
}
//---------------------------------------------------------------------
// CUDA Nested Parallelism Test Kernel
//---------------------------------------------------------------------
#if TEST_CDP == 1
/**
* Simple wrapper kernel to invoke DeviceRadixSort
*/
template <int IsDescending,
int CubBackend,
typename KeyT,
typename ValueT,
typename BeginOffsetIteratorT,
typename EndOffsetIteratorT,
typename NumItemsT>
__global__ void CDPDispatchKernel(Int2Type<IsDescending> is_descending,
Int2Type<CubBackend> cub_backend,
int *d_selector,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void *d_temp_storage,
size_t temp_storage_bytes,
DoubleBuffer<KeyT> d_keys,
DoubleBuffer<ValueT> d_values,
NumItemsT num_items,
int num_segments,
BeginOffsetIteratorT d_segment_begin_offsets,
EndOffsetIteratorT d_segment_end_offsets,
int begin_bit,
int end_bit)
{
*d_cdp_error = Dispatch(is_descending,
cub_backend,
d_selector,
d_temp_storage_bytes,
d_cdp_error,
d_temp_storage,
temp_storage_bytes,
d_keys,
d_values,
num_items,
num_segments,
d_segment_begin_offsets,
d_segment_end_offsets,
begin_bit,
end_bit);
*d_temp_storage_bytes = temp_storage_bytes;
*d_selector = d_keys.selector;
}
/**
* Launch kernel and dispatch on device. Should only be called from host code.
* The CubBackend should be one of the non-CDP CUB backends to invoke from the
* device.
*/
template <int IsDescending,
int CubBackend,
typename KeyT,
typename ValueT,
typename BeginOffsetIteratorT,
typename EndOffsetIteratorT,
typename NumItemsT>
cudaError_t LaunchCDPKernel(Int2Type<IsDescending> is_descending,
Int2Type<CubBackend> cub_backend,
int *d_selector,
size_t *d_temp_storage_bytes,
cudaError_t *d_cdp_error,
void *d_temp_storage,
size_t &temp_storage_bytes,
DoubleBuffer<KeyT> &d_keys,
DoubleBuffer<ValueT> &d_values,
NumItemsT num_items,
int num_segments,
BeginOffsetIteratorT d_segment_begin_offsets,
EndOffsetIteratorT d_segment_end_offsets,
int begin_bit,
int end_bit)
{
// Invoke kernel to invoke device-side dispatch:
cudaError_t retval =
thrust::cuda_cub::launcher::triple_chevron(1, 1, 0, 0)
.doit(CDPDispatchKernel<IsDescending,
CubBackend,
KeyT,
ValueT,
BeginOffsetIteratorT,
EndOffsetIteratorT,
NumItemsT>,
is_descending,
cub_backend,
d_selector,
d_temp_storage_bytes,
d_cdp_error,
d_temp_storage,
temp_storage_bytes,
d_keys,
d_values,
num_items,
num_segments,
d_segment_begin_offsets,
d_segment_end_offsets,
begin_bit,
end_bit);
CubDebugExit(retval);
CubDebugExit(cub::detail::device_synchronize());
// Copy out selector
CubDebugExit(cudaMemcpy(&d_keys.selector,
d_selector,
sizeof(int) * 1,
cudaMemcpyDeviceToHost));
d_values.selector = d_keys.selector;
// Copy out temp_storage_bytes
CubDebugExit(cudaMemcpy(&temp_storage_bytes,
d_temp_storage_bytes,
sizeof(size_t) * 1,
cudaMemcpyDeviceToHost));
// Copy out error
CubDebugExit(cudaMemcpy(&retval,
d_cdp_error,
sizeof(cudaError_t) * 1,
cudaMemcpyDeviceToHost));
return retval;
}
// Specializations of Dispatch that translate the CDP backend to the appropriate
// CUB backend, and uses the CUB backend to launch the CDP kernel.
#define DEFINE_CDP_DISPATCHER(CdpBackend, CubBackend) \
template <int IsDescending, \
typename KeyT, \
typename ValueT, \
typename BeginOffsetIteratorT, \
typename EndOffsetIteratorT, \
typename NumItemsT> \
cudaError_t Dispatch(Int2Type<IsDescending> is_descending, \
Int2Type<CdpBackend> /*dispatch_to*/, \
int *d_selector, \
size_t *d_temp_storage_bytes, \
cudaError_t *d_cdp_error, \
\
void *d_temp_storage, \
size_t &temp_storage_bytes, \
DoubleBuffer<KeyT> &d_keys, \
DoubleBuffer<ValueT> &d_values, \
NumItemsT num_items, \
int num_segments, \
BeginOffsetIteratorT d_segment_begin_offsets, \
EndOffsetIteratorT d_segment_end_offsets, \
int begin_bit, \
int end_bit) \
{ \
Int2Type<CubBackend> cub_backend{}; \
return LaunchCDPKernel(is_descending, \
cub_backend, \
d_selector, \
d_temp_storage_bytes, \
d_cdp_error, \
d_temp_storage, \
temp_storage_bytes, \
d_keys, \
d_values, \
num_items, \
num_segments, \
d_segment_begin_offsets, \
d_segment_end_offsets, \
begin_bit, \
end_bit); \
}
DEFINE_CDP_DISPATCHER(CDP, CUB)
DEFINE_CDP_DISPATCHER(CDP_NO_OVERWRITE, CUB_NO_OVERWRITE)
DEFINE_CDP_DISPATCHER(CDP_SEGMENTED, CUB_SEGMENTED)
DEFINE_CDP_DISPATCHER(CDP_SEGMENTED_NO_OVERWRITE, CUB_SEGMENTED_NO_OVERWRITE)
#undef DEFINE_CDP_DISPATCHER
#endif // TEST_CDP
//---------------------------------------------------------------------
// Problem generation
//---------------------------------------------------------------------
/**
* Simple key-value pairing
*/
template <
typename KeyT,
typename ValueT>
struct Pair
{
KeyT key;
ValueT value;
bool operator<(const Pair &b) const
{
return (key < b.key);
}
};
/**
* Simple key-value pairing (specialized for bool types)
*/
template <typename ValueT>
struct Pair<bool, ValueT>
{
bool key;
ValueT value;
bool operator<(const Pair &b) const
{
return (!key && b.key);
}
};
/**
* Initialize key data
*/
template <typename KeyT, typename NumItemsT>
void InitializeKeyBits(
GenMode gen_mode,
KeyT *h_keys,
NumItemsT num_items,
int /*entropy_reduction*/)
{
for (NumItemsT i = 0; i < num_items; ++i)
InitValue(gen_mode, h_keys[i], i);
}
template <typename KeyT,
typename UnsignedBits = typename cub::Traits<KeyT>::UnsignedBits>
UnsignedBits KeyBits(KeyT key)
{
UnsignedBits bits;
memcpy(&bits, &key, sizeof(KeyT));
return bits;
}
/** Initialize the reference array monotonically. */
template <typename KeyT, typename NumItemsT>
void InitializeKeysSorted(
KeyT *h_keys,
NumItemsT num_items)
{
using TraitsT = cub::Traits<KeyT>;
using UnsignedBits = typename TraitsT::UnsignedBits;
// Numbers to generate random runs.
UnsignedBits max_inc = 1 << (sizeof(UnsignedBits) < 4 ? 3 :
(sizeof(UnsignedBits) < 8 ? 14 : 24));
UnsignedBits min_bits = TraitsT::TwiddleIn(KeyBits(TraitsT::Lowest()));
UnsignedBits max_bits = TraitsT::TwiddleIn(KeyBits(TraitsT::Max()));
NumItemsT max_run = std::max(
NumItemsT(double(num_items) * (max_inc + 1) / max_bits),
NumItemsT(1 << 14));
UnsignedBits *h_key_bits = reinterpret_cast<UnsignedBits*>(h_keys);
NumItemsT i = 0;
// Start with the minimum twiddled key.
UnsignedBits twiddled_key = min_bits;
while (i < num_items)
{
// Generate random increment (avoid overflow).
UnsignedBits inc_bits = 0;
RandomBits(inc_bits);
// twiddled_key < max_bits at this point.
UnsignedBits inc = static_cast<UnsignedBits>(std::min(1 + inc_bits % max_inc, max_bits - twiddled_key));
twiddled_key += inc;
// Generate random run length (ensure there are enough values to fill the rest).
NumItemsT run_bits = 0;
RandomBits(run_bits);
NumItemsT run_length = std::min(1 + run_bits % max_run, num_items - i);
if (twiddled_key == max_bits) run_length = num_items - i;
NumItemsT run_end = i + run_length;
// Fill the array.
UnsignedBits key = TraitsT::TwiddleOut(twiddled_key);
// Avoid -0.0 for floating-point keys.
UnsignedBits negative_zero = UnsignedBits(1) << UnsignedBits(sizeof(UnsignedBits) * 8 - 1);
if (TraitsT::CATEGORY == cub::FLOATING_POINT && key == negative_zero)
{
key = 0;
}
for (; i < run_end; ++i)
{
h_key_bits[i] = key;
}
}
}
/**
* Initialize solution
*/
template <bool IS_DESCENDING, bool WANT_RANKS, typename KeyT, typename NumItemsT>
void InitializeSolution(
KeyT *h_keys,
NumItemsT num_items,
int num_segments,
bool pre_sorted,
NumItemsT *h_segment_offsets,
int begin_bit,
int end_bit,
NumItemsT *&h_reference_ranks,
KeyT *&h_reference_keys)
{
if (num_items == 0)
{
h_reference_ranks = nullptr;
h_reference_keys = nullptr;
return;
}
if (pre_sorted)
{
printf("Shuffling reference solution on CPU\n");
// Note: begin_bit and end_bit are ignored here, and assumed to have the
// default values (begin_bit == 0, end_bit == 8 * sizeof(KeyT)).
// Otherwise, pre-sorting won't work, as it doesn't necessarily
// correspond to the order of keys sorted by a subrange of bits.
// num_segments is also ignored as assumed to be 1, as pre-sorted tests
// are currently not supported for multiple segments.
//
// Pre-sorted tests with non-default begin_bit, end_bit or num_segments
// != 1 are skipped in TestBits() and TestSegments(), respectively.
AssertEquals(begin_bit, 0);
AssertEquals(end_bit, static_cast<int>(8 * sizeof(KeyT)));
AssertEquals(num_segments, 1);
// Copy to the reference solution.
h_reference_keys = new KeyT[num_items];
if (IS_DESCENDING)
{
// Copy in reverse.
for (NumItemsT i = 0; i < num_items; ++i)
{
h_reference_keys[i] = h_keys[num_items - 1 - i];
}
// Copy back.
memcpy(h_keys, h_reference_keys, num_items * sizeof(KeyT));
}
else
{
memcpy(h_reference_keys, h_keys, num_items * sizeof(KeyT));
}
// Summarize the pre-sorted array (element, 1st position, count).
struct Element
{
KeyT key;
NumItemsT num;
NumItemsT index;
};
std::vector<Element> summary;
KeyT cur_key = h_reference_keys[0];
summary.push_back(Element{cur_key, 1, 0});
for (NumItemsT i = 1; i < num_items; ++i)
{
KeyT key = h_reference_keys[i];
if (key == cur_key)
{
// Same key.
summary.back().num++;
continue;
}
// Different key.
cur_key = key;
summary.push_back(Element{cur_key, 1, i});
}
// Generate a random permutation from the summary. Such a complicated
// approach is used to permute the array and compute ranks in a
// cache-friendly way and in a short time.
if (WANT_RANKS)
{
h_reference_ranks = new NumItemsT[num_items];
}
NumItemsT max_run = 32;
NumItemsT run = 0;
NumItemsT i = 0;
while (summary.size() > 0)
{
// Pick up a random element and a run.
NumItemsT bits = 0;
RandomBits(bits);
NumItemsT summary_id = bits % summary.size();
Element& element = summary[summary_id];
run = std::min(1 + bits % (max_run - 1), element.num);
for (NumItemsT j = 0; j < run; ++j)
{
h_keys[i + j] = element.key;
if (WANT_RANKS)
{
h_reference_ranks[element.index + j] = i + j;
}
}
i += run;
element.index += run;
element.num -= run;
if (element.num == 0)
{
// Remove the empty entry.
std::swap(summary[summary_id], summary.back());
summary.pop_back();
}
}
printf(" Done.\n");
}
else
{
typedef Pair<KeyT, NumItemsT> PairT;
PairT *h_pairs = new PairT[num_items];
int num_bits = end_bit - begin_bit;
for (NumItemsT i = 0; i < num_items; ++i)
{
// Mask off unwanted portions
if (num_bits < static_cast<int>(sizeof(KeyT) * 8))
{
using UnsignedBits = typename cub::Traits<KeyT>::UnsignedBits;
UnsignedBits base = 0;
memcpy(&base, &h_keys[i], sizeof(KeyT));
base &= ((UnsignedBits{1} << num_bits) - 1) << begin_bit;
memcpy(&h_pairs[i].key, &base, sizeof(KeyT));
}
else
{
h_pairs[i].key = h_keys[i];
}
h_pairs[i].value = i;
}
printf("\nSorting reference solution on CPU "
"(%zd items, %d segments, %zd items/seg)...",
static_cast<std::size_t>(num_items),
num_segments,
static_cast<std::size_t>(num_items / num_segments));
fflush(stdout);
for (int i = 0; i < num_segments; ++i)
{
if (IS_DESCENDING) std::reverse(h_pairs + h_segment_offsets[i], h_pairs + h_segment_offsets[i + 1]);
std::stable_sort( h_pairs + h_segment_offsets[i], h_pairs + h_segment_offsets[i + 1]);
if (IS_DESCENDING) std::reverse(h_pairs + h_segment_offsets[i], h_pairs + h_segment_offsets[i + 1]);
}
printf(" Done.\n"); fflush(stdout);
if (WANT_RANKS)
{
h_reference_ranks = new NumItemsT[num_items];
}
h_reference_keys = new KeyT[num_items];
for (NumItemsT i = 0; i < num_items; ++i)
{
if (WANT_RANKS)
{
h_reference_ranks[i] = h_pairs[i].value;
}
h_reference_keys[i] = h_keys[h_pairs[i].value];
}
if (h_pairs) delete[] h_pairs;
}
}
template <bool IS_DESCENDING, typename KeyT, typename NumItemsT>
void ResetKeys(KeyT *h_keys, NumItemsT num_items, bool pre_sorted, KeyT *reference_keys)
{
if (!pre_sorted) return;
// Copy the reference keys back.
if (IS_DESCENDING)
{
// Keys need to be copied in reverse.
for (NumItemsT i = 0; i < num_items; ++i)
{
h_keys[i] = reference_keys[num_items - 1 - i];
}
}
else
{
memcpy(h_keys, reference_keys, num_items * sizeof(KeyT));
}
}
//---------------------------------------------------------------------
// Test generation
//---------------------------------------------------------------------
template <typename T>
struct UnwrapHalfAndBfloat16 {
using Type = T;
};
#if !_NVHPC_CUDA
template <>
struct UnwrapHalfAndBfloat16<half_t> {
using Type = __half;
};
#endif
#if !_NVHPC_CUDA
template <>
struct UnwrapHalfAndBfloat16<bfloat16_t> {
using Type = __nv_bfloat16;
};
#endif
template <class T, class NumItemsT>
int compare_device_arrays(T* host_reference, T* d_tmp_buffer, T* d_data, NumItemsT num_items)
{
CubDebugExit(cudaMemcpy(d_tmp_buffer, host_reference, sizeof(T) * num_items, cudaMemcpyHostToDevice));
auto d_reference_begin = d_tmp_buffer;
auto d_reference_end = d_tmp_buffer + num_items;
auto err = thrust::mismatch(thrust::device, d_reference_begin, d_reference_end, d_data);
if (err.first != d_reference_end)
{
const auto index = thrust::distance(d_reference_begin, err.first);
return CompareDeviceResults(host_reference + index, d_data + index, 1, true, g_verbose);
}
return 0;
}
template <class NumItemsT>
int compare_device_arrays(
CUB_NS_QUALIFIER::NullType */* h_reference */,
CUB_NS_QUALIFIER::NullType */* d_tmp_buffer */,
CUB_NS_QUALIFIER::NullType */* d_data */,
NumItemsT /* num_items */)
{
return 0;
}
/**
* Test DeviceRadixSort
*/
template <
Backend BACKEND,
bool IS_DESCENDING,
typename KeyT,
typename ValueT,
typename BeginOffsetIteratorT,
typename EndOffsetIteratorT,
typename NumItemsT>
void Test(
KeyT *h_keys,
ValueT *h_values,
NumItemsT num_items,
int num_segments,
BeginOffsetIteratorT d_segment_begin_offsets,
EndOffsetIteratorT d_segment_end_offsets,
int begin_bit,
int end_bit,
KeyT *h_reference_keys,
ValueT *h_reference_values)
{
// Key alias type
using KeyAliasT = typename UnwrapHalfAndBfloat16<KeyT>::Type;
const bool KEYS_ONLY = std::is_same<ValueT, NullType>::value;
printf("%s %s cub::DeviceRadixSort %zd items, %d segments, "
"%d-byte keys (%s) %d-byte values (%s), %d-byte num_items (%s), "
"descending %d, begin_bit %d, end_bit %d\n",
BackendToString(BACKEND),
(KEYS_ONLY) ? "keys-only" : "key-value",
static_cast<std::size_t>(num_items),
num_segments,
static_cast<int>(sizeof(KeyT)),
typeid(KeyT).name(),
(KEYS_ONLY) ? 0 : static_cast<int>(sizeof(ValueT)),
typeid(ValueT).name(),
static_cast<int>(sizeof(NumItemsT)),
typeid(NumItemsT).name(),
IS_DESCENDING,
begin_bit,
end_bit);
if (g_verbose)
{
printf("Input keys:\n");
DisplayResults(h_keys, num_items);
printf("\n\n");
}
// Allocate device arrays
DoubleBuffer<KeyAliasT> d_keys;
DoubleBuffer<ValueT> d_values;
int *d_selector;
size_t *d_temp_storage_bytes;
cudaError_t *d_cdp_error;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[0], sizeof(KeyT) * num_items));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[1], sizeof(KeyT) * num_items));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_selector, sizeof(int) * 1));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes, sizeof(size_t) * 1));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error, sizeof(cudaError_t) * 1));
if (!KEYS_ONLY)
{
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[0], sizeof(ValueT) * num_items));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[1], sizeof(ValueT) * num_items));
}
// Allocate temporary storage (and make it un-aligned)
size_t temp_storage_bytes = 0;
void *d_temp_storage = NULL;
CubDebugExit(Dispatch(
Int2Type<IS_DESCENDING>(), Int2Type<BACKEND>(), d_selector, d_temp_storage_bytes, d_cdp_error,
d_temp_storage, temp_storage_bytes, d_keys, d_values,
num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets,
begin_bit, end_bit));
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes + 1));
void* mis_aligned_temp = static_cast<char*>(d_temp_storage) + 1;
// Initialize/clear device arrays
d_keys.selector = 0;
CubDebugExit(cudaMemcpy(d_keys.d_buffers[0], h_keys, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice));
CubDebugExit(cudaMemset(d_keys.d_buffers[1], 0, sizeof(KeyT) * num_items));
if (!KEYS_ONLY)
{
d_values.selector = 0;
CubDebugExit(cudaMemcpy(d_values.d_buffers[0], h_values, sizeof(ValueT) * num_items, cudaMemcpyHostToDevice));
CubDebugExit(cudaMemset(d_values.d_buffers[1], 0, sizeof(ValueT) * num_items));
}
// Run warmup/correctness iteration
CubDebugExit(Dispatch(
Int2Type<IS_DESCENDING>(), Int2Type<BACKEND>(), d_selector, d_temp_storage_bytes, d_cdp_error,
mis_aligned_temp, temp_storage_bytes, d_keys, d_values,
num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets,
begin_bit, end_bit));
// Flush any stdout/stderr
fflush(stdout);
fflush(stderr);
// Check for correctness (and display results, if specified)
printf("Warmup done. Checking results:\n"); fflush(stdout);
int compare = 0;
// If in/out API is used, we are not allowed to overwrite the input.
// Let's check that the input buffer is not overwritten by the algorithm.
if (BACKEND == CUB_NO_OVERWRITE)
{
KeyT *d_input_keys = reinterpret_cast<KeyT*>(d_keys.d_buffers[0]);
if (temp_storage_bytes < sizeof(KeyT) * num_items)
{
// For small input sizes, temporary storage is not large enough to fit keys.
compare = CompareDeviceResults(h_keys, d_input_keys, num_items, true, g_verbose);
}
else
{
// If overwrite is not allowed, temporary storage is large enough to fit keys.
KeyT* temp_keys = reinterpret_cast<KeyT*>(d_temp_storage);
compare = compare_device_arrays(h_keys, temp_keys, d_input_keys, num_items);
printf("\t Compare input keys: %s ", compare ? "FAIL" : "PASS"); fflush(stdout);
}
}
// After the previous check is done, we can safely reuse alternative buffer to store
// the reference results and compare current output.
compare |= compare_device_arrays(h_reference_keys,
reinterpret_cast<KeyT *>(d_keys.Alternate()),
reinterpret_cast<KeyT *>(d_keys.Current()),
num_items);
printf("\t Compare keys (selector %d): %s ", d_keys.selector, compare ? "FAIL" : "PASS"); fflush(stdout);
if (!KEYS_ONLY)
{
int values_compare = compare_device_arrays(h_reference_values,
d_values.Alternate(),
d_values.Current(),
num_items);
compare |= values_compare;
printf("\t Compare values (selector %d): %s ", d_values.selector, values_compare ? "FAIL" : "PASS"); fflush(stdout);
}
// Performance
if (g_timing_iterations)
printf("\nPerforming timing iterations:\n"); fflush(stdout);
GpuTimer gpu_timer;
float elapsed_millis = 0.0f;
for (int i = 0; i < g_timing_iterations; ++i)
{
// Initialize/clear device arrays
CubDebugExit(cudaMemcpy(d_keys.d_buffers[d_keys.selector], h_keys, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice));
CubDebugExit(cudaMemset(d_keys.d_buffers[d_keys.selector ^ 1], 0, sizeof(KeyT) * num_items));
if (!KEYS_ONLY)
{
CubDebugExit(cudaMemcpy(d_values.d_buffers[d_values.selector], h_values, sizeof(ValueT) * num_items, cudaMemcpyHostToDevice));
CubDebugExit(cudaMemset(d_values.d_buffers[d_values.selector ^ 1], 0, sizeof(ValueT) * num_items));
}
gpu_timer.Start();
CubDebugExit(Dispatch(
Int2Type<IS_DESCENDING>(), Int2Type<BACKEND>(), d_selector, d_temp_storage_bytes, d_cdp_error,
mis_aligned_temp, temp_storage_bytes, d_keys, d_values,
num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets,
begin_bit, end_bit));
gpu_timer.Stop();
elapsed_millis += gpu_timer.ElapsedMillis();
}
// Display performance
if (g_timing_iterations > 0)
{
float avg_millis = elapsed_millis / g_timing_iterations;
float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
float giga_bandwidth = (KEYS_ONLY) ?
giga_rate * sizeof(KeyT) * 2 :
giga_rate * (sizeof(KeyT) + sizeof(ValueT)) * 2;
printf("\n%.3f elapsed ms, %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", elapsed_millis, avg_millis, giga_rate, giga_bandwidth);
}
printf("\n\n");
// Cleanup
if (d_keys.d_buffers[0]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[0]));
if (d_keys.d_buffers[1]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[1]));
if (d_values.d_buffers[0]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[0]));
if (d_values.d_buffers[1]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[1]));
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
if (d_selector) CubDebugExit(g_allocator.DeviceFree(d_selector));
if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
// Correctness asserts
AssertEquals(0, compare);
}
// Returns whether there is enough memory for the test.
template <typename KeyT, typename ValueT>
bool HasEnoughMemory(std::size_t num_items, bool overwrite)
{
std::size_t total_mem = TotalGlobalMem();
std::size_t value_size = std::is_same<ValueT, NullType>::value
? 0
: sizeof(ValueT);
// A conservative estimate of the amount of memory required.
double factor = overwrite ? 2.25 : 3.25;
std::size_t test_mem = static_cast<std::size_t>
(num_items * (sizeof(KeyT) + value_size) * factor);
return test_mem < total_mem;
}
/**
* Test backend
*/
template <bool IS_DESCENDING,
typename KeyT,
typename ValueT,
typename BeginOffsetIteratorT,
typename EndOffsetIteratorT,
typename NumItemsT>
void TestBackend(KeyT *h_keys,
NumItemsT num_items,
int num_segments,
BeginOffsetIteratorT d_segment_begin_offsets,
EndOffsetIteratorT d_segment_end_offsets,
int begin_bit,
int end_bit,
KeyT *h_reference_keys,
NumItemsT *h_reference_ranks)
{
#if TEST_CDP == 0
constexpr auto NonSegmentedOverwrite = CUB;
constexpr auto NonSegmentedNoOverwrite = CUB_NO_OVERWRITE;
constexpr auto SegmentedOverwrite = CUB_SEGMENTED;
constexpr auto SegmentedNoOverwrite = CUB_SEGMENTED_NO_OVERWRITE;
#else // TEST_CDP
constexpr auto NonSegmentedOverwrite = CDP;
constexpr auto NonSegmentedNoOverwrite = CDP_NO_OVERWRITE;
constexpr auto SegmentedOverwrite = CDP_SEGMENTED;
constexpr auto SegmentedNoOverwrite = CDP_SEGMENTED_NO_OVERWRITE;
#endif // TEST_CDP
const bool KEYS_ONLY = std::is_same<ValueT, NullType>::value;
// A conservative check assuming overwrite is allowed.
if (!HasEnoughMemory<KeyT, ValueT>(static_cast<std::size_t>(num_items), true))
{
printf("Skipping the test due to insufficient device memory\n");
return;
}
std::unique_ptr<ValueT[]> h_value_data{};
ValueT *h_values = nullptr;
ValueT *h_reference_values = nullptr;
if (!KEYS_ONLY)
{
h_value_data.reset(new ValueT[2 * static_cast<std::size_t>(num_items)]);
h_values = h_value_data.get();
h_reference_values = h_value_data.get() + num_items;
for (NumItemsT i = 0; i < num_items; ++i)
{
InitValue(INTEGER_SEED, h_values[i], i);
InitValue(INTEGER_SEED, h_reference_values[i], h_reference_ranks[i]);
}
}
// Skip segmented sort if num_items isn't int.
// TODO(64bit-seg-sort): re-enable these tests once num_items is templated for
// segmented sort.
if (std::is_same<NumItemsT, int>::value)
{
printf("Testing segmented sort with overwrite\n");
Test<SegmentedOverwrite, IS_DESCENDING>(h_keys,
h_values,
num_items,
num_segments,
d_segment_begin_offsets,
d_segment_end_offsets,
begin_bit,
end_bit,
h_reference_keys,
h_reference_values);
printf("Testing segmented sort with no overwrite\n");
Test<SegmentedNoOverwrite, IS_DESCENDING>(h_keys,
h_values,
num_items,
num_segments,
d_segment_begin_offsets,
d_segment_end_offsets,
begin_bit,
end_bit,
h_reference_keys,
h_reference_values);
}
else
{
printf("Skipping segmented sort tests (NumItemsT != int)\n");
}
if (num_segments == 1)
{
printf("Testing non-segmented sort with overwrite\n");
Test<NonSegmentedOverwrite, IS_DESCENDING>(h_keys,
h_values,
num_items,
num_segments,
d_segment_begin_offsets,
d_segment_end_offsets,
begin_bit,
end_bit,
h_reference_keys,
h_reference_values);
if (HasEnoughMemory<KeyT, ValueT>(static_cast<std::size_t>(num_items),
false))
{
printf("Testing non-segmented sort with no overwrite\n");
Test<NonSegmentedNoOverwrite, IS_DESCENDING>(h_keys,
h_values,
num_items,
num_segments,
d_segment_begin_offsets,
d_segment_end_offsets,
begin_bit,
end_bit,
h_reference_keys,
h_reference_values);
}
else
{
printf("Skipping no-overwrite tests with %zd items due to "
"insufficient memory\n",
static_cast<std::size_t>(num_items));
}
}
}
// Smallest value type for TEST_VALUE_TYPE.
// Unless TEST_VALUE_TYPE == 3, this is the only value type tested.
#if TEST_VALUE_TYPE == 0
// Test keys-only
using SmallestValueT = NullType;
#elif TEST_VALUE_TYPE == 1
// Test with 8b value
using SmallestValueT = unsigned char;
#elif TEST_VALUE_TYPE == 2
// Test with 32b value
using SmallestValueT = unsigned int;
// Test with 64b value
#elif TEST_VALUE_TYPE == 3
using SmallestValueT = unsigned long long;
#endif
/**
* Test value type
*/
template <bool IS_DESCENDING, typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT, typename NumItemsT>
void TestValueTypes(
KeyT *h_keys,
NumItemsT num_items,
int num_segments,
bool pre_sorted,
NumItemsT *h_segment_offsets,
BeginOffsetIteratorT d_segment_begin_offsets,
EndOffsetIteratorT d_segment_end_offsets,
int begin_bit,
int end_bit)
{
// Initialize the solution
NumItemsT *h_reference_ranks = NULL;
KeyT *h_reference_keys = NULL;
// If TEST_VALUE_TYPE == 0, no values are sorted, only keys.
// Since ranks are only necessary when checking for values,
// they are not computed in this case.
InitializeSolution<IS_DESCENDING, TEST_VALUE_TYPE != 0>(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets, begin_bit, end_bit, h_reference_ranks, h_reference_keys);
TestBackend<IS_DESCENDING, KeyT, SmallestValueT> (h_keys, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit, h_reference_keys, h_reference_ranks);
#if TEST_VALUE_TYPE == 3
// Test with non-trivially-constructable value
// These are cheap to build, so lump them in with the 64b value tests.
TestBackend<IS_DESCENDING, KeyT, TestBar> (h_keys, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit, h_reference_keys, h_reference_ranks);
#endif
// Cleanup
ResetKeys<IS_DESCENDING>(h_keys, num_items, pre_sorted, h_reference_keys);
if (h_reference_ranks) delete[] h_reference_ranks;
if (h_reference_keys) delete[] h_reference_keys;
}
/**
* Test ascending/descending
*/
template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT,
typename NumItemsT>
void TestDirection(
KeyT *h_keys,
NumItemsT num_items,
int num_segments,
bool pre_sorted,
NumItemsT *h_segment_offsets,
BeginOffsetIteratorT d_segment_begin_offsets,
EndOffsetIteratorT d_segment_end_offsets,
int begin_bit,
int end_bit)
{
TestValueTypes<true>(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit);
TestValueTypes<false>(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit);
}
/**
* Test different bit ranges
*/
template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT,
typename NumItemsT>
void TestBits(
KeyT *h_keys,
NumItemsT num_items,
int num_segments,
bool pre_sorted,
NumItemsT *h_segment_offsets,
BeginOffsetIteratorT d_segment_begin_offsets,
EndOffsetIteratorT d_segment_end_offsets)
{
// Don't test partial-word sorting for boolean, fp, or signed types (the bit-flipping techniques get in the way) or pre-sorted keys
if ((Traits<KeyT>::CATEGORY == UNSIGNED_INTEGER)
&& (!std::is_same<KeyT, bool>::value)
&& !pre_sorted)
{
// Partial bits
int begin_bit = 1;
int end_bit = (sizeof(KeyT) * 8) - 1;
printf("Testing key bits [%d,%d)\n", begin_bit, end_bit); fflush(stdout);
TestDirection(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit);
// Equal bits
begin_bit = end_bit = 0;
printf("Testing key bits [%d,%d)\n", begin_bit, end_bit); fflush(stdout);
TestDirection(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit);
// Across subword boundaries
int mid_bit = sizeof(KeyT) * 4;
printf("Testing key bits [%d,%d)\n", mid_bit - 1, mid_bit + 1); fflush(stdout);
TestDirection(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets, d_segment_begin_offsets, d_segment_end_offsets, mid_bit - 1, mid_bit + 1);
}
printf("Testing key bits [%d,%d)\n", 0, int(sizeof(KeyT)) * 8); fflush(stdout);
TestDirection(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets, d_segment_begin_offsets, d_segment_end_offsets, 0, sizeof(KeyT) * 8);
}
template<typename OffsetT>
struct TransformFunctor1
{
__host__ __device__ __forceinline__ OffsetT operator()(OffsetT offset) const
{
return offset;
}
};
template<typename OffsetT>
struct TransformFunctor2
{
__host__ __device__ __forceinline__ OffsetT operator()(OffsetT offset) const
{
return offset;
}
};
/**
* Test different segment iterators
*/
template <typename KeyT, typename NumItemsT>
void TestSegmentIterators(
KeyT *h_keys,
NumItemsT num_items,
int num_segments,
bool pre_sorted,
NumItemsT *h_segment_offsets,
NumItemsT *d_segment_offsets)
{
InitializeSegments(num_items, num_segments, h_segment_offsets);
CubDebugExit(cudaMemcpy(d_segment_offsets, h_segment_offsets, sizeof(NumItemsT) * (num_segments + 1), cudaMemcpyHostToDevice));
// Test with segment pointer.
// This is also used to test non-segmented sort.
TestBits(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets, d_segment_offsets, d_segment_offsets + 1);
if (num_segments > 1)
{
// Test with transform iterators of different types
typedef TransformFunctor1<NumItemsT> TransformFunctor1T;
typedef TransformFunctor2<NumItemsT> TransformFunctor2T;
TransformInputIterator<NumItemsT, TransformFunctor1T, NumItemsT*, NumItemsT> d_segment_begin_offsets_itr(d_segment_offsets, TransformFunctor1T());
TransformInputIterator<NumItemsT, TransformFunctor2T, NumItemsT*, NumItemsT> d_segment_end_offsets_itr(d_segment_offsets + 1, TransformFunctor2T());
TestBits(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets, d_segment_begin_offsets_itr, d_segment_end_offsets_itr);
}
}
/**
* Test different segment compositions
*/
template <typename KeyT, typename NumItemsT>
void TestSegments(
KeyT *h_keys,
NumItemsT num_items,
int max_segments,
bool pre_sorted)
{
max_segments = static_cast<int>(CUB_MIN(num_items, static_cast<NumItemsT>(max_segments)));
NumItemsT *h_segment_offsets = new NumItemsT[max_segments + 1];
NumItemsT *d_segment_offsets = nullptr;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_segment_offsets, sizeof(NumItemsT) * (max_segments + 1)));
for (int num_segments = max_segments; num_segments > 1; num_segments = cub::DivideAndRoundUp(num_segments, 64))
{
// Pre-sorted tests are not supported for segmented sort
if (num_items / num_segments < 128 * 1000 && !pre_sorted) {
// Right now we assign a single thread block to each segment, so lets keep it to under 128K items per segment
TestSegmentIterators(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets, d_segment_offsets);
}
}
// Test single segment
if (num_items > 0)
{
if (num_items < 128 * 1000 || pre_sorted)
{
// Right now we assign a single thread block to each segment, so lets
// keep it to under 128K items per segment
TestSegmentIterators(h_keys,
num_items,
1,
pre_sorted,
h_segment_offsets,
d_segment_offsets);
}
}
if (h_segment_offsets) delete[] h_segment_offsets;
if (d_segment_offsets) CubDebugExit(g_allocator.DeviceFree(d_segment_offsets));
}
/**
* Test different NumItemsT, i.e. types of num_items
*/
template <typename KeyT>
void TestNumItems(KeyT *h_keys,
std::size_t num_items,
int max_segments,
bool pre_sorted)
{
if (!pre_sorted && num_items <= std::size_t(std::numeric_limits<int>::max()))
{
TestSegments<KeyT, int>(h_keys, static_cast<int>(num_items), max_segments, pre_sorted);
}
if (pre_sorted && num_items <= std::size_t(std::numeric_limits<std::uint32_t>::max()))
{
TestSegments<KeyT, std::uint32_t>(h_keys, static_cast<std::uint32_t>(num_items), max_segments, pre_sorted);
}
TestSegments<KeyT, std::size_t>(h_keys, num_items, max_segments, pre_sorted);
}
/**
* Test different (sub)lengths and number of segments
*/
template <typename KeyT>
void TestSizes(KeyT* h_keys,
std::size_t max_items,
int max_segments,
bool pre_sorted)
{
if (pre_sorted)
{
// run a specific list of sizes, up to max_items
std::size_t sizes[] = {g_smallest_pre_sorted_num_items, 4350000007ull};
for (std::size_t num_items : sizes)
{
if (num_items > max_items) break;
TestNumItems(h_keys, num_items, max_segments, pre_sorted);
}
}
else
{
for (std::size_t num_items = max_items;
num_items > 1;
num_items = cub::DivideAndRoundUp(num_items, 64))
{
TestNumItems(h_keys, num_items, max_segments, pre_sorted);
}
}
}
/**
* Test key sampling distributions
*/
template <typename KeyT, bool WITH_PRE_SORTED>
void TestGen(
std::size_t max_items,
int max_segments)
{
if (max_items == ~std::size_t(0))
{
max_items = 8000003;
}
if (max_segments < 0)
{
max_segments = 5003;
}
std::unique_ptr<KeyT[]> h_keys(new KeyT[max_items]);
// Test trivial problems sizes
h_keys[0] = static_cast<KeyT>(42);
TestNumItems(h_keys.get(), 0, 0, false);
TestNumItems(h_keys.get(), 1, 1, false);
for (int entropy_reduction = 0; entropy_reduction <= 6; entropy_reduction += 6)
{
printf("\nTesting random %s keys with entropy reduction factor %d\n", typeid(KeyT).name(), entropy_reduction); fflush(stdout);
InitializeKeyBits(RANDOM, h_keys.get(), max_items, entropy_reduction);
TestSizes(h_keys.get(), max_items, max_segments, false);
}
if (cub::Traits<KeyT>::CATEGORY == cub::FLOATING_POINT)
{
printf("\nTesting random %s keys with some replaced with -0.0 or +0.0 \n", typeid(KeyT).name());
fflush(stdout);
InitializeKeyBits(RANDOM_MINUS_PLUS_ZERO, h_keys.get(), max_items, 0);
// This just tests +/- 0 handling -- don't need to test multiple sizes
TestNumItems(h_keys.get(), max_items, max_segments, false);
}
printf("\nTesting uniform %s keys\n", typeid(KeyT).name()); fflush(stdout);
InitializeKeyBits(UNIFORM, h_keys.get(), max_items, 0);
TestSizes(h_keys.get(), max_items, max_segments, false);
printf("\nTesting natural number %s keys\n", typeid(KeyT).name()); fflush(stdout);
InitializeKeyBits(INTEGER_SEED, h_keys.get(), max_items, 0);
TestSizes(h_keys.get(), max_items, max_segments, false);
if (WITH_PRE_SORTED)
{
// Presorting is only used for testing large input arrays.
const std::size_t large_num_items = std::size_t(4350000007ull);
// A conservative check for memory, as we don't know ValueT or whether
// the overwrite is allowed until later.
// For ValueT, the check is actually exact unless TEST_VALUE_TYPE == 3.
if (!HasEnoughMemory<KeyT, SmallestValueT>(large_num_items, true))
{
printf("Skipping the permutation-based test due to insufficient device memory\n");
return;
}
h_keys.reset(nullptr); // Explicitly free old buffer before allocating.
h_keys.reset(new KeyT[large_num_items]);
printf("\nTesting pre-sorted and randomly permuted %s keys\n", typeid(KeyT).name());
fflush(stdout);
InitializeKeysSorted(h_keys.get(), large_num_items);
fflush(stdout);
TestSizes(h_keys.get(), large_num_items, max_segments, true);
fflush(stdout);
}
}
//---------------------------------------------------------------------
// Simple test
//---------------------------------------------------------------------
template <
Backend BACKEND,
typename KeyT,
typename ValueT,
bool IS_DESCENDING>
void Test(
std::size_t num_items,
int num_segments,
GenMode gen_mode,
int entropy_reduction,
int begin_bit,
int end_bit)
{
const bool KEYS_ONLY = std::is_same<ValueT, NullType>::value;
KeyT *h_keys = new KeyT[num_items];
std::size_t *h_reference_ranks = NULL;
KeyT *h_reference_keys = NULL;
ValueT *h_values = NULL;
ValueT *h_reference_values = NULL;
size_t *h_segment_offsets = new std::size_t[num_segments + 1];
std::size_t* d_segment_offsets = nullptr;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_segment_offsets, sizeof(std::size_t) * (num_segments + 1)));
if (end_bit < 0)
end_bit = sizeof(KeyT) * 8;
InitializeKeyBits(gen_mode, h_keys, num_items, entropy_reduction);
InitializeSegments(num_items, num_segments, h_segment_offsets);
CubDebugExit(cudaMemcpy(d_segment_offsets, h_segment_offsets, sizeof(std::size_t) * (num_segments + 1), cudaMemcpyHostToDevice));
InitializeSolution<IS_DESCENDING, !KEYS_ONLY>(
h_keys, num_items, num_segments, false, h_segment_offsets,
begin_bit, end_bit, h_reference_ranks, h_reference_keys);
if (!KEYS_ONLY)
{
h_values = new ValueT[num_items];
h_reference_values = new ValueT[num_items];
for (std::size_t i = 0; i < num_items; ++i)
{
InitValue(INTEGER_SEED, h_values[i], i);
InitValue(INTEGER_SEED, h_reference_values[i], h_reference_ranks[i]);
}
}
if (h_reference_ranks) delete[] h_reference_ranks;
printf("\nTesting bits [%d,%d) of %s keys with gen-mode %d\n", begin_bit, end_bit, typeid(KeyT).name(), gen_mode); fflush(stdout);
Test<BACKEND, IS_DESCENDING>(
h_keys, h_values,
num_items, num_segments, d_segment_offsets, d_segment_offsets + 1,
begin_bit, end_bit, h_reference_keys, h_reference_values);
if (h_keys) delete[] h_keys;
if (h_reference_keys) delete[] h_reference_keys;
if (h_values) delete[] h_values;
if (h_reference_values) delete[] h_reference_values;
if (h_segment_offsets) delete[] h_segment_offsets;
if (d_segment_offsets) CubDebugExit(g_allocator.DeviceFree(d_segment_offsets));
}
#if TEST_VALUE_TYPE == 0
void TestUnspecifiedRanges()
{
const std::size_t num_items = 1024 * 1024;
const std::size_t max_segments = 42;
const std::size_t avg_segment_size = num_items / max_segments;
for (int iteration = 0; iteration < 4; iteration++)
{
thrust::host_vector<int> h_offsets_begin;
thrust::host_vector<int> h_offsets_end;
h_offsets_begin.reserve(max_segments + 1);
h_offsets_end.reserve(max_segments + 1);
{
int offset = 0;
for (std::size_t sid = 0; sid < max_segments; sid++)
{
const int segment_size =
static_cast<int>(RandomValue(avg_segment_size));
const bool segment_is_utilized = segment_size > 0
&& RandomValue(100) > 60;
if (segment_is_utilized)
{
h_offsets_begin.push_back(offset);
h_offsets_end.push_back(offset + segment_size);
}
offset += segment_size;
}
if (h_offsets_begin.empty())
{
h_offsets_begin.push_back(avg_segment_size);
h_offsets_end.push_back(num_items);
}
}
thrust::device_vector<int> keys(num_items);
thrust::device_vector<int> values(num_items);
thrust::sequence(keys.rbegin(), keys.rend());
thrust::sequence(values.rbegin(), values.rend());
thrust::device_vector<int> d_offsets_begin = h_offsets_begin;
thrust::device_vector<int> d_offsets_end = h_offsets_end;
thrust::device_vector<int> expected_keys = keys;
thrust::device_vector<int> expected_values = values;
const int num_segments = static_cast<int>(h_offsets_begin.size());
thrust::device_vector<int> result_keys = keys;
thrust::device_vector<int> result_values = values;
for (int sid = 0; sid < num_segments; sid++)
{
const int segment_begin = h_offsets_begin[sid];
const int segment_end = h_offsets_end[sid];
thrust::sort_by_key(expected_keys.begin() + segment_begin,
expected_keys.begin() + segment_end,
expected_values.begin() + segment_begin);
}
{
cub::DoubleBuffer<int> keys_buffer(
thrust::raw_pointer_cast(keys.data()),
thrust::raw_pointer_cast(result_keys.data()));
cub::DoubleBuffer<int> values_buffer(
thrust::raw_pointer_cast(values.data()),
thrust::raw_pointer_cast(result_values.data()));
std::size_t temp_storage_bytes{};
std::uint8_t *d_temp_storage{nullptr};
CubDebugExit(cub::DeviceSegmentedRadixSort::SortPairs(
d_temp_storage, temp_storage_bytes,
keys_buffer, values_buffer,
num_items, num_segments,
thrust::raw_pointer_cast(d_offsets_begin.data()),
thrust::raw_pointer_cast(d_offsets_end.data()),
0, sizeof(int) * 8));
thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
CubDebugExit(cub::DeviceSegmentedRadixSort::SortPairs(
d_temp_storage, temp_storage_bytes,
keys_buffer, values_buffer,
num_items, num_segments,
thrust::raw_pointer_cast(d_offsets_begin.data()),
thrust::raw_pointer_cast(d_offsets_end.data()),
0, sizeof(int) * 8));
for (int sid = 0; sid < num_segments; sid++)
{
const int segment_begin = h_offsets_begin[sid];
const int segment_end = h_offsets_end[sid];
if (keys_buffer.selector == 0)
{
thrust::copy(
keys.begin() + segment_begin,
keys.begin() + segment_end,
result_keys.begin() + segment_begin);
}
if (values_buffer.selector == 0)
{
thrust::copy(
values.begin() + segment_begin,
values.begin() + segment_end,
result_values.begin() + segment_begin);
}
}
}
AssertEquals(result_keys, expected_keys);
AssertEquals(result_values, expected_values);
thrust::sequence(keys.rbegin(), keys.rend());
thrust::sequence(values.rbegin(), values.rend());
result_keys = keys;
result_values = values;
{
std::size_t temp_storage_bytes{};
std::uint8_t *d_temp_storage{};
CubDebugExit(cub::DeviceSegmentedRadixSort::SortPairs(
d_temp_storage, temp_storage_bytes,
thrust::raw_pointer_cast(keys.data()),
thrust::raw_pointer_cast(result_keys.data()),
thrust::raw_pointer_cast(values.data()),
thrust::raw_pointer_cast(result_values.data()),
num_items, num_segments,
thrust::raw_pointer_cast(d_offsets_begin.data()),
thrust::raw_pointer_cast(d_offsets_end.data()),
0, sizeof(int) * 8));
thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
CubDebugExit(cub::DeviceSegmentedRadixSort::SortPairs(
d_temp_storage, temp_storage_bytes,
thrust::raw_pointer_cast(keys.data()),
thrust::raw_pointer_cast(result_keys.data()),
thrust::raw_pointer_cast(values.data()),
thrust::raw_pointer_cast(result_values.data()),
num_items, num_segments,
thrust::raw_pointer_cast(d_offsets_begin.data()),
thrust::raw_pointer_cast(d_offsets_end.data()),
0, sizeof(int) * 8));
}
AssertEquals(result_values, expected_values);
AssertEquals(result_keys, expected_keys);
}
}
#endif
#if TEST_KEY_BYTES == 4
// Following tests check that new decomposer API doesn't break old API.
// It's disabled because some compilers don't like implicit conversions, which
// is required for the test. Once we figure out how to temporarily enable conversion, we can
// re-enable the test.
#define ENABLING_CONVERSION_IS_FIGURED_OUT 0
#if ENABLING_CONVERSION_IS_FIGURED_OUT
struct bit_selector
{
int bit;
operator int() const
{
return bit;
}
};
template <class BeginBitT, class EndBitT, class... Ts>
void device_radix_sort_keys_allows_implicit_conversions_for_bits_helper(BeginBitT begin_bit, EndBitT end_bit, Ts... args)
{
const int num_items = 0;
{
std::size_t temp_storage_bytes = 0;
std::uint8_t *d_temp_storage = nullptr;
cub::DeviceRadixSort::SortKeys(d_temp_storage,
temp_storage_bytes,
args...,
num_items,
begin_bit,
end_bit);
}
{
std::size_t temp_storage_bytes = 0;
std::uint8_t *d_temp_storage = nullptr;
cub::DeviceRadixSort::SortKeys(d_temp_storage,
temp_storage_bytes,
args...,
num_items,
begin_bit);
}
{
std::size_t temp_storage_bytes = 0;
std::uint8_t *d_temp_storage = nullptr;
cub::DeviceRadixSort::SortKeysDescending(d_temp_storage,
temp_storage_bytes,
args...,
num_items,
begin_bit,
end_bit);
}
{
std::size_t temp_storage_bytes = 0;
std::uint8_t *d_temp_storage = nullptr;
cub::DeviceRadixSort::SortKeysDescending(d_temp_storage,
temp_storage_bytes,
args...,
num_items,
begin_bit);
}
}
template <class BeginBitT, class EndBitT, class... Ts>
void device_radix_sort_pairs_allows_implicit_conversions_for_bits_helper(BeginBitT begin_bit,
EndBitT end_bit,
Ts... args)
{
const int num_items = 0;
{
std::size_t temp_storage_bytes = 0;
std::uint8_t *d_temp_storage = nullptr;
cub::DeviceRadixSort::SortPairs(d_temp_storage,
temp_storage_bytes,
args...,
num_items,
begin_bit,
end_bit);
}
{
std::size_t temp_storage_bytes = 0;
std::uint8_t *d_temp_storage = nullptr;
cub::DeviceRadixSort::SortPairs(d_temp_storage,
temp_storage_bytes,
args...,
num_items,
begin_bit);
}
{
std::size_t temp_storage_bytes = 0;
std::uint8_t *d_temp_storage = nullptr;
cub::DeviceRadixSort::SortPairsDescending(d_temp_storage,
temp_storage_bytes,
args...,
num_items,
begin_bit,
end_bit);
}
{
std::size_t temp_storage_bytes = 0;
std::uint8_t *d_temp_storage = nullptr;
cub::DeviceRadixSort::SortPairsDescending(d_temp_storage,
temp_storage_bytes,
args...,
num_items,
begin_bit);
}
}
template <class BeginBitT, class EndBitT>
void device_radix_sort_allows_implicit_conversions_for_bits(BeginBitT begin_bit, EndBitT end_bit)
{
int *d_i_ptr = nullptr;
device_radix_sort_keys_allows_implicit_conversions_for_bits_helper(begin_bit,
end_bit,
d_i_ptr,
d_i_ptr);
cub::DoubleBuffer<int> keys(d_i_ptr, d_i_ptr);
device_radix_sort_keys_allows_implicit_conversions_for_bits_helper(begin_bit, end_bit, keys);
#if TEST_VALUE_TYPE == 2
unsigned int *d_u_ptr = nullptr;
device_radix_sort_pairs_allows_implicit_conversions_for_bits_helper(begin_bit,
end_bit,
d_i_ptr,
d_i_ptr,
d_u_ptr,
d_u_ptr);
cub::DoubleBuffer<unsigned int> pairs(d_u_ptr, d_u_ptr);
device_radix_sort_pairs_allows_implicit_conversions_for_bits_helper(begin_bit,
end_bit,
keys,
pairs);
#endif
}
void device_radix_sort_allows_implicit_conversions_for_bits()
{
int begin_i = 0;
long long int begin_lli = 0;
bit_selector begin_bs{0};
int end_i = 2;
long long int end_lli = 2;
bit_selector end_bs{2};
device_radix_sort_allows_implicit_conversions_for_bits(begin_i, end_i);
device_radix_sort_allows_implicit_conversions_for_bits(begin_i, end_lli);
device_radix_sort_allows_implicit_conversions_for_bits(begin_i, end_bs);
device_radix_sort_allows_implicit_conversions_for_bits(begin_lli, end_i);
device_radix_sort_allows_implicit_conversions_for_bits(begin_lli, end_lli);
device_radix_sort_allows_implicit_conversions_for_bits(begin_lli, end_bs);
device_radix_sort_allows_implicit_conversions_for_bits(begin_bs, end_i);
device_radix_sort_allows_implicit_conversions_for_bits(begin_bs, end_lli);
device_radix_sort_allows_implicit_conversions_for_bits(begin_bs, end_bs);
}
#endif // ENABLING_CONVERSION_IS_FIGURED_OUT
#endif // TEST_KEY_BYTES == 4
//---------------------------------------------------------------------
// Main
//---------------------------------------------------------------------
/**
* Main
*/
int main(int argc, char** argv)
{
std::size_t num_items = ~std::size_t(0);
int num_segments = -1;
// Initialize command line
CommandLineArgs args(argc, argv);
g_verbose = args.CheckCmdLineFlag("v");
args.GetCmdLineArgument("n", num_items);
args.GetCmdLineArgument("s", num_segments);
args.GetCmdLineArgument("i", g_timing_iterations);
// Print usage
if (args.CheckCmdLineFlag("help"))
{
printf("%s "
"[--n=<input items> "
"[--s=<num segments> "
"[--i=<timing iterations> "
"[--device=<device-id>] "
"[--v] "
"\n", argv[0]);
exit(0);
}
// Initialize device
CubDebugExit(args.DeviceInit());
// %PARAM% TEST_CDP cdp 0:1
// %PARAM% TEST_KEY_BYTES bytes 1:2:4:8:16
// %PARAM% TEST_VALUE_TYPE pairs 0:1:2:3
// 0->Keys only
// 1->uchar
// 2->uint
// 3->[ull,TestBar] (TestBar is cheap to build, included here to
// reduce total number of targets)
// To reduce testing time, some key types are only tested when not
// testing pairs:
#if TEST_VALUE_TYPE == 0
#define TEST_EXTENDED_KEY_TYPES
#endif
// Compile/run thorough tests
#if TEST_KEY_BYTES == 1
TestGen<signed char, true> (num_items, num_segments);
#ifdef TEST_EXTENDED_KEY_TYPES
TestGen<bool, false> (num_items, num_segments);
TestGen<signed char, false> (num_items, num_segments);
TestGen<unsigned char, false> (num_items, num_segments);
#endif // TEST_EXTENDED_KEY_TYPES
#elif TEST_KEY_BYTES == 2
TestGen<unsigned short, true> (num_items, num_segments);
#ifdef TEST_EXTENDED_KEY_TYPES
TestGen<short, false> (num_items, num_segments);
#if !_NVHPC_CUDA
TestGen<half_t, false> (num_items, num_segments);
#endif // CTK >= 9
#if !_NVHPC_CUDA
#if !defined(__ICC)
// Fails with `-0 != 0` with ICC for unknown reasons. See #333.
TestGen<bfloat16_t, false> (num_items, num_segments);
#endif // !ICC
#endif // CTK >= 11
#endif // TEST_EXTENDED_KEY_TYPES
#elif TEST_KEY_BYTES == 4
TestGen<int, true> (num_items, num_segments);
#if TEST_VALUE_TYPE == 0
TestUnspecifiedRanges();
#endif
#if ENABLING_CONVERSION_IS_FIGURED_OUT
device_radix_sort_allows_implicit_conversions_for_bits();
#endif
#ifdef TEST_EXTENDED_KEY_TYPES
TestGen<float, false> (num_items, num_segments);
TestGen<unsigned int, false> (num_items, num_segments);
#endif // TEST_EXTENDED_KEY_TYPES
#elif TEST_KEY_BYTES == 8
TestGen<double, true> (num_items, num_segments);
#ifdef TEST_EXTENDED_KEY_TYPES
TestGen<long long, false> (num_items, num_segments);
TestGen<unsigned long long, false>(num_items, num_segments);
#endif // TEST_EXTENDED_KEY_TYPES
#elif TEST_KEY_BYTES == 16
#if CUB_IS_INT128_ENABLED
TestGen<__int128_t, false>(num_items, num_segments);
TestGen<__uint128_t, false>(num_items, num_segments);
#else
// Fix unused static function for MSVC
BackendToString(CUB);
#endif
#endif // TEST_KEY_BYTES switch
return 0;
}