/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* ******************************************************************************/ /****************************************************************************** * Test of DeviceRadixSort utilities ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include #include #include #include #include #include #include #if !_NVHPC_CUDA #include #endif #if !_NVHPC_CUDA #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include "test_util.h" using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- bool g_verbose = false; int g_timing_iterations = 0; std::size_t g_smallest_pre_sorted_num_items = (std::size_t(1) << 32) - 42; CachingDeviceAllocator g_allocator(true); // Dispatch types enum Backend { CUB, // CUB method (allows overwriting of input) CUB_NO_OVERWRITE, // CUB method (disallows overwriting of input) CUB_SEGMENTED, // CUB method (allows overwriting of input) CUB_SEGMENTED_NO_OVERWRITE, // CUB method (disallows overwriting of input) // Same as above, but launches kernels from device using CDP. 
CDP, CDP_NO_OVERWRITE, CDP_SEGMENTED, CDP_SEGMENTED_NO_OVERWRITE, }; static const char* BackendToString(Backend b) { switch (b) { case CUB: return "CUB"; case CUB_NO_OVERWRITE: return "CUB_NO_OVERWRITE"; case CUB_SEGMENTED: return "CUB_SEGMENTED"; case CUB_SEGMENTED_NO_OVERWRITE: return "CUB_SEGMENTED_NO_OVERWRITE"; case CDP: return "CDP"; case CDP_NO_OVERWRITE: return "CDP_NO_OVERWRITE"; case CDP_SEGMENTED: return "CDP_SEGMENTED"; case CDP_SEGMENTED_NO_OVERWRITE: return "CDP_SEGMENTED_NO_OVERWRITE"; default: break; } return ""; } //--------------------------------------------------------------------- // Dispatch to different DeviceRadixSort entrypoints //--------------------------------------------------------------------- /** * Dispatch to CUB sorting entrypoint (specialized for ascending) */ template CUB_RUNTIME_FUNCTION cudaError_t Dispatch( Int2Type /*is_descending*/, Int2Type /*dispatch_to*/, int */*d_selector*/, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, DoubleBuffer &d_keys, DoubleBuffer &d_values, NumItemsT num_items, int /*num_segments*/, BeginOffsetIteratorT /*d_segment_begin_offsets*/, EndOffsetIteratorT /*d_segment_end_offsets*/, int begin_bit, int end_bit) { return DeviceRadixSort::SortPairs( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit); } /** * Dispatch to CUB_NO_OVERWRITE sorting entrypoint (specialized for ascending) */ template CUB_RUNTIME_FUNCTION cudaError_t Dispatch( Int2Type /*is_descending*/, Int2Type /*dispatch_to*/, int */*d_selector*/, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, DoubleBuffer &d_keys, DoubleBuffer &d_values, NumItemsT num_items, int /*num_segments*/, BeginOffsetIteratorT /*d_segment_begin_offsets*/, EndOffsetIteratorT /*d_segment_end_offsets*/, int begin_bit, int end_bit) { KeyT const *const_keys_itr = d_keys.Current(); ValueT const 
*const_values_itr = d_values.Current(); cudaError_t retval = DeviceRadixSort::SortPairs( d_temp_storage, temp_storage_bytes, const_keys_itr, d_keys.Alternate(), const_values_itr, d_values.Alternate(), num_items, begin_bit, end_bit); d_keys.selector ^= 1; d_values.selector ^= 1; return retval; } /** * Dispatch to CUB sorting entrypoint (specialized for descending) */ template CUB_RUNTIME_FUNCTION cudaError_t Dispatch( Int2Type /*is_descending*/, Int2Type /*dispatch_to*/, int */*d_selector*/, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, DoubleBuffer &d_keys, DoubleBuffer &d_values, NumItemsT num_items, int /*num_segments*/, BeginOffsetIteratorT /*d_segment_begin_offsets*/, EndOffsetIteratorT /*d_segment_end_offsets*/, int begin_bit, int end_bit) { return DeviceRadixSort::SortPairsDescending( d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit); } /** * Dispatch to CUB_NO_OVERWRITE sorting entrypoint (specialized for descending) */ template CUB_RUNTIME_FUNCTION cudaError_t Dispatch( Int2Type /*is_descending*/, Int2Type /*dispatch_to*/, int */*d_selector*/, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, DoubleBuffer &d_keys, DoubleBuffer &d_values, NumItemsT num_items, int /*num_segments*/, BeginOffsetIteratorT /*d_segment_begin_offsets*/, EndOffsetIteratorT /*d_segment_end_offsets*/, int begin_bit, int end_bit) { KeyT const *const_keys_itr = d_keys.Current(); ValueT const *const_values_itr = d_values.Current(); cudaError_t retval = DeviceRadixSort::SortPairsDescending( d_temp_storage, temp_storage_bytes, const_keys_itr, d_keys.Alternate(), const_values_itr, d_values.Alternate(), num_items, begin_bit, end_bit); d_keys.selector ^= 1; d_values.selector ^= 1; return retval; } //--------------------------------------------------------------------- // Dispatch to different DeviceRadixSort entrypoints 
//---------------------------------------------------------------------

// Validates that `num_items` fits into `int`
// TODO(canonizer): remove this check once num_items is templated for segmented sort.
template <typename NumItemsT>
__host__ __device__ bool ValidateNumItemsForSegmentedSort(NumItemsT num_items)
{
  if (static_cast<long long>(num_items) < static_cast<long long>(INT_MAX))
  {
    return true;
  }
  else
  {
    printf("cub::DeviceSegmentedRadixSort is currently limited by %d items but "
           "%lld were provided\n",
           INT_MAX,
           static_cast<long long>(num_items));
  }

  return false;
}

/**
 * Dispatch to CUB_SEGMENTED sorting entrypoint (specialized for ascending)
 */
template <typename KeyT,
          typename ValueT,
          typename BeginOffsetIteratorT,
          typename EndOffsetIteratorT,
          typename NumItemsT>
CUB_RUNTIME_FUNCTION
cudaError_t Dispatch(Int2Type<false> /*is_descending*/,
                     Int2Type<CUB_SEGMENTED> /*dispatch_to*/,
                     int * /*d_selector*/,
                     size_t * /*d_temp_storage_bytes*/,
                     cudaError_t * /*d_cdp_error*/,

                     void *d_temp_storage,
                     size_t &temp_storage_bytes,
                     DoubleBuffer<KeyT> &d_keys,
                     DoubleBuffer<ValueT> &d_values,
                     NumItemsT num_items,
                     int num_segments,
                     BeginOffsetIteratorT d_segment_begin_offsets,
                     EndOffsetIteratorT d_segment_end_offsets,
                     int begin_bit,
                     int end_bit)
{
  if (ValidateNumItemsForSegmentedSort(num_items))
  {
    // Segmented sort currently takes a 32-bit item count.
    return DeviceSegmentedRadixSort::SortPairs(d_temp_storage,
                                               temp_storage_bytes,
                                               d_keys,
                                               d_values,
                                               static_cast<int>(num_items),
                                               num_segments,
                                               d_segment_begin_offsets,
                                               d_segment_end_offsets,
                                               begin_bit,
                                               end_bit);
  }

  return cudaErrorInvalidValue;
}

/**
 * Dispatch to CUB_SEGMENTED_NO_OVERWRITE sorting entrypoint (specialized for ascending)
 */
template <typename KeyT,
          typename ValueT,
          typename BeginOffsetIteratorT,
          typename EndOffsetIteratorT,
          typename NumItemsT>
CUB_RUNTIME_FUNCTION
cudaError_t Dispatch(Int2Type<false> /*is_descending*/,
                     Int2Type<CUB_SEGMENTED_NO_OVERWRITE> /*dispatch_to*/,
                     int * /*d_selector*/,
                     size_t * /*d_temp_storage_bytes*/,
                     cudaError_t * /*d_cdp_error*/,

                     void *d_temp_storage,
                     size_t &temp_storage_bytes,
                     DoubleBuffer<KeyT> &d_keys,
                     DoubleBuffer<ValueT> &d_values,
                     NumItemsT num_items,
                     int num_segments,
                     BeginOffsetIteratorT d_segment_begin_offsets,
                     EndOffsetIteratorT d_segment_end_offsets,
                     int begin_bit,
                     int end_bit)
{
  if (ValidateNumItemsForSegmentedSort(num_items))
  {
    KeyT const *const_keys_itr     = d_keys.Current();
    ValueT const *const_values_itr = d_values.Current();

    cudaError_t retval =
      DeviceSegmentedRadixSort::SortPairs(d_temp_storage,
                                          temp_storage_bytes,
                                          const_keys_itr,
                                          d_keys.Alternate(),
                                          const_values_itr,
                                          d_values.Alternate(),
                                          static_cast<int>(num_items),
                                          num_segments,
                                          d_segment_begin_offsets,
                                          d_segment_end_offsets,
                                          begin_bit,
                                          end_bit);

    // Output lands in the alternate buffers; flip selectors accordingly.
    d_keys.selector ^= 1;
    d_values.selector ^= 1;
    return retval;
  }

  return cudaErrorInvalidValue;
}

/**
 * Dispatch to CUB_SEGMENTED sorting entrypoint (specialized for descending)
 */
template <typename KeyT,
          typename ValueT,
          typename BeginOffsetIteratorT,
          typename EndOffsetIteratorT,
          typename NumItemsT>
CUB_RUNTIME_FUNCTION
cudaError_t Dispatch(Int2Type<true> /*is_descending*/,
                     Int2Type<CUB_SEGMENTED> /*dispatch_to*/,
                     int * /*d_selector*/,
                     size_t * /*d_temp_storage_bytes*/,
                     cudaError_t * /*d_cdp_error*/,

                     void *d_temp_storage,
                     size_t &temp_storage_bytes,
                     DoubleBuffer<KeyT> &d_keys,
                     DoubleBuffer<ValueT> &d_values,
                     NumItemsT num_items,
                     int num_segments,
                     BeginOffsetIteratorT d_segment_begin_offsets,
                     EndOffsetIteratorT d_segment_end_offsets,
                     int begin_bit,
                     int end_bit)
{
  if (ValidateNumItemsForSegmentedSort(num_items))
  {
    return DeviceSegmentedRadixSort::SortPairsDescending(
      d_temp_storage,
      temp_storage_bytes,
      d_keys,
      d_values,
      static_cast<int>(num_items),
      num_segments,
      d_segment_begin_offsets,
      d_segment_end_offsets,
      begin_bit,
      end_bit);
  }

  return cudaErrorInvalidValue;
}

/**
 * Dispatch to CUB_SEGMENTED_NO_OVERWRITE sorting entrypoint (specialized for descending)
 */
template <typename KeyT,
          typename ValueT,
          typename BeginOffsetIteratorT,
          typename EndOffsetIteratorT,
          typename NumItemsT>
CUB_RUNTIME_FUNCTION
cudaError_t Dispatch(Int2Type<true> /*is_descending*/,
                     Int2Type<CUB_SEGMENTED_NO_OVERWRITE> /*dispatch_to*/,
                     int * /*d_selector*/,
                     size_t * /*d_temp_storage_bytes*/,
                     cudaError_t * /*d_cdp_error*/,

                     void *d_temp_storage,
                     size_t &temp_storage_bytes,
                     DoubleBuffer<KeyT> &d_keys,
                     DoubleBuffer<ValueT> &d_values,
                     NumItemsT num_items,
                     int num_segments,
                     BeginOffsetIteratorT d_segment_begin_offsets,
                     EndOffsetIteratorT d_segment_end_offsets,
                     int begin_bit,
                     int end_bit)
{
  if (ValidateNumItemsForSegmentedSort(num_items))
  {
    KeyT const *const_keys_itr     = d_keys.Current();
    ValueT const *const_values_itr = d_values.Current();

    cudaError_t retval =
      DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage,
                                                    temp_storage_bytes,
                                                    const_keys_itr,
                                                    d_keys.Alternate(),
                                                    const_values_itr,
                                                    d_values.Alternate(),
                                                    static_cast<int>(num_items),
                                                    num_segments,
                                                    d_segment_begin_offsets,
                                                    d_segment_end_offsets,
                                                    begin_bit,
                                                    end_bit);

    d_keys.selector ^= 1;
    d_values.selector ^= 1;
    return retval;
  }

  return cudaErrorInvalidValue;
}

//---------------------------------------------------------------------
// CUDA Nested Parallelism Test Kernel
//---------------------------------------------------------------------

#if TEST_CDP == 1

/**
 * Simple wrapper kernel to invoke DeviceRadixSort
 */
template <int CubBackend,
          bool IS_DESCENDING,
          typename KeyT,
          typename ValueT,
          typename BeginOffsetIteratorT,
          typename EndOffsetIteratorT,
          typename NumItemsT>
__global__ void CDPDispatchKernel(Int2Type<IS_DESCENDING> is_descending,
                                  Int2Type<CubBackend> cub_backend,
                                  int *d_selector,
                                  size_t *d_temp_storage_bytes,
                                  cudaError_t *d_cdp_error,

                                  void *d_temp_storage,
                                  size_t temp_storage_bytes,
                                  DoubleBuffer<KeyT> d_keys,
                                  DoubleBuffer<ValueT> d_values,
                                  NumItemsT num_items,
                                  int num_segments,
                                  BeginOffsetIteratorT d_segment_begin_offsets,
                                  EndOffsetIteratorT d_segment_end_offsets,
                                  int begin_bit,
                                  int end_bit)
{
  // Device-side dispatch to the requested (non-CDP) CUB backend; results and
  // updated selector are written through the provided device pointers so the
  // host can read them back.
  *d_cdp_error = Dispatch(is_descending,
                          cub_backend,
                          d_selector,
                          d_temp_storage_bytes,
                          d_cdp_error,
                          d_temp_storage,
                          temp_storage_bytes,
                          d_keys,
                          d_values,
                          num_items,
                          num_segments,
                          d_segment_begin_offsets,
                          d_segment_end_offsets,
                          begin_bit,
                          end_bit);

  *d_temp_storage_bytes = temp_storage_bytes;
  *d_selector           = d_keys.selector;
}

/**
 * Launch kernel and dispatch on device. Should only be called from host code.
 * The CubBackend should be one of the non-CDP CUB backends to invoke from the
 * device.
 */
template <int CubBackend,
          bool IS_DESCENDING,
          typename KeyT,
          typename ValueT,
          typename BeginOffsetIteratorT,
          typename EndOffsetIteratorT,
          typename NumItemsT>
cudaError_t LaunchCDPKernel(Int2Type<IS_DESCENDING> is_descending,
                            Int2Type<CubBackend> cub_backend,
                            int *d_selector,
                            size_t *d_temp_storage_bytes,
                            cudaError_t *d_cdp_error,

                            void *d_temp_storage,
                            size_t &temp_storage_bytes,
                            DoubleBuffer<KeyT> &d_keys,
                            DoubleBuffer<ValueT> &d_values,
                            NumItemsT num_items,
                            int num_segments,
                            BeginOffsetIteratorT d_segment_begin_offsets,
                            EndOffsetIteratorT d_segment_end_offsets,
                            int begin_bit,
                            int end_bit)
{
  // Invoke kernel to invoke device-side dispatch:
  cudaError_t retval =
    thrust::cuda_cub::launcher::triple_chevron(1, 1, 0, 0)
      .doit(CDPDispatchKernel<CubBackend,
                              IS_DESCENDING,
                              KeyT,
                              ValueT,
                              BeginOffsetIteratorT,
                              EndOffsetIteratorT,
                              NumItemsT>,
            is_descending,
            cub_backend,
            d_selector,
            d_temp_storage_bytes,
            d_cdp_error,
            d_temp_storage,
            temp_storage_bytes,
            d_keys,
            d_values,
            num_items,
            num_segments,
            d_segment_begin_offsets,
            d_segment_end_offsets,
            begin_bit,
            end_bit);
  CubDebugExit(retval);
  CubDebugExit(cub::detail::device_synchronize());

  // Copy out selector
  CubDebugExit(cudaMemcpy(&d_keys.selector,
                          d_selector,
                          sizeof(int) * 1,
                          cudaMemcpyDeviceToHost));
  d_values.selector = d_keys.selector;

  // Copy out temp_storage_bytes
  CubDebugExit(cudaMemcpy(&temp_storage_bytes,
                          d_temp_storage_bytes,
                          sizeof(size_t) * 1,
                          cudaMemcpyDeviceToHost));

  // Copy out error
  CubDebugExit(cudaMemcpy(&retval,
                          d_cdp_error,
                          sizeof(cudaError_t) * 1,
                          cudaMemcpyDeviceToHost));

  return retval;
}

// Specializations of Dispatch that translate the CDP backend to the appropriate
// CUB backend, and uses the CUB backend to launch the CDP kernel.
#define DEFINE_CDP_DISPATCHER(CdpBackend, CubBackend)                          \
  template <bool IS_DESCENDING,                                                \
            typename KeyT,                                                     \
            typename ValueT,                                                   \
            typename BeginOffsetIteratorT,                                     \
            typename EndOffsetIteratorT,                                       \
            typename NumItemsT>                                                \
  cudaError_t Dispatch(Int2Type<IS_DESCENDING> is_descending,                  \
                       Int2Type<CdpBackend> /*dispatch_to*/,                   \
                       int *d_selector,                                        \
                       size_t *d_temp_storage_bytes,                           \
                       cudaError_t *d_cdp_error,                               \
                                                                               \
                       void *d_temp_storage,                                   \
                       size_t &temp_storage_bytes,                             \
                       DoubleBuffer<KeyT> &d_keys,                             \
                       DoubleBuffer<ValueT> &d_values,                         \
                       NumItemsT num_items,                                    \
                       int num_segments,                                       \
                       BeginOffsetIteratorT d_segment_begin_offsets,           \
                       EndOffsetIteratorT d_segment_end_offsets,               \
                       int begin_bit,                                          \
                       int end_bit)                                            \
  {                                                                            \
    Int2Type<CubBackend> cub_backend{};                                        \
    return LaunchCDPKernel(is_descending,                                      \
                           cub_backend,                                        \
                           d_selector,                                         \
                           d_temp_storage_bytes,                               \
                           d_cdp_error,                                        \
                           d_temp_storage,                                     \
                           temp_storage_bytes,                                 \
                           d_keys,                                             \
                           d_values,                                           \
                           num_items,                                          \
                           num_segments,                                       \
                           d_segment_begin_offsets,                            \
                           d_segment_end_offsets,                              \
                           begin_bit,                                          \
                           end_bit);                                           \
  }

DEFINE_CDP_DISPATCHER(CDP, CUB)
DEFINE_CDP_DISPATCHER(CDP_NO_OVERWRITE, CUB_NO_OVERWRITE)
DEFINE_CDP_DISPATCHER(CDP_SEGMENTED, CUB_SEGMENTED)
DEFINE_CDP_DISPATCHER(CDP_SEGMENTED_NO_OVERWRITE, CUB_SEGMENTED_NO_OVERWRITE)

#undef DEFINE_CDP_DISPATCHER

#endif // TEST_CDP

//---------------------------------------------------------------------
// Problem generation
//---------------------------------------------------------------------

/**
 * Simple key-value pairing
 */
template <typename KeyT, typename ValueT>
struct Pair
{
  KeyT key;
  ValueT value;

  bool operator<(const Pair &b) const
  {
    return (key < b.key);
  }
};

/**
 * Simple key-value pairing (specialized for bool types)
 */
template <typename ValueT>
struct Pair<bool, ValueT>
{
  bool key;
  ValueT value;

  bool operator<(const Pair &b) const
  {
    // false sorts before true
    return (!key && b.key);
  }
};

/**
 * Initialize key data
 */
template <typename KeyT, typename NumItemsT>
void InitializeKeyBits(GenMode gen_mode,
                       KeyT *h_keys,
                       NumItemsT num_items,
                       int /*entropy_reduction*/)
{
  for (NumItemsT i = 0; i < num_items; ++i)
    InitValue(gen_mode, h_keys[i], i);
}

// Returns the raw bit pattern of `key` as the matching cub::Traits
// unsigned-integer type.
template <typename KeyT,
          typename UnsignedBits = typename cub::Traits<KeyT>::UnsignedBits>
UnsignedBits KeyBits(KeyT key)
{
  UnsignedBits bits;
  memcpy(&bits, &key, sizeof(KeyT));
  return bits;
}

/**
 * Initialize the reference array monotonically.
*/ template void InitializeKeysSorted( KeyT *h_keys, NumItemsT num_items) { using TraitsT = cub::Traits; using UnsignedBits = typename TraitsT::UnsignedBits; // Numbers to generate random runs. UnsignedBits max_inc = 1 << (sizeof(UnsignedBits) < 4 ? 3 : (sizeof(UnsignedBits) < 8 ? 14 : 24)); UnsignedBits min_bits = TraitsT::TwiddleIn(KeyBits(TraitsT::Lowest())); UnsignedBits max_bits = TraitsT::TwiddleIn(KeyBits(TraitsT::Max())); NumItemsT max_run = std::max( NumItemsT(double(num_items) * (max_inc + 1) / max_bits), NumItemsT(1 << 14)); UnsignedBits *h_key_bits = reinterpret_cast(h_keys); NumItemsT i = 0; // Start with the minimum twiddled key. UnsignedBits twiddled_key = min_bits; while (i < num_items) { // Generate random increment (avoid overflow). UnsignedBits inc_bits = 0; RandomBits(inc_bits); // twiddled_key < max_bits at this point. UnsignedBits inc = static_cast(std::min(1 + inc_bits % max_inc, max_bits - twiddled_key)); twiddled_key += inc; // Generate random run length (ensure there are enough values to fill the rest). NumItemsT run_bits = 0; RandomBits(run_bits); NumItemsT run_length = std::min(1 + run_bits % max_run, num_items - i); if (twiddled_key == max_bits) run_length = num_items - i; NumItemsT run_end = i + run_length; // Fill the array. UnsignedBits key = TraitsT::TwiddleOut(twiddled_key); // Avoid -0.0 for floating-point keys. 
UnsignedBits negative_zero = UnsignedBits(1) << UnsignedBits(sizeof(UnsignedBits) * 8 - 1); if (TraitsT::CATEGORY == cub::FLOATING_POINT && key == negative_zero) { key = 0; } for (; i < run_end; ++i) { h_key_bits[i] = key; } } } /** * Initialize solution */ template void InitializeSolution( KeyT *h_keys, NumItemsT num_items, int num_segments, bool pre_sorted, NumItemsT *h_segment_offsets, int begin_bit, int end_bit, NumItemsT *&h_reference_ranks, KeyT *&h_reference_keys) { if (num_items == 0) { h_reference_ranks = nullptr; h_reference_keys = nullptr; return; } if (pre_sorted) { printf("Shuffling reference solution on CPU\n"); // Note: begin_bit and end_bit are ignored here, and assumed to have the // default values (begin_bit == 0, end_bit == 8 * sizeof(KeyT)). // Otherwise, pre-sorting won't work, as it doesn't necessarily // correspond to the order of keys sorted by a subrange of bits. // num_segments is also ignored as assumed to be 1, as pre-sorted tests // are currently not supported for multiple segments. // // Pre-sorted tests with non-default begin_bit, end_bit or num_segments // != 1 are skipped in TestBits() and TestSegments(), respectively. AssertEquals(begin_bit, 0); AssertEquals(end_bit, static_cast(8 * sizeof(KeyT))); AssertEquals(num_segments, 1); // Copy to the reference solution. h_reference_keys = new KeyT[num_items]; if (IS_DESCENDING) { // Copy in reverse. for (NumItemsT i = 0; i < num_items; ++i) { h_reference_keys[i] = h_keys[num_items - 1 - i]; } // Copy back. memcpy(h_keys, h_reference_keys, num_items * sizeof(KeyT)); } else { memcpy(h_reference_keys, h_keys, num_items * sizeof(KeyT)); } // Summarize the pre-sorted array (element, 1st position, count). struct Element { KeyT key; NumItemsT num; NumItemsT index; }; std::vector summary; KeyT cur_key = h_reference_keys[0]; summary.push_back(Element{cur_key, 1, 0}); for (NumItemsT i = 1; i < num_items; ++i) { KeyT key = h_reference_keys[i]; if (key == cur_key) { // Same key. 
summary.back().num++; continue; } // Different key. cur_key = key; summary.push_back(Element{cur_key, 1, i}); } // Generate a random permutation from the summary. Such a complicated // approach is used to permute the array and compute ranks in a // cache-friendly way and in a short time. if (WANT_RANKS) { h_reference_ranks = new NumItemsT[num_items]; } NumItemsT max_run = 32; NumItemsT run = 0; NumItemsT i = 0; while (summary.size() > 0) { // Pick up a random element and a run. NumItemsT bits = 0; RandomBits(bits); NumItemsT summary_id = bits % summary.size(); Element& element = summary[summary_id]; run = std::min(1 + bits % (max_run - 1), element.num); for (NumItemsT j = 0; j < run; ++j) { h_keys[i + j] = element.key; if (WANT_RANKS) { h_reference_ranks[element.index + j] = i + j; } } i += run; element.index += run; element.num -= run; if (element.num == 0) { // Remove the empty entry. std::swap(summary[summary_id], summary.back()); summary.pop_back(); } } printf(" Done.\n"); } else { typedef Pair PairT; PairT *h_pairs = new PairT[num_items]; int num_bits = end_bit - begin_bit; for (NumItemsT i = 0; i < num_items; ++i) { // Mask off unwanted portions if (num_bits < static_cast(sizeof(KeyT) * 8)) { using UnsignedBits = typename cub::Traits::UnsignedBits; UnsignedBits base = 0; memcpy(&base, &h_keys[i], sizeof(KeyT)); base &= ((UnsignedBits{1} << num_bits) - 1) << begin_bit; memcpy(&h_pairs[i].key, &base, sizeof(KeyT)); } else { h_pairs[i].key = h_keys[i]; } h_pairs[i].value = i; } printf("\nSorting reference solution on CPU " "(%zd items, %d segments, %zd items/seg)...", static_cast(num_items), num_segments, static_cast(num_items / num_segments)); fflush(stdout); for (int i = 0; i < num_segments; ++i) { if (IS_DESCENDING) std::reverse(h_pairs + h_segment_offsets[i], h_pairs + h_segment_offsets[i + 1]); std::stable_sort( h_pairs + h_segment_offsets[i], h_pairs + h_segment_offsets[i + 1]); if (IS_DESCENDING) std::reverse(h_pairs + h_segment_offsets[i], h_pairs + 
h_segment_offsets[i + 1]); } printf(" Done.\n"); fflush(stdout); if (WANT_RANKS) { h_reference_ranks = new NumItemsT[num_items]; } h_reference_keys = new KeyT[num_items]; for (NumItemsT i = 0; i < num_items; ++i) { if (WANT_RANKS) { h_reference_ranks[i] = h_pairs[i].value; } h_reference_keys[i] = h_keys[h_pairs[i].value]; } if (h_pairs) delete[] h_pairs; } } template void ResetKeys(KeyT *h_keys, NumItemsT num_items, bool pre_sorted, KeyT *reference_keys) { if (!pre_sorted) return; // Copy the reference keys back. if (IS_DESCENDING) { // Keys need to be copied in reverse. for (NumItemsT i = 0; i < num_items; ++i) { h_keys[i] = reference_keys[num_items - 1 - i]; } } else { memcpy(h_keys, reference_keys, num_items * sizeof(KeyT)); } } //--------------------------------------------------------------------- // Test generation //--------------------------------------------------------------------- template struct UnwrapHalfAndBfloat16 { using Type = T; }; #if !_NVHPC_CUDA template <> struct UnwrapHalfAndBfloat16 { using Type = __half; }; #endif #if !_NVHPC_CUDA template <> struct UnwrapHalfAndBfloat16 { using Type = __nv_bfloat16; }; #endif template int compare_device_arrays(T* host_reference, T* d_tmp_buffer, T* d_data, NumItemsT num_items) { CubDebugExit(cudaMemcpy(d_tmp_buffer, host_reference, sizeof(T) * num_items, cudaMemcpyHostToDevice)); auto d_reference_begin = d_tmp_buffer; auto d_reference_end = d_tmp_buffer + num_items; auto err = thrust::mismatch(thrust::device, d_reference_begin, d_reference_end, d_data); if (err.first != d_reference_end) { const auto index = thrust::distance(d_reference_begin, err.first); return CompareDeviceResults(host_reference + index, d_data + index, 1, true, g_verbose); } return 0; } template int compare_device_arrays( CUB_NS_QUALIFIER::NullType */* h_reference */, CUB_NS_QUALIFIER::NullType */* d_tmp_buffer */, CUB_NS_QUALIFIER::NullType */* d_data */, NumItemsT /* num_items */) { return 0; } /** * Test DeviceRadixSort */ template < 
Backend BACKEND, bool IS_DESCENDING, typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT, typename NumItemsT> void Test( KeyT *h_keys, ValueT *h_values, NumItemsT num_items, int num_segments, BeginOffsetIteratorT d_segment_begin_offsets, EndOffsetIteratorT d_segment_end_offsets, int begin_bit, int end_bit, KeyT *h_reference_keys, ValueT *h_reference_values) { // Key alias type using KeyAliasT = typename UnwrapHalfAndBfloat16::Type; const bool KEYS_ONLY = std::is_same::value; printf("%s %s cub::DeviceRadixSort %zd items, %d segments, " "%d-byte keys (%s) %d-byte values (%s), %d-byte num_items (%s), " "descending %d, begin_bit %d, end_bit %d\n", BackendToString(BACKEND), (KEYS_ONLY) ? "keys-only" : "key-value", static_cast(num_items), num_segments, static_cast(sizeof(KeyT)), typeid(KeyT).name(), (KEYS_ONLY) ? 0 : static_cast(sizeof(ValueT)), typeid(ValueT).name(), static_cast(sizeof(NumItemsT)), typeid(NumItemsT).name(), IS_DESCENDING, begin_bit, end_bit); if (g_verbose) { printf("Input keys:\n"); DisplayResults(h_keys, num_items); printf("\n\n"); } // Allocate device arrays DoubleBuffer d_keys; DoubleBuffer d_values; int *d_selector; size_t *d_temp_storage_bytes; cudaError_t *d_cdp_error; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[0], sizeof(KeyT) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[1], sizeof(KeyT) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_selector, sizeof(int) * 1)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes, sizeof(size_t) * 1)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error, sizeof(cudaError_t) * 1)); if (!KEYS_ONLY) { CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[0], sizeof(ValueT) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[1], sizeof(ValueT) * num_items)); } // Allocate temporary storage (and make it un-aligned) size_t 
temp_storage_bytes = 0; void *d_temp_storage = NULL; CubDebugExit(Dispatch( Int2Type(), Int2Type(), d_selector, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit)); CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes + 1)); void* mis_aligned_temp = static_cast(d_temp_storage) + 1; // Initialize/clear device arrays d_keys.selector = 0; CubDebugExit(cudaMemcpy(d_keys.d_buffers[0], h_keys, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice)); CubDebugExit(cudaMemset(d_keys.d_buffers[1], 0, sizeof(KeyT) * num_items)); if (!KEYS_ONLY) { d_values.selector = 0; CubDebugExit(cudaMemcpy(d_values.d_buffers[0], h_values, sizeof(ValueT) * num_items, cudaMemcpyHostToDevice)); CubDebugExit(cudaMemset(d_values.d_buffers[1], 0, sizeof(ValueT) * num_items)); } // Run warmup/correctness iteration CubDebugExit(Dispatch( Int2Type(), Int2Type(), d_selector, d_temp_storage_bytes, d_cdp_error, mis_aligned_temp, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit)); // Flush any stdout/stderr fflush(stdout); fflush(stderr); // Check for correctness (and display results, if specified) printf("Warmup done. Checking results:\n"); fflush(stdout); int compare = 0; // If in/out API is used, we are not allowed to overwrite the input. // Let's check that the input buffer is not overwritten by the algorithm. if (BACKEND == CUB_NO_OVERWRITE) { KeyT *d_input_keys = reinterpret_cast(d_keys.d_buffers[0]); if (temp_storage_bytes < sizeof(KeyT) * num_items) { // For small input sizes, temporary storage is not large enough to fit keys. compare = CompareDeviceResults(h_keys, d_input_keys, num_items, true, g_verbose); } else { // If overwrite is not allowed, temporary storage is large enough to fit keys. 
KeyT* temp_keys = reinterpret_cast(d_temp_storage); compare = compare_device_arrays(h_keys, temp_keys, d_input_keys, num_items); printf("\t Compare input keys: %s ", compare ? "FAIL" : "PASS"); fflush(stdout); } } // After the previous check is done, we can safely reuse alternative buffer to store // the reference results and compare current output. compare |= compare_device_arrays(h_reference_keys, reinterpret_cast(d_keys.Alternate()), reinterpret_cast(d_keys.Current()), num_items); printf("\t Compare keys (selector %d): %s ", d_keys.selector, compare ? "FAIL" : "PASS"); fflush(stdout); if (!KEYS_ONLY) { int values_compare = compare_device_arrays(h_reference_values, d_values.Alternate(), d_values.Current(), num_items); compare |= values_compare; printf("\t Compare values (selector %d): %s ", d_values.selector, values_compare ? "FAIL" : "PASS"); fflush(stdout); } // Performance if (g_timing_iterations) printf("\nPerforming timing iterations:\n"); fflush(stdout); GpuTimer gpu_timer; float elapsed_millis = 0.0f; for (int i = 0; i < g_timing_iterations; ++i) { // Initialize/clear device arrays CubDebugExit(cudaMemcpy(d_keys.d_buffers[d_keys.selector], h_keys, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice)); CubDebugExit(cudaMemset(d_keys.d_buffers[d_keys.selector ^ 1], 0, sizeof(KeyT) * num_items)); if (!KEYS_ONLY) { CubDebugExit(cudaMemcpy(d_values.d_buffers[d_values.selector], h_values, sizeof(ValueT) * num_items, cudaMemcpyHostToDevice)); CubDebugExit(cudaMemset(d_values.d_buffers[d_values.selector ^ 1], 0, sizeof(ValueT) * num_items)); } gpu_timer.Start(); CubDebugExit(Dispatch( Int2Type(), Int2Type(), d_selector, d_temp_storage_bytes, d_cdp_error, mis_aligned_temp, temp_storage_bytes, d_keys, d_values, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit)); gpu_timer.Stop(); elapsed_millis += gpu_timer.ElapsedMillis(); } // Display performance if (g_timing_iterations > 0) { float avg_millis = elapsed_millis / 
g_timing_iterations; float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f; float giga_bandwidth = (KEYS_ONLY) ? giga_rate * sizeof(KeyT) * 2 : giga_rate * (sizeof(KeyT) + sizeof(ValueT)) * 2; printf("\n%.3f elapsed ms, %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", elapsed_millis, avg_millis, giga_rate, giga_bandwidth); } printf("\n\n"); // Cleanup if (d_keys.d_buffers[0]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[0])); if (d_keys.d_buffers[1]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[1])); if (d_values.d_buffers[0]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[0])); if (d_values.d_buffers[1]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[1])); if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error)); if (d_selector) CubDebugExit(g_allocator.DeviceFree(d_selector)); if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes)); // Correctness asserts AssertEquals(0, compare); } // Returns whether there is enough memory for the test. template bool HasEnoughMemory(std::size_t num_items, bool overwrite) { std::size_t total_mem = TotalGlobalMem(); std::size_t value_size = std::is_same::value ? 0 : sizeof(ValueT); // A conservative estimate of the amount of memory required. double factor = overwrite ? 
2.25 : 3.25; std::size_t test_mem = static_cast (num_items * (sizeof(KeyT) + value_size) * factor); return test_mem < total_mem; } /** * Test backend */ template void TestBackend(KeyT *h_keys, NumItemsT num_items, int num_segments, BeginOffsetIteratorT d_segment_begin_offsets, EndOffsetIteratorT d_segment_end_offsets, int begin_bit, int end_bit, KeyT *h_reference_keys, NumItemsT *h_reference_ranks) { #if TEST_CDP == 0 constexpr auto NonSegmentedOverwrite = CUB; constexpr auto NonSegmentedNoOverwrite = CUB_NO_OVERWRITE; constexpr auto SegmentedOverwrite = CUB_SEGMENTED; constexpr auto SegmentedNoOverwrite = CUB_SEGMENTED_NO_OVERWRITE; #else // TEST_CDP constexpr auto NonSegmentedOverwrite = CDP; constexpr auto NonSegmentedNoOverwrite = CDP_NO_OVERWRITE; constexpr auto SegmentedOverwrite = CDP_SEGMENTED; constexpr auto SegmentedNoOverwrite = CDP_SEGMENTED_NO_OVERWRITE; #endif // TEST_CDP const bool KEYS_ONLY = std::is_same::value; // A conservative check assuming overwrite is allowed. if (!HasEnoughMemory(static_cast(num_items), true)) { printf("Skipping the test due to insufficient device memory\n"); return; } std::unique_ptr h_value_data{}; ValueT *h_values = nullptr; ValueT *h_reference_values = nullptr; if (!KEYS_ONLY) { h_value_data.reset(new ValueT[2 * static_cast(num_items)]); h_values = h_value_data.get(); h_reference_values = h_value_data.get() + num_items; for (NumItemsT i = 0; i < num_items; ++i) { InitValue(INTEGER_SEED, h_values[i], i); InitValue(INTEGER_SEED, h_reference_values[i], h_reference_ranks[i]); } } // Skip segmented sort if num_items isn't int. // TODO(64bit-seg-sort): re-enable these tests once num_items is templated for // segmented sort. 
if (std::is_same::value) { printf("Testing segmented sort with overwrite\n"); Test(h_keys, h_values, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit, h_reference_keys, h_reference_values); printf("Testing segmented sort with no overwrite\n"); Test(h_keys, h_values, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit, h_reference_keys, h_reference_values); } else { printf("Skipping segmented sort tests (NumItemsT != int)\n"); } if (num_segments == 1) { printf("Testing non-segmented sort with overwrite\n"); Test(h_keys, h_values, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit, h_reference_keys, h_reference_values); if (HasEnoughMemory(static_cast(num_items), false)) { printf("Testing non-segmented sort with no overwrite\n"); Test(h_keys, h_values, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit, h_reference_keys, h_reference_values); } else { printf("Skipping no-overwrite tests with %zd items due to " "insufficient memory\n", static_cast(num_items)); } } } // Smallest value type for TEST_VALUE_TYPE. // Unless TEST_VALUE_TYPE == 3, this is the only value type tested. 
// Map the TEST_VALUE_TYPE build parameter to the value type under test.
#if TEST_VALUE_TYPE == 0
// Test keys-only
using SmallestValueT = NullType;
#elif TEST_VALUE_TYPE == 1
// Test with 8b value
using SmallestValueT = unsigned char;
#elif TEST_VALUE_TYPE == 2
// Test with 32b value
using SmallestValueT = unsigned int;
// Test with 64b value
#elif TEST_VALUE_TYPE == 3
using SmallestValueT = unsigned long long;
#endif

/**
 * Test value type
 *
 * Computes the reference solution (sorted keys plus, when values are in
 * play, the rank permutation) once and feeds it to TestBackend for each
 * value type under test.
 *
 * NOTE(review): template parameter lists were stripped from this source by a
 * sanitizer; bodies re-wrapped onto lines, all other tokens unchanged.
 */
template
void TestValueTypes(
    KeyT *h_keys,
    NumItemsT num_items,
    int num_segments,
    bool pre_sorted,
    NumItemsT *h_segment_offsets,
    BeginOffsetIteratorT d_segment_begin_offsets,
    EndOffsetIteratorT d_segment_end_offsets,
    int begin_bit,
    int end_bit)
{
  // Initialize the solution
  NumItemsT *h_reference_ranks = NULL;
  KeyT *h_reference_keys = NULL;

  // If TEST_VALUE_TYPE == 0, no values are sorted, only keys.
  // Since ranks are only necessary when checking for values,
  // they are not computed in this case.
  InitializeSolution(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets, begin_bit, end_bit, h_reference_ranks, h_reference_keys);

  TestBackend (h_keys, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit, h_reference_keys, h_reference_ranks);

#if TEST_VALUE_TYPE == 3
  // Test with non-trivially-constructable value
  // These are cheap to build, so lump them in with the 64b value tests.
// Second TestBackend run (TestBar value type -- exercises values that are
// not trivially constructible), enabled only for the 64-bit-value build.
TestBackend (h_keys, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit, h_reference_keys, h_reference_ranks);
#endif

// Cleanup
// ResetKeys restores h_keys for the next bit-range/direction iteration
// (pre-sorted inputs are regenerated from the sorted reference).
ResetKeys(h_keys, num_items, pre_sorted, h_reference_keys);

if (h_reference_ranks) delete[] h_reference_ranks;
if (h_reference_keys) delete[] h_reference_keys;
}

/**
 * Test ascending/descending
 *
 * Runs the value-type matrix once per sort direction.
 * NOTE(review): the direction flag was carried in a stripped template
 * argument to TestValueTypes -- confirm against upstream.
 */
template
void TestDirection(
    KeyT *h_keys,
    NumItemsT num_items,
    int num_segments,
    bool pre_sorted,
    NumItemsT *h_segment_offsets,
    BeginOffsetIteratorT d_segment_begin_offsets,
    EndOffsetIteratorT d_segment_end_offsets,
    int begin_bit,
    int end_bit)
{
  TestValueTypes(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit);
  TestValueTypes(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit);
}

/**
 * Test different bit ranges
 *
 * Full-width sorting is always tested; partial-bit ranges are restricted to
 * plain unsigned integer keys (bit-flip tricks used for signed/fp/bool keys
 * make partial ranges meaningless) and to non-pre-sorted inputs.
 */
template
void TestBits(
    KeyT *h_keys,
    NumItemsT num_items,
    int num_segments,
    bool pre_sorted,
    NumItemsT *h_segment_offsets,
    BeginOffsetIteratorT d_segment_begin_offsets,
    EndOffsetIteratorT d_segment_end_offsets)
{
  // Don't test partial-word sorting for boolean, fp, or signed types (the bit-flipping techniques get in the way) or pre-sorted keys
  if ((Traits::CATEGORY == UNSIGNED_INTEGER) && (!std::is_same::value) && !pre_sorted)
  {
    // Partial bits
    int begin_bit = 1;
    int end_bit = (sizeof(KeyT) * 8) - 1;
    printf("Testing key bits [%d,%d)\n", begin_bit, end_bit);
    fflush(stdout);
    TestDirection(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit);

    // Equal bits
    begin_bit = end_bit = 0;
    printf("Testing key bits [%d,%d)\n", begin_bit, end_bit);
    fflush(stdout);
    TestDirection(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit);

    // Across subword boundaries
    int mid_bit = sizeof(KeyT) * 4;
    // Two-bit window straddling the subword (half-key) boundary.
    printf("Testing key bits [%d,%d)\n", mid_bit - 1, mid_bit + 1);
    fflush(stdout);
    TestDirection(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets, d_segment_begin_offsets, d_segment_end_offsets, mid_bit - 1, mid_bit + 1);
  }

  // Full key width -- always tested regardless of key category.
  printf("Testing key bits [%d,%d)\n", 0, int(sizeof(KeyT)) * 8);
  fflush(stdout);
  TestDirection(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets, d_segment_begin_offsets, d_segment_end_offsets, 0, sizeof(KeyT) * 8);
}

// Identity offset functor. Two distinct-but-identical functor types exist so
// the begin- and end-offset iterators below have different static types,
// exercising the independently-templated iterator parameters of the API.
template
struct TransformFunctor1
{
  __host__ __device__ __forceinline__ OffsetT operator()(OffsetT offset) const
  {
    return offset;
  }
};

// Second identity offset functor; see TransformFunctor1 for why it exists.
template
struct TransformFunctor2
{
  __host__ __device__ __forceinline__ OffsetT operator()(OffsetT offset) const
  {
    return offset;
  }
};

/**
 * Test different segment iterators
 *
 * Regenerates the segment offsets on the host, copies them to the device,
 * then drives the bit-range matrix with raw offset pointers (and, further
 * below, with transform iterators).
 */
template
void TestSegmentIterators(
    KeyT *h_keys,
    NumItemsT num_items,
    int num_segments,
    bool pre_sorted,
    NumItemsT *h_segment_offsets,
    NumItemsT *d_segment_offsets)
{
  InitializeSegments(num_items, num_segments, h_segment_offsets);
  // num_segments + 1 entries: offsets array is fenceposted (begin i = entry i,
  // end i = entry i + 1).
  CubDebugExit(cudaMemcpy(d_segment_offsets, h_segment_offsets, sizeof(NumItemsT) * (num_segments + 1), cudaMemcpyHostToDevice));

  // Test with segment pointer.
  // This is also used to test non-segmented sort.
  // Raw-pointer offsets: end offsets are just the begin offsets shifted by one.
  TestBits(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets, d_segment_offsets, d_segment_offsets + 1);

  if (num_segments > 1)
  {
    // Test with transform iterators of different types
    typedef TransformFunctor1 TransformFunctor1T;
    typedef TransformFunctor2 TransformFunctor2T;

    TransformInputIterator d_segment_begin_offsets_itr(d_segment_offsets, TransformFunctor1T());
    TransformInputIterator d_segment_end_offsets_itr(d_segment_offsets + 1, TransformFunctor2T());

    TestBits(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets, d_segment_begin_offsets_itr, d_segment_end_offsets_itr);
  }
}

/**
 * Test different segment compositions
 *
 * Sweeps the segment count downward (dividing by 64 each step), then runs
 * the single-segment configuration, which is also how the non-segmented
 * entry points get exercised. Owns the host/device offset arrays.
 */
template
void TestSegments(
    KeyT *h_keys,
    NumItemsT num_items,
    int max_segments,
    bool pre_sorted)
{
  // Never use more segments than items.
  max_segments = static_cast(CUB_MIN(num_items, static_cast(max_segments)));

  NumItemsT *h_segment_offsets = new NumItemsT[max_segments + 1];

  NumItemsT *d_segment_offsets = nullptr;
  CubDebugExit(g_allocator.DeviceAllocate((void**)&d_segment_offsets, sizeof(NumItemsT) * (max_segments + 1)));

  for (int num_segments = max_segments; num_segments > 1; num_segments = cub::DivideAndRoundUp(num_segments, 64))
  {
    // Pre-sorted tests are not supported for segmented sort
    if (num_items / num_segments < 128 * 1000 && !pre_sorted)
    {
      // Right now we assign a single thread block to each segment, so lets keep it to under 128K items per segment
      TestSegmentIterators(h_keys, num_items, num_segments, pre_sorted, h_segment_offsets, d_segment_offsets);
    }
  }

  // Test single segment
  if (num_items > 0)
  {
    if (num_items < 128 * 1000 || pre_sorted)
    {
      // Right now we assign a single thread block to each segment, so lets
      // keep it to under 128K items per segment
      TestSegmentIterators(h_keys, num_items, 1, pre_sorted, h_segment_offsets, d_segment_offsets);
    }
  }

  if (h_segment_offsets) delete[] h_segment_offsets;
  if (d_segment_offsets) CubDebugExit(g_allocator.DeviceFree(d_segment_offsets));
}

/**
 * Test different NumItemsT, i.e.
types of num_items
 */
// Dispatches the same problem through 32-bit and/or 64-bit offset-type
// instantiations, depending on whether num_items fits the narrower type.
// NOTE(review): the numeric_limits targets and TestSegments offset-type
// arguments were stripped template arguments -- confirm against upstream.
template
void TestNumItems(KeyT *h_keys,
                  std::size_t num_items,
                  int max_segments,
                  bool pre_sorted)
{
  if (!pre_sorted && num_items <= std::size_t(std::numeric_limits::max()))
  {
    TestSegments(h_keys, static_cast(num_items), max_segments, pre_sorted);
  }

  if (pre_sorted && num_items <= std::size_t(std::numeric_limits::max()))
  {
    TestSegments(h_keys, static_cast(num_items), max_segments, pre_sorted);
  }

  TestSegments(h_keys, num_items, max_segments, pre_sorted);
}

/**
 * Test different (sub)lengths and number of segments
 */
template
void TestSizes(KeyT* h_keys,
               std::size_t max_items,
               int max_segments,
               bool pre_sorted)
{
  if (pre_sorted)
  {
    // run a specific list of sizes, up to max_items
    std::size_t sizes[] = {g_smallest_pre_sorted_num_items, 4350000007ull};
    for (std::size_t num_items : sizes)
    {
      if (num_items > max_items) break;
      TestNumItems(h_keys, num_items, max_segments, pre_sorted);
    }
  }
  else
  {
    // Shrink the problem size geometrically (divide by 64 each step).
    for (std::size_t num_items = max_items; num_items > 1; num_items = cub::DivideAndRoundUp(num_items, 64))
    {
      TestNumItems(h_keys, num_items, max_segments, pre_sorted);
    }
  }
}

/**
 * Test key sampling distributions
 *
 * Top-level per-key-type driver: allocates the key buffer, then runs the size
 * matrix for each key distribution (random at two entropy levels, +/-0.0 for
 * floating point, uniform, natural numbers, and optionally huge pre-sorted).
 */
template
void TestGen(
    std::size_t max_items,
    int max_segments)
{
  // ~0 / negative act as "use default" sentinels from the command line.
  if (max_items == ~std::size_t(0))
  {
    max_items = 8000003;
  }

  if (max_segments < 0)
  {
    max_segments = 5003;
  }

  std::unique_ptr h_keys(new KeyT[max_items]);

  // Test trivial problems sizes
  h_keys[0] = static_cast(42);
  TestNumItems(h_keys.get(), 0, 0, false);
  TestNumItems(h_keys.get(), 1, 1, false);

  // entropy_reduction 0 = fully random bits, 6 = heavily biased toward zero.
  for (int entropy_reduction = 0; entropy_reduction <= 6; entropy_reduction += 6)
  {
    printf("\nTesting random %s keys with entropy reduction factor %d\n", typeid(KeyT).name(), entropy_reduction);
    fflush(stdout);
    InitializeKeyBits(RANDOM, h_keys.get(), max_items, entropy_reduction);
    TestSizes(h_keys.get(), max_items, max_segments, false);
  }

  if (cub::Traits::CATEGORY == cub::FLOATING_POINT)
  {
    printf("\nTesting random %s keys with some replaced with -0.0 or +0.0 \n", typeid(KeyT).name());
    fflush(stdout);
    // -0.0 must compare equal to +0.0 in the sort; a single size suffices.
    InitializeKeyBits(RANDOM_MINUS_PLUS_ZERO, h_keys.get(), max_items, 0);
    // This just tests +/- 0 handling -- don't need to test multiple sizes
    TestNumItems(h_keys.get(), max_items, max_segments, false);
  }

  // All-equal keys: stresses the degenerate single-bucket case.
  printf("\nTesting uniform %s keys\n", typeid(KeyT).name());
  fflush(stdout);
  InitializeKeyBits(UNIFORM, h_keys.get(), max_items, 0);
  TestSizes(h_keys.get(), max_items, max_segments, false);

  printf("\nTesting natural number %s keys\n", typeid(KeyT).name());
  fflush(stdout);
  InitializeKeyBits(INTEGER_SEED, h_keys.get(), max_items, 0);
  TestSizes(h_keys.get(), max_items, max_segments, false);

  // NOTE(review): WITH_PRE_SORTED was presumably a stripped template
  // parameter of TestGen -- confirm against upstream.
  if (WITH_PRE_SORTED)
  {
    // Presorting is only used for testing large input arrays.
    const std::size_t large_num_items = std::size_t(4350000007ull);

    // A conservative check for memory, as we don't know ValueT or whether
    // the overwrite is allowed until later.
    // For ValueT, the check is actually exact unless TEST_VALUE_TYPE == 3.
    if (!HasEnoughMemory(large_num_items, true))
    {
      printf("Skipping the permutation-based test due to insufficient device memory\n");
      return;
    }

    h_keys.reset(nullptr);
    // Explicitly free old buffer before allocating.
    // Allocate the >4-billion-item buffer only after freeing the old one, to
    // halve peak host memory.
    h_keys.reset(new KeyT[large_num_items]);

    printf("\nTesting pre-sorted and randomly permuted %s keys\n", typeid(KeyT).name());
    fflush(stdout);
    InitializeKeysSorted(h_keys.get(), large_num_items);
    fflush(stdout);
    TestSizes(h_keys.get(), large_num_items, max_segments, true);
    fflush(stdout);
  }
}

//---------------------------------------------------------------------
// Simple test
//---------------------------------------------------------------------

/**
 * Single-combination smoke test: generates keys with the given gen-mode,
 * builds the reference solution, and runs one backend/direction pair.
 * end_bit < 0 means "full key width".
 */
template < Backend BACKEND, typename KeyT, typename ValueT, bool IS_DESCENDING>
void Test(
    std::size_t num_items,
    int num_segments,
    GenMode gen_mode,
    int entropy_reduction,
    int begin_bit,
    int end_bit)
{
  // NOTE(review): comparison target of is_same stripped; presumably ValueT
  // vs. NullType -- confirm against upstream.
  const bool KEYS_ONLY = std::is_same::value;

  KeyT *h_keys = new KeyT[num_items];
  std::size_t *h_reference_ranks = NULL;
  KeyT *h_reference_keys = NULL;
  ValueT *h_values = NULL;
  ValueT *h_reference_values = NULL;
  size_t *h_segment_offsets = new std::size_t[num_segments + 1];

  std::size_t* d_segment_offsets = nullptr;
  CubDebugExit(g_allocator.DeviceAllocate((void**)&d_segment_offsets, sizeof(std::size_t) * (num_segments + 1)));

  if (end_bit < 0) end_bit = sizeof(KeyT) * 8;

  InitializeKeyBits(gen_mode, h_keys, num_items, entropy_reduction);
  InitializeSegments(num_items, num_segments, h_segment_offsets);
  CubDebugExit(cudaMemcpy(d_segment_offsets, h_segment_offsets, sizeof(std::size_t) * (num_segments + 1), cudaMemcpyHostToDevice));
  InitializeSolution( h_keys, num_items, num_segments, false, h_segment_offsets, begin_bit, end_bit, h_reference_ranks, h_reference_keys);

  if (!KEYS_ONLY)
  {
    h_values = new ValueT[num_items];
    h_reference_values = new ValueT[num_items];

    // Reference values are the inputs permuted by the reference ranks.
    for (std::size_t i = 0; i < num_items; ++i)
    {
      InitValue(INTEGER_SEED, h_values[i], i);
      InitValue(INTEGER_SEED, h_reference_values[i], h_reference_ranks[i]);
    }
  }
  // Ranks are only needed to build reference values; free them early.
  if (h_reference_ranks) delete[] h_reference_ranks;

  printf("\nTesting bits [%d,%d) of %s keys with gen-mode %d\n", begin_bit, end_bit, typeid(KeyT).name(), gen_mode);
  fflush(stdout);

  Test( h_keys, h_values, num_items,
       num_segments, d_segment_offsets, d_segment_offsets + 1, begin_bit, end_bit, h_reference_keys, h_reference_values);

  // Cleanup (raw new/delete is this test suite's house style).
  if (h_keys) delete[] h_keys;
  if (h_reference_keys) delete[] h_reference_keys;
  if (h_values) delete[] h_values;
  if (h_reference_values) delete[] h_reference_values;
  if (h_segment_offsets) delete[] h_segment_offsets;
  if (d_segment_offsets) CubDebugExit(g_allocator.DeviceFree(d_segment_offsets));
}

#if TEST_VALUE_TYPE == 0
/**
 * Verifies segmented sort when the segment offsets do not cover all items:
 * random segments are generated and only ~40% of them are kept, so items in
 * the gaps must come through untouched. Four random iterations.
 */
void TestUnspecifiedRanges()
{
  const std::size_t num_items = 1024 * 1024;
  const std::size_t max_segments = 42;
  const std::size_t avg_segment_size = num_items / max_segments;

  for (int iteration = 0; iteration < 4; iteration++)
  {
    thrust::host_vector h_offsets_begin;
    thrust::host_vector h_offsets_end;

    h_offsets_begin.reserve(max_segments + 1);
    h_offsets_end.reserve(max_segments + 1);

    {
      int offset = 0;

      for (std::size_t sid = 0; sid < max_segments; sid++)
      {
        const int segment_size = static_cast(RandomValue(avg_segment_size));
        // Keep a segment with ~40% probability; skipped ranges stay unsorted.
        const bool segment_is_utilized = segment_size > 0 && RandomValue(100) > 60;

        if (segment_is_utilized)
        {
          h_offsets_begin.push_back(offset);
          h_offsets_end.push_back(offset + segment_size);
        }

        offset += segment_size;
      }

      // Guarantee at least one segment so the sort has work to do.
      if (h_offsets_begin.empty())
      {
        h_offsets_begin.push_back(avg_segment_size);
        h_offsets_end.push_back(num_items);
      }
    }

    // Keys/values initialized in descending order (reversed sequence).
    thrust::device_vector keys(num_items);
    thrust::device_vector values(num_items);

    thrust::sequence(keys.rbegin(), keys.rend());
    thrust::sequence(values.rbegin(), values.rend());

    thrust::device_vector d_offsets_begin = h_offsets_begin;
    thrust::device_vector d_offsets_end = h_offsets_end;

    thrust::device_vector expected_keys = keys;
    thrust::device_vector expected_values = values;

    const int num_segments = static_cast(h_offsets_begin.size());

    thrust::device_vector result_keys = keys;
    thrust::device_vector result_values = values;

    // Build the expected output by sorting each kept segment with thrust.
    for (int sid = 0; sid < num_segments; sid++)
    {
      const int segment_begin = h_offsets_begin[sid];
      const int segment_end = h_offsets_end[sid];
      thrust::sort_by_key(expected_keys.begin() + segment_begin, expected_keys.begin() + segment_end, expected_values.begin() + segment_begin);
    }

    // --- DoubleBuffer overload ---------------------------------------------
    {
      cub::DoubleBuffer keys_buffer( thrust::raw_pointer_cast(keys.data()), thrust::raw_pointer_cast(result_keys.data()));
      cub::DoubleBuffer values_buffer( thrust::raw_pointer_cast(values.data()), thrust::raw_pointer_cast(result_values.data()));

      std::size_t temp_storage_bytes{};
      std::uint8_t *d_temp_storage{nullptr};

      // First call with null temp storage only queries temp_storage_bytes.
      CubDebugExit(cub::DeviceSegmentedRadixSort::SortPairs( d_temp_storage, temp_storage_bytes, keys_buffer, values_buffer, num_items, num_segments, thrust::raw_pointer_cast(d_offsets_begin.data()), thrust::raw_pointer_cast(d_offsets_end.data()), 0, sizeof(int) * 8));

      thrust::device_vector temp_storage(temp_storage_bytes);
      d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

      // Second call performs the sort.
      CubDebugExit(cub::DeviceSegmentedRadixSort::SortPairs( d_temp_storage, temp_storage_bytes, keys_buffer, values_buffer, num_items, num_segments, thrust::raw_pointer_cast(d_offsets_begin.data()), thrust::raw_pointer_cast(d_offsets_end.data()), 0, sizeof(int) * 8));

      // If the sort left its output in buffer 0 (the "keys"/"values" arrays),
      // copy each sorted segment into result_* so the comparison below always
      // reads from result_keys/result_values.
      for (int sid = 0; sid < num_segments; sid++)
      {
        const int segment_begin = h_offsets_begin[sid];
        const int segment_end = h_offsets_end[sid];

        if (keys_buffer.selector == 0)
        {
          thrust::copy( keys.begin() + segment_begin, keys.begin() + segment_end, result_keys.begin() + segment_begin);
        }

        if (values_buffer.selector == 0)
        {
          thrust::copy( values.begin() + segment_begin, values.begin() + segment_end, result_values.begin() + segment_begin);
        }
      }
    }

    AssertEquals(result_keys, expected_keys);
    AssertEquals(result_values, expected_values);

    // Reset inputs and repeat with the non-DoubleBuffer overload.
    thrust::sequence(keys.rbegin(), keys.rend());
    thrust::sequence(values.rbegin(), values.rend());

    result_keys = keys;
    result_values = values;

    {
      std::size_t temp_storage_bytes{};
      std::uint8_t *d_temp_storage{};

      CubDebugExit(cub::DeviceSegmentedRadixSort::SortPairs( d_temp_storage, temp_storage_bytes, thrust::raw_pointer_cast(keys.data()),
        thrust::raw_pointer_cast(result_keys.data()), thrust::raw_pointer_cast(values.data()), thrust::raw_pointer_cast(result_values.data()), num_items, num_segments, thrust::raw_pointer_cast(d_offsets_begin.data()), thrust::raw_pointer_cast(d_offsets_end.data()), 0, sizeof(int) * 8));

      thrust::device_vector temp_storage(temp_storage_bytes);
      d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

      // Second call performs the sort (separate in/out arrays, so items in
      // unspecified ranges are simply copied through by the sort).
      CubDebugExit(cub::DeviceSegmentedRadixSort::SortPairs( d_temp_storage, temp_storage_bytes, thrust::raw_pointer_cast(keys.data()), thrust::raw_pointer_cast(result_keys.data()), thrust::raw_pointer_cast(values.data()), thrust::raw_pointer_cast(result_values.data()), num_items, num_segments, thrust::raw_pointer_cast(d_offsets_begin.data()), thrust::raw_pointer_cast(d_offsets_end.data()), 0, sizeof(int) * 8));
    }

    AssertEquals(result_values, expected_values);
    AssertEquals(result_keys, expected_keys);
  }
}
#endif

#if TEST_KEY_BYTES == 4
// Following tests check that new decomposer API doesn't break old API.
// It's disabled because some compilers don't like implicit conversions, which
// is required for the test. Once we figure out how to temporarily enable conversion, we can
// re-enable the test.
#define ENABLING_CONVERSION_IS_FIGURED_OUT 0
#if ENABLING_CONVERSION_IS_FIGURED_OUT
// A type implicitly convertible to int -- used to check that bit arguments
// accept user-defined conversions.
struct bit_selector
{
  int bit;

  operator int() const { return bit; }
};

// Compile-only check: instantiates the four SortKeys/SortKeysDescending
// overload forms with mixed begin/end bit types (num_items == 0, null temp
// storage, so the calls only query temp storage and do no work).
template
void device_radix_sort_keys_allows_implicit_conversions_for_bits_helper(BeginBitT begin_bit, EndBitT end_bit, Ts...
args)
{
  const int num_items = 0;

  {
    std::size_t temp_storage_bytes = 0;
    std::uint8_t *d_temp_storage = nullptr;
    cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, args..., num_items, begin_bit, end_bit);
  }

  {
    std::size_t temp_storage_bytes = 0;
    std::uint8_t *d_temp_storage = nullptr;
    cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, args..., num_items, begin_bit);
  }

  {
    std::size_t temp_storage_bytes = 0;
    std::uint8_t *d_temp_storage = nullptr;
    cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, args..., num_items, begin_bit, end_bit);
  }

  {
    std::size_t temp_storage_bytes = 0;
    std::uint8_t *d_temp_storage = nullptr;
    cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, args..., num_items, begin_bit);
  }
}

// Pairs analogue of the helper above: exercises SortPairs and
// SortPairsDescending with and without an explicit end_bit.
template
void device_radix_sort_pairs_allows_implicit_conversions_for_bits_helper(BeginBitT begin_bit, EndBitT end_bit, Ts... args)
{
  const int num_items = 0;

  {
    std::size_t temp_storage_bytes = 0;
    std::uint8_t *d_temp_storage = nullptr;
    cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, args..., num_items, begin_bit, end_bit);
  }

  {
    std::size_t temp_storage_bytes = 0;
    std::uint8_t *d_temp_storage = nullptr;
    cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, args..., num_items, begin_bit);
  }

  {
    std::size_t temp_storage_bytes = 0;
    std::uint8_t *d_temp_storage = nullptr;
    cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, args..., num_items, begin_bit, end_bit);
  }

  {
    std::size_t temp_storage_bytes = 0;
    std::uint8_t *d_temp_storage = nullptr;
    cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, args..., num_items, begin_bit);
  }
}

// Runs both helpers for one (BeginBitT, EndBitT) pair, with raw-pointer and
// DoubleBuffer argument forms. Null device pointers are fine: num_items == 0.
template
void device_radix_sort_allows_implicit_conversions_for_bits(BeginBitT begin_bit, EndBitT end_bit)
{
  int *d_i_ptr = nullptr;

  device_radix_sort_keys_allows_implicit_conversions_for_bits_helper(begin_bit, end_bit, d_i_ptr, d_i_ptr);

  cub::DoubleBuffer keys(d_i_ptr, d_i_ptr);
  device_radix_sort_keys_allows_implicit_conversions_for_bits_helper(begin_bit, end_bit, keys);

#if TEST_VALUE_TYPE == 2
  unsigned int *d_u_ptr = nullptr;

  device_radix_sort_pairs_allows_implicit_conversions_for_bits_helper(begin_bit, end_bit, d_i_ptr, d_i_ptr, d_u_ptr, d_u_ptr);

  cub::DoubleBuffer pairs(d_u_ptr, d_u_ptr);

  device_radix_sort_pairs_allows_implicit_conversions_for_bits_helper(begin_bit, end_bit, keys, pairs);
#endif
}

// Drives the check above for the full 3x3 cross product of bit-argument
// types: int, long long int, and the user-defined bit_selector.
void device_radix_sort_allows_implicit_conversions_for_bits()
{
  int begin_i = 0;
  long long int begin_lli = 0;
  bit_selector begin_bs{0};

  int end_i = 2;
  long long int end_lli = 2;
  bit_selector end_bs{2};

  device_radix_sort_allows_implicit_conversions_for_bits(begin_i, end_i);
  device_radix_sort_allows_implicit_conversions_for_bits(begin_i, end_lli);
  device_radix_sort_allows_implicit_conversions_for_bits(begin_i, end_bs);
  device_radix_sort_allows_implicit_conversions_for_bits(begin_lli, end_i);
  device_radix_sort_allows_implicit_conversions_for_bits(begin_lli, end_lli);
  device_radix_sort_allows_implicit_conversions_for_bits(begin_lli, end_bs);
  device_radix_sort_allows_implicit_conversions_for_bits(begin_bs, end_i);
  device_radix_sort_allows_implicit_conversions_for_bits(begin_bs, end_lli);
  device_radix_sort_allows_implicit_conversions_for_bits(begin_bs, end_bs);
}
#endif // ENABLING_CONVERSION_IS_FIGURED_OUT
#endif // TEST_KEY_BYTES == 4

//---------------------------------------------------------------------
// Main
//---------------------------------------------------------------------

/**
 * Main
 *
 * Parses --n (item count), --s (segment count), --i (timing iterations),
 * --device, and --v (verbose), then runs the TestGen matrix selected by the
 * TEST_KEY_BYTES / TEST_VALUE_TYPE / TEST_CDP build parameters.
 */
int main(int argc, char** argv)
{
  // Sentinels: ~0 items / -1 segments mean "use TestGen's defaults".
  std::size_t num_items = ~std::size_t(0);
  int num_segments = -1;

  // Initialize command line
  CommandLineArgs args(argc, argv);
  g_verbose = args.CheckCmdLineFlag("v");
  args.GetCmdLineArgument("n", num_items);
  args.GetCmdLineArgument("s", num_segments);
  args.GetCmdLineArgument("i", g_timing_iterations);

  // Print usage
  if (args.CheckCmdLineFlag("help"))
  {
    printf("%s "
           "[--n= "
           "[--s= "
           "[--i= "
           "[--device=] "
           "[--v] "
           "\n",
           argv[0]);
    exit(0);
  }

  // Initialize device
  CubDebugExit(args.DeviceInit());

  // %PARAM% directives below drive the build system's test cross product.
  // %PARAM% TEST_CDP cdp 0:1
  // %PARAM% TEST_KEY_BYTES bytes 1:2:4:8:16
  // %PARAM% TEST_VALUE_TYPE pairs 0:1:2:3
  //           0->Keys only
  //           1->uchar
  //           2->uint
  //           3->[ull,TestBar] (TestBar is cheap to build, included here to
  //              reduce total number of targets)

  // To reduce testing time, some key types are only tested when not
  // testing pairs:
#if TEST_VALUE_TYPE == 0
#define TEST_EXTENDED_KEY_TYPES
#endif

  // Compile/run thorough tests
  // NOTE(review): the key-type/flag template arguments of the TestGen calls
  // were stripped by the sanitizer -- confirm each against upstream.
#if TEST_KEY_BYTES == 1

  TestGen (num_items, num_segments);

#ifdef TEST_EXTENDED_KEY_TYPES
  TestGen (num_items, num_segments);
  TestGen (num_items, num_segments);
  TestGen (num_items, num_segments);
#endif // TEST_EXTENDED_KEY_TYPES

#elif TEST_KEY_BYTES == 2

  TestGen (num_items, num_segments);

#ifdef TEST_EXTENDED_KEY_TYPES
  TestGen (num_items, num_segments);

#if !_NVHPC_CUDA
  TestGen (num_items, num_segments);
#endif // CTK >= 9

#if !_NVHPC_CUDA
#if !defined(__ICC)
  // Fails with `-0 != 0` with ICC for unknown reasons. See #333.
  TestGen (num_items, num_segments);
#endif // !ICC
#endif // CTK >= 11
#endif // TEST_EXTENDED_KEY_TYPES

#elif TEST_KEY_BYTES == 4

  TestGen (num_items, num_segments);

#if TEST_VALUE_TYPE == 0
  TestUnspecifiedRanges();
#endif

#if ENABLING_CONVERSION_IS_FIGURED_OUT
  device_radix_sort_allows_implicit_conversions_for_bits();
#endif

#ifdef TEST_EXTENDED_KEY_TYPES
  TestGen (num_items, num_segments);
  TestGen (num_items, num_segments);
#endif // TEST_EXTENDED_KEY_TYPES

#elif TEST_KEY_BYTES == 8

  TestGen (num_items, num_segments);

#ifdef TEST_EXTENDED_KEY_TYPES
  TestGen (num_items, num_segments);
  TestGen(num_items, num_segments);
#endif // TEST_EXTENDED_KEY_TYPES

#elif TEST_KEY_BYTES == 16

#if CUB_IS_INT128_ENABLED
  TestGen<__int128_t, false>(num_items, num_segments);
  TestGen<__uint128_t, false>(num_items, num_segments);
#else
  // Fix unused static function for MSVC
  BackendToString(CUB);
#endif

#endif // TEST_KEY_BYTES switch

  return 0;
}