| /****************************************************************************** | |
| * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. | |
| * | |
| * Redistribution and use in source and binary forms, with or without | |
| * modification, are permitted provided that the following conditions are met: | |
| * * Redistributions of source code must retain the above copyright | |
| * notice, this list of conditions and the following disclaimer. | |
| * * Redistributions in binary form must reproduce the above copyright | |
| * notice, this list of conditions and the following disclaimer in the | |
| * documentation and/or other materials provided with the distribution. | |
| * * Neither the name of the NVIDIA CORPORATION nor the | |
| * names of its contributors may be used to endorse or promote products | |
| * derived from this software without specific prior written permission. | |
| * | |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND | |
| * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
| * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
| * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY | |
| * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES | |
| * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |
| * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND | |
| * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | |
| * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
| * | |
| ******************************************************************************/ | |
| #include <cub/device/device_copy.cuh> | |
| #include <cub/util_ptx.cuh> | |
| #include <thrust/device_vector.h> | |
| #include <thrust/host_vector.h> | |
| #include <thrust/iterator/constant_iterator.h> | |
| #include <thrust/iterator/transform_iterator.h> | |
| #include <thrust/iterator/transform_output_iterator.h> | |
| #include <thrust/logical.h> | |
| #include <thrust/sequence.h> | |
| #include <thrust/tuple.h> | |
| #include <algorithm> | |
| #include <cstdint> | |
| #include <limits> | |
| #include <numeric> | |
| #include <random> | |
| #include <type_traits> | |
| #include <vector> | |
| #include "test_util.h" | |
| /** | |
| * @brief Host-side random data generation | |
| */ | |
| template <typename T> | |
| void GenerateRandomData( | |
| T *rand_out, | |
| const std::size_t num_items, | |
| const T min_rand_val = std::numeric_limits<T>::min(), | |
| const T max_rand_val = std::numeric_limits<T>::max(), | |
| const std::uint_fast32_t seed = 320981U, | |
| typename std::enable_if<std::is_integral<T>::value && (sizeof(T) >= 2)>::type * = nullptr) | |
| { | |
| // initialize random number generator | |
| std::mt19937 rng(seed); | |
| std::uniform_int_distribution<T> uni_dist(min_rand_val, max_rand_val); | |
| // generate random numbers | |
| for (std::size_t i = 0; i < num_items; ++i) | |
| { | |
| rand_out[i] = uni_dist(rng); | |
| } | |
| } | |
| /** | |
| * @brief Used for generating a shuffled but cohesive sequence of output-range offsets for the | |
| * sequence of input-ranges. | |
| */ | |
| template <typename RangeOffsetT, typename ByteOffsetT, typename RangeSizeT> | |
| std::vector<ByteOffsetT> GetShuffledRangeOffsets(const std::vector<RangeSizeT> &range_sizes, | |
| const std::uint_fast32_t seed = 320981U) | |
| { | |
| RangeOffsetT num_ranges = static_cast<RangeOffsetT>(range_sizes.size()); | |
| // We're remapping the i-th range to pmt_idxs[i] | |
| std::mt19937 rng(seed); | |
| std::vector<RangeOffsetT> pmt_idxs(num_ranges); | |
| std::iota(pmt_idxs.begin(), pmt_idxs.end(), static_cast<RangeOffsetT>(0)); | |
| std::shuffle(std::begin(pmt_idxs), std::end(pmt_idxs), rng); | |
| // Compute the offsets using the new mapping | |
| ByteOffsetT running_offset = {}; | |
| std::vector<ByteOffsetT> permuted_offsets; | |
| permuted_offsets.reserve(num_ranges); | |
| for (auto permuted_range_idx : pmt_idxs) | |
| { | |
| permuted_offsets.emplace_back(running_offset); | |
| running_offset += range_sizes[permuted_range_idx]; | |
| } | |
| // Generate the scatter indexes that identify where each range was mapped to | |
| std::vector<RangeOffsetT> scatter_idxs(num_ranges); | |
| for (RangeOffsetT i = 0; i < num_ranges; i++) | |
| { | |
| scatter_idxs[pmt_idxs[i]] = i; | |
| } | |
| std::vector<ByteOffsetT> new_offsets(num_ranges); | |
| for (RangeOffsetT i = 0; i < num_ranges; i++) | |
| { | |
| new_offsets[i] = permuted_offsets[scatter_idxs[i]]; | |
| } | |
| return new_offsets; | |
| } | |
| template <size_t n, typename... T> | |
| typename std::enable_if<n >= thrust::tuple_size<thrust::tuple<T...>>::value>::type | |
| print_tuple(std::ostream &, const thrust::tuple<T...> &) | |
| {} | |
| template <size_t n, typename... T> | |
| typename std::enable_if<n + 1 <= thrust::tuple_size<thrust::tuple<T...>>::value>::type | |
| print_tuple(std::ostream &os, const thrust::tuple<T...> &tup) | |
| { | |
| if (n != 0) | |
| os << ", "; | |
| os << thrust::get<n>(tup); | |
| print_tuple<n + 1>(os, tup); | |
| } | |
| template <typename... T> | |
| std::ostream &operator<<(std::ostream &os, const thrust::tuple<T...> &tup) | |
| { | |
| os << "["; | |
| print_tuple<0>(os, tup); | |
| return os << "]"; | |
| } | |
| struct Identity | |
| { | |
| template <typename T> | |
| __host__ __device__ __forceinline__ T operator()(T x) | |
| { | |
| return x; | |
| } | |
| }; | |
| /** | |
| * @brief Function object class template that takes an offset and returns an iterator at the given | |
| * offset relative to a fixed base iterator. | |
| * | |
| * @tparam IteratorT The random-access iterator type to be returned | |
| */ | |
| template <typename IteratorT> | |
| struct OffsetToIteratorOp | |
| { | |
| template <typename OffsetT> | |
| __host__ __device__ __forceinline__ thrust::transform_output_iterator<Identity, IteratorT> | |
| operator()(OffsetT offset) const | |
| { | |
| return thrust::make_transform_output_iterator(base_it + offset, Identity{}); | |
| } | |
| IteratorT base_it; | |
| }; | |
| template <typename AtomicT> | |
| struct RepeatIndex | |
| { | |
| template <typename OffsetT> | |
| __host__ __device__ __forceinline__ thrust::constant_iterator<AtomicT> operator()(OffsetT i) | |
| { | |
| return thrust::constant_iterator<AtomicT>(static_cast<AtomicT>(i)); | |
| } | |
| }; | |
| enum class TestDataGen | |
| { | |
| // Random offsets into a data segment | |
| RANDOM, | |
| // Ranges cohesively reside next to each other | |
| CONSECUTIVE | |
| }; | |
| /** | |
| * @brief | |
| * | |
| * @tparam AtomicT The type of the elements being copied | |
| * @tparam RangeOffsetT Type used for indexing into the array of ranges | |
| * @tparam RangeSizeT Type used for indexing into individual elements of a range (large enough to | |
| * cover the max range size) | |
| * @tparam ByteOffsetT Type used for indexing into elements over *all* the ranges' sizes | |
| */ | |
| template <typename AtomicT, typename RangeOffsetT, typename RangeSizeT, typename ByteOffsetT> | |
| void RunTest(RangeOffsetT num_ranges, | |
| RangeSizeT min_range_size, | |
| RangeSizeT max_range_size, | |
| TestDataGen output_gen) | |
| { | |
| using SrcPtrT = AtomicT *; | |
| // Range segment data (their offsets and sizes) | |
| std::vector<RangeSizeT> h_range_sizes(num_ranges); | |
| thrust::counting_iterator<RangeOffsetT> iota(0); | |
| auto d_range_srcs = thrust::make_transform_iterator(iota, RepeatIndex<AtomicT>{}); | |
| std::vector<ByteOffsetT> h_offsets(num_ranges + 1); | |
| // Device-side resources | |
| AtomicT *d_out = nullptr; | |
| ByteOffsetT *d_offsets = nullptr; | |
| RangeSizeT *d_range_sizes = nullptr; | |
| void *d_temp_storage = nullptr; | |
| size_t temp_storage_bytes = 0; | |
| // Generate the range sizes | |
| GenerateRandomData(h_range_sizes.data(), h_range_sizes.size(), min_range_size, max_range_size); | |
| // Compute the total bytes to be copied | |
| std::partial_sum(h_range_sizes.begin(), h_range_sizes.end(), h_offsets.begin() + 1); | |
| const ByteOffsetT num_total_items = h_offsets.back(); | |
| const ByteOffsetT num_total_bytes = num_total_items * static_cast<ByteOffsetT>(sizeof(AtomicT)); | |
| h_offsets.pop_back(); | |
| constexpr int32_t shuffle_seed = 123241; | |
| // Shuffle output range source-offsets | |
| if (output_gen == TestDataGen::RANDOM) | |
| { | |
| h_offsets = GetShuffledRangeOffsets<RangeOffsetT, ByteOffsetT>(h_range_sizes, shuffle_seed); | |
| } | |
| // Initialize d_range_dsts | |
| OffsetToIteratorOp<AtomicT *> dst_transform_op{d_out}; | |
| auto d_range_dsts = thrust::make_transform_iterator(d_offsets, dst_transform_op); | |
| // Get temporary storage requirements | |
| CubDebugExit(cub::DeviceCopy::Batched(d_temp_storage, | |
| temp_storage_bytes, | |
| d_range_srcs, | |
| d_range_dsts, | |
| d_range_sizes, | |
| num_ranges)); | |
| // Check if there's sufficient device memory to run this test | |
| std::size_t total_required_mem = num_total_bytes + // | |
| (num_ranges * sizeof(d_offsets[0])) + // | |
| (num_ranges * sizeof(d_range_sizes[0])) + // | |
| temp_storage_bytes; // | |
| if (TotalGlobalMem() < total_required_mem) | |
| { | |
| std::cout | |
| << "Skipping the test due to insufficient device memory\n" // | |
| << " - Required: " << total_required_mem << " B, available: " << TotalGlobalMem() << " B\n" // | |
| << " - Skipped test instance: " // | |
| << " -> Min. range size: " << min_range_size << ", max. range size: " << max_range_size // | |
| << ", num_ranges: " << num_ranges // | |
| << ", out_gen: " << ((output_gen == TestDataGen::RANDOM) ? "SHFL" : "CONSECUTIVE"); | |
| return; | |
| } | |
| cudaStream_t stream; | |
| cudaStreamCreate(&stream); | |
| // Allocate device memory | |
| CubDebugExit(cudaMalloc(&d_out, num_total_bytes)); | |
| CubDebugExit(cudaMalloc(&d_offsets, num_ranges * sizeof(d_offsets[0]))); | |
| CubDebugExit(cudaMalloc(&d_range_sizes, num_ranges * sizeof(d_range_sizes[0]))); | |
| CubDebugExit(cudaMalloc(&d_temp_storage, temp_storage_bytes)); | |
| std::unique_ptr<AtomicT[]> h_out(new AtomicT[num_total_items]); | |
| std::unique_ptr<AtomicT[]> h_gpu_results(new AtomicT[num_total_items]); | |
| // Prepare d_range_dsts | |
| dst_transform_op.base_it = d_out; | |
| d_range_dsts = thrust::make_transform_iterator(d_offsets, dst_transform_op); | |
| // Prepare d_offsets | |
| CubDebugExit(cudaMemcpyAsync(d_offsets, | |
| h_offsets.data(), | |
| h_offsets.size() * sizeof(h_offsets[0]), | |
| cudaMemcpyHostToDevice, | |
| stream)); | |
| // Prepare d_range_sizes | |
| CubDebugExit(cudaMemcpyAsync(d_range_sizes, | |
| h_range_sizes.data(), | |
| h_range_sizes.size() * sizeof(h_range_sizes[0]), | |
| cudaMemcpyHostToDevice, | |
| stream)); | |
| // Invoke device-side algorithm being under test | |
| CubDebugExit(cub::DeviceCopy::Batched(d_temp_storage, | |
| temp_storage_bytes, | |
| d_range_srcs, | |
| d_range_dsts, | |
| d_range_sizes, | |
| num_ranges, | |
| stream)); | |
| // Copy back the output range | |
| CubDebugExit( | |
| cudaMemcpyAsync(h_gpu_results.get(), d_out, num_total_bytes, cudaMemcpyDeviceToHost, stream)); | |
| // Make sure results have been copied back to the host | |
| CubDebugExit(cudaStreamSynchronize(stream)); | |
| // CPU-side result generation for verification | |
| for (RangeOffsetT i = 0; i < num_ranges; i++) | |
| { | |
| std::copy(d_range_srcs[i], d_range_srcs[i] + h_range_sizes[i], h_out.get() + h_offsets[i]); | |
| } | |
| const auto it_pair = | |
| std::mismatch(h_gpu_results.get(), h_gpu_results.get() + num_total_items, h_out.get()); | |
| if (it_pair.first != h_gpu_results.get() + num_total_items) | |
| { | |
| std::cout << "Mismatch at index " << it_pair.first - h_gpu_results.get() | |
| << ", CPU vs. GPU: " << *it_pair.second << ", " << *it_pair.first << "\n"; | |
| } | |
| AssertEquals(it_pair.first, h_gpu_results.get() + num_total_items); | |
| CubDebugExit(cudaFree(d_out)); | |
| CubDebugExit(cudaFree(d_offsets)); | |
| CubDebugExit(cudaFree(d_range_sizes)); | |
| CubDebugExit(cudaFree(d_temp_storage)); | |
| } | |
| struct object_with_non_trivial_ctor | |
| { | |
| static constexpr int MAGIC = 923390; | |
| int field; | |
| int magic; | |
| __host__ __device__ object_with_non_trivial_ctor() | |
| { | |
| magic = MAGIC; | |
| field = 0; | |
| } | |
| __host__ __device__ object_with_non_trivial_ctor(int f) | |
| { | |
| magic = MAGIC; | |
| field = f; | |
| } | |
| object_with_non_trivial_ctor(const object_with_non_trivial_ctor &x) = default; | |
| __host__ __device__ object_with_non_trivial_ctor &operator=(const object_with_non_trivial_ctor &x) | |
| { | |
| if (magic == MAGIC) | |
| { | |
| field = x.field; | |
| } | |
| return *this; | |
| } | |
| }; | |
| void nontrivial_constructor_test() | |
| { | |
| const int num_buffers = 3; | |
| thrust::device_vector<object_with_non_trivial_ctor> a(num_buffers, | |
| object_with_non_trivial_ctor(99)); | |
| thrust::device_vector<object_with_non_trivial_ctor> b(num_buffers); | |
| using iterator = thrust::device_vector<object_with_non_trivial_ctor>::iterator; | |
| thrust::device_vector<iterator> a_iter{a.begin(), a.begin() + 1, a.begin() + 2}; | |
| thrust::device_vector<iterator> b_iter{b.begin(), b.begin() + 1, b.begin() + 2}; | |
| auto sizes = thrust::make_constant_iterator(1); | |
| std::uint8_t *d_temp_storage{}; | |
| std::size_t temp_storage_bytes{}; | |
| cub::DeviceCopy::Batched(d_temp_storage, | |
| temp_storage_bytes, | |
| a_iter.begin(), | |
| b_iter.begin(), | |
| sizes, | |
| num_buffers); | |
| thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes); | |
| d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); | |
| cub::DeviceCopy::Batched(d_temp_storage, | |
| temp_storage_bytes, | |
| a_iter.begin(), | |
| b_iter.begin(), | |
| sizes, | |
| num_buffers); | |
| for (int i = 0; i < 10; i++) | |
| { | |
| object_with_non_trivial_ctor ha(a[i]); | |
| object_with_non_trivial_ctor hb(b[i]); | |
| int ia = ha.field; | |
| int ib = hb.field; | |
| if (ia != ib) | |
| { | |
| std::cerr << "error: " << ia << " != " << ib << "\n"; | |
| } | |
| } | |
| } | |
| int main(int argc, char **argv) | |
| { | |
| CommandLineArgs args(argc, argv); | |
| // Initialize device | |
| CubDebugExit(args.DeviceInit()); | |
| //--------------------------------------------------------------------- | |
| // DeviceCopy::Batched tests | |
| //--------------------------------------------------------------------- | |
| // Run the nontrivial constructor test suggested by senior-zero | |
| nontrivial_constructor_test(); | |
| // Type used for indexing into the array of ranges | |
| using RangeOffsetT = uint32_t; | |
| // Type used for indexing into individual elements of a range (large enough to cover the max range | |
| using RangeSizeT = uint32_t; | |
| // Type used for indexing into bytes over *all* the ranges' sizes | |
| using ByteOffsetT = uint32_t; | |
| // Total number of bytes that are targeted to be copied on each run | |
| const RangeOffsetT target_copy_size = 64U << 20; | |
| // The number of randomly | |
| constexpr std::size_t num_rnd_range_tests = 32; | |
| // Each range's size will be random within this interval | |
| std::vector<std::pair<std::size_t, std::size_t>> size_ranges = {{0, 1}, | |
| {1, 2}, | |
| {0, 16}, | |
| {1, 32}, | |
| {1, 1024}, | |
| {1, 32 * 1024}, | |
| {128 * 1024, 256 * 1024}, | |
| {target_copy_size, | |
| target_copy_size}}; | |
| std::mt19937 rng(0); | |
| std::uniform_int_distribution<std::size_t> size_dist(1, 1000000); | |
| for (std::size_t i = 0; i < num_rnd_range_tests; i++) | |
| { | |
| auto range_begin = size_dist(rng); | |
| auto range_end = size_dist(rng); | |
| if (range_begin > range_end) | |
| { | |
| std::swap(range_begin, range_end); | |
| } | |
| size_ranges.push_back({range_begin, range_end}); | |
| } | |
| for (const auto &size_range : size_ranges) | |
| { | |
| // The most granular type being copied. | |
| using AtomicCopyT = int64_t; | |
| RangeSizeT min_range_size = | |
| static_cast<RangeSizeT>(CUB_ROUND_UP_NEAREST(size_range.first, sizeof(AtomicCopyT))); | |
| RangeSizeT max_range_size = static_cast<RangeSizeT>( | |
| CUB_ROUND_UP_NEAREST(size_range.second, static_cast<RangeSizeT>(sizeof(AtomicCopyT)))); | |
| double average_range_size = (min_range_size + max_range_size) / 2.0; | |
| RangeOffsetT target_num_ranges = | |
| static_cast<RangeOffsetT>(target_copy_size / average_range_size); | |
| // Run tests with output ranges being consecutive | |
| RunTest<AtomicCopyT, RangeOffsetT, RangeSizeT, ByteOffsetT>(target_num_ranges, | |
| min_range_size, | |
| max_range_size, | |
| TestDataGen::CONSECUTIVE); | |
| // Run tests with output ranges being randomly shuffled | |
| RunTest<AtomicCopyT, RangeOffsetT, RangeSizeT, ByteOffsetT>(target_num_ranges, | |
| min_range_size, | |
| max_range_size, | |
| TestDataGen::RANDOM); | |
| } | |
| for (const auto &size_range : size_ranges) | |
| { | |
| // The most granular type being copied. | |
| using AtomicCopyT = thrust::tuple<int64_t, int32_t, int16_t, char, char>; | |
| RangeSizeT min_range_size = | |
| static_cast<RangeSizeT>(CUB_ROUND_UP_NEAREST(size_range.first, sizeof(AtomicCopyT))); | |
| RangeSizeT max_range_size = static_cast<RangeSizeT>( | |
| CUB_ROUND_UP_NEAREST(size_range.second, static_cast<RangeSizeT>(sizeof(AtomicCopyT)))); | |
| double average_range_size = (min_range_size + max_range_size) / 2.0; | |
| RangeOffsetT target_num_ranges = | |
| static_cast<RangeOffsetT>(target_copy_size / average_range_size); | |
| // Run tests with output ranges being consecutive | |
| RunTest<AtomicCopyT, RangeOffsetT, RangeSizeT, ByteOffsetT>(target_num_ranges, | |
| min_range_size, | |
| max_range_size, | |
| TestDataGen::CONSECUTIVE); | |
| // Run tests with output ranges being randomly shuffled | |
| RunTest<AtomicCopyT, RangeOffsetT, RangeSizeT, ByteOffsetT>(target_num_ranges, | |
| min_range_size, | |
| max_range_size, | |
| TestDataGen::RANDOM); | |
| } | |
| //--------------------------------------------------------------------- | |
| // DeviceCopy::Batched test with 64-bit offsets | |
| //--------------------------------------------------------------------- | |
| using ByteOffset64T = uint64_t; | |
| using RangeSize64T = uint64_t; | |
| ByteOffset64T large_target_copy_size = | |
| static_cast<ByteOffset64T>(std::numeric_limits<uint32_t>::max()) + (128ULL * 1024ULL * 1024ULL); | |
| // Make sure min_range_size is in fact smaller than max range size | |
| constexpr RangeOffsetT single_range = 1; | |
| // Run tests with output ranges being consecutive | |
| RunTest<uint8_t, RangeOffsetT, RangeSize64T, ByteOffset64T>(single_range, | |
| large_target_copy_size, | |
| large_target_copy_size, | |
| TestDataGen::CONSECUTIVE); | |
| } | |