/****************************************************************************** * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "test_util.h" /** * @brief Host-side random data generation */ template void GenerateRandomData( T *rand_out, const std::size_t num_items, const T min_rand_val = std::numeric_limits::min(), const T max_rand_val = std::numeric_limits::max(), const std::uint_fast32_t seed = 320981U, typename std::enable_if::value && (sizeof(T) >= 2)>::type * = nullptr) { // initialize random number generator std::mt19937 rng(seed); std::uniform_int_distribution uni_dist(min_rand_val, max_rand_val); // generate random numbers for (std::size_t i = 0; i < num_items; ++i) { rand_out[i] = uni_dist(rng); } } /** * @brief Used for generating a shuffled but cohesive sequence of output-range offsets for the * sequence of input-ranges. */ template std::vector GetShuffledRangeOffsets(const std::vector &range_sizes, const std::uint_fast32_t seed = 320981U) { RangeOffsetT num_ranges = static_cast(range_sizes.size()); // We're remapping the i-th range to pmt_idxs[i] std::mt19937 rng(seed); std::vector pmt_idxs(num_ranges); std::iota(pmt_idxs.begin(), pmt_idxs.end(), static_cast(0)); std::shuffle(std::begin(pmt_idxs), std::end(pmt_idxs), rng); // Compute the offsets using the new mapping ByteOffsetT running_offset = {}; std::vector permuted_offsets; permuted_offsets.reserve(num_ranges); for (auto permuted_range_idx : pmt_idxs) { permuted_offsets.emplace_back(running_offset); running_offset += range_sizes[permuted_range_idx]; } // Generate the scatter indexes that identify where each range was mapped to std::vector scatter_idxs(num_ranges); for (RangeOffsetT i = 0; i < num_ranges; i++) { scatter_idxs[pmt_idxs[i]] = i; } std::vector new_offsets(num_ranges); for (RangeOffsetT i = 0; i < num_ranges; i++) { new_offsets[i] = permuted_offsets[scatter_idxs[i]]; } return new_offsets; } template typename std::enable_if= thrust::tuple_size>::value>::type print_tuple(std::ostream &, const thrust::tuple &) {} template typename std::enable_if>::value>::type print_tuple(std::ostream &os, const thrust::tuple &tup) { if (n != 0) os << ", "; os << thrust::get(tup); print_tuple(os, tup); } template std::ostream &operator<<(std::ostream &os, const thrust::tuple &tup) { os << "["; print_tuple<0>(os, tup); return os << "]"; } struct Identity { template __host__ __device__ __forceinline__ T operator()(T x) { return x; } }; /** * @brief Function object class template that takes an offset and returns an iterator at the given * offset relative to a fixed base iterator. * * @tparam IteratorT The random-access iterator type to be returned */ template struct OffsetToIteratorOp { template __host__ __device__ __forceinline__ thrust::transform_output_iterator operator()(OffsetT offset) const { return thrust::make_transform_output_iterator(base_it + offset, Identity{}); } IteratorT base_it; }; template struct RepeatIndex { template __host__ __device__ __forceinline__ thrust::constant_iterator operator()(OffsetT i) { return thrust::constant_iterator(static_cast(i)); } }; enum class TestDataGen { // Random offsets into a data segment RANDOM, // Ranges cohesively reside next to each other CONSECUTIVE }; /** * @brief * * @tparam AtomicT The type of the elements being copied * @tparam RangeOffsetT Type used for indexing into the array of ranges * @tparam RangeSizeT Type used for indexing into individual elements of a range (large enough to * cover the max range size) * @tparam ByteOffsetT Type used for indexing into elements over *all* the ranges' sizes */ template void RunTest(RangeOffsetT num_ranges, RangeSizeT min_range_size, RangeSizeT max_range_size, TestDataGen output_gen) { using SrcPtrT = AtomicT *; // Range segment data (their offsets and sizes) std::vector h_range_sizes(num_ranges); thrust::counting_iterator iota(0); auto d_range_srcs = thrust::make_transform_iterator(iota, RepeatIndex{}); std::vector h_offsets(num_ranges + 1); // Device-side resources AtomicT *d_out = nullptr; ByteOffsetT *d_offsets = nullptr; RangeSizeT *d_range_sizes = nullptr; void *d_temp_storage = nullptr; size_t temp_storage_bytes = 0; // Generate the range sizes GenerateRandomData(h_range_sizes.data(), h_range_sizes.size(), min_range_size, max_range_size); // Compute the total bytes to be copied std::partial_sum(h_range_sizes.begin(), h_range_sizes.end(), h_offsets.begin() + 1); const ByteOffsetT num_total_items = h_offsets.back(); const ByteOffsetT num_total_bytes = num_total_items * static_cast(sizeof(AtomicT)); h_offsets.pop_back(); constexpr int32_t shuffle_seed = 123241; // Shuffle output range source-offsets if (output_gen == TestDataGen::RANDOM) { h_offsets = GetShuffledRangeOffsets(h_range_sizes, shuffle_seed); } // Initialize d_range_dsts OffsetToIteratorOp dst_transform_op{d_out}; auto d_range_dsts = thrust::make_transform_iterator(d_offsets, dst_transform_op); // Get temporary storage requirements CubDebugExit(cub::DeviceCopy::Batched(d_temp_storage, temp_storage_bytes, d_range_srcs, d_range_dsts, d_range_sizes, num_ranges)); // Check if there's sufficient device memory to run this test std::size_t total_required_mem = num_total_bytes + // (num_ranges * sizeof(d_offsets[0])) + // (num_ranges * sizeof(d_range_sizes[0])) + // temp_storage_bytes; // if (TotalGlobalMem() < total_required_mem) { std::cout << "Skipping the test due to insufficient device memory\n" // << " - Required: " << total_required_mem << " B, available: " << TotalGlobalMem() << " B\n" // << " - Skipped test instance: " // << " -> Min. range size: " << min_range_size << ", max. range size: " << max_range_size // << ", num_ranges: " << num_ranges // << ", out_gen: " << ((output_gen == TestDataGen::RANDOM) ? "SHFL" : "CONSECUTIVE"); return; } cudaStream_t stream; cudaStreamCreate(&stream); // Allocate device memory CubDebugExit(cudaMalloc(&d_out, num_total_bytes)); CubDebugExit(cudaMalloc(&d_offsets, num_ranges * sizeof(d_offsets[0]))); CubDebugExit(cudaMalloc(&d_range_sizes, num_ranges * sizeof(d_range_sizes[0]))); CubDebugExit(cudaMalloc(&d_temp_storage, temp_storage_bytes)); std::unique_ptr h_out(new AtomicT[num_total_items]); std::unique_ptr h_gpu_results(new AtomicT[num_total_items]); // Prepare d_range_dsts dst_transform_op.base_it = d_out; d_range_dsts = thrust::make_transform_iterator(d_offsets, dst_transform_op); // Prepare d_offsets CubDebugExit(cudaMemcpyAsync(d_offsets, h_offsets.data(), h_offsets.size() * sizeof(h_offsets[0]), cudaMemcpyHostToDevice, stream)); // Prepare d_range_sizes CubDebugExit(cudaMemcpyAsync(d_range_sizes, h_range_sizes.data(), h_range_sizes.size() * sizeof(h_range_sizes[0]), cudaMemcpyHostToDevice, stream)); // Invoke device-side algorithm being under test CubDebugExit(cub::DeviceCopy::Batched(d_temp_storage, temp_storage_bytes, d_range_srcs, d_range_dsts, d_range_sizes, num_ranges, stream)); // Copy back the output range CubDebugExit( cudaMemcpyAsync(h_gpu_results.get(), d_out, num_total_bytes, cudaMemcpyDeviceToHost, stream)); // Make sure results have been copied back to the host CubDebugExit(cudaStreamSynchronize(stream)); // CPU-side result generation for verification for (RangeOffsetT i = 0; i < num_ranges; i++) { std::copy(d_range_srcs[i], d_range_srcs[i] + h_range_sizes[i], h_out.get() + h_offsets[i]); } const auto it_pair = std::mismatch(h_gpu_results.get(), h_gpu_results.get() + num_total_items, h_out.get()); if (it_pair.first != h_gpu_results.get() + num_total_items) { std::cout << "Mismatch at index " << it_pair.first - h_gpu_results.get() << ", CPU vs. GPU: " << *it_pair.second << ", " << *it_pair.first << "\n"; } AssertEquals(it_pair.first, h_gpu_results.get() + num_total_items); CubDebugExit(cudaFree(d_out)); CubDebugExit(cudaFree(d_offsets)); CubDebugExit(cudaFree(d_range_sizes)); CubDebugExit(cudaFree(d_temp_storage)); } struct object_with_non_trivial_ctor { static constexpr int MAGIC = 923390; int field; int magic; __host__ __device__ object_with_non_trivial_ctor() { magic = MAGIC; field = 0; } __host__ __device__ object_with_non_trivial_ctor(int f) { magic = MAGIC; field = f; } object_with_non_trivial_ctor(const object_with_non_trivial_ctor &x) = default; __host__ __device__ object_with_non_trivial_ctor &operator=(const object_with_non_trivial_ctor &x) { if (magic == MAGIC) { field = x.field; } return *this; } }; void nontrivial_constructor_test() { const int num_buffers = 3; thrust::device_vector a(num_buffers, object_with_non_trivial_ctor(99)); thrust::device_vector b(num_buffers); using iterator = thrust::device_vector::iterator; thrust::device_vector a_iter{a.begin(), a.begin() + 1, a.begin() + 2}; thrust::device_vector b_iter{b.begin(), b.begin() + 1, b.begin() + 2}; auto sizes = thrust::make_constant_iterator(1); std::uint8_t *d_temp_storage{}; std::size_t temp_storage_bytes{}; cub::DeviceCopy::Batched(d_temp_storage, temp_storage_bytes, a_iter.begin(), b_iter.begin(), sizes, num_buffers); thrust::device_vector temp_storage(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); cub::DeviceCopy::Batched(d_temp_storage, temp_storage_bytes, a_iter.begin(), b_iter.begin(), sizes, num_buffers); for (int i = 0; i < 10; i++) { object_with_non_trivial_ctor ha(a[i]); object_with_non_trivial_ctor hb(b[i]); int ia = ha.field; int ib = hb.field; if (ia != ib) { std::cerr << "error: " << ia << " != " << ib << "\n"; } } } int main(int argc, char **argv) { CommandLineArgs args(argc, argv); // Initialize device CubDebugExit(args.DeviceInit()); //--------------------------------------------------------------------- // DeviceCopy::Batched tests //--------------------------------------------------------------------- // Run the nontrivial constructor test suggested by senior-zero nontrivial_constructor_test(); // Type used for indexing into the array of ranges using RangeOffsetT = uint32_t; // Type used for indexing into individual elements of a range (large enough to cover the max range using RangeSizeT = uint32_t; // Type used for indexing into bytes over *all* the ranges' sizes using ByteOffsetT = uint32_t; // Total number of bytes that are targeted to be copied on each run const RangeOffsetT target_copy_size = 64U << 20; // The number of randomly constexpr std::size_t num_rnd_range_tests = 32; // Each range's size will be random within this interval std::vector> size_ranges = {{0, 1}, {1, 2}, {0, 16}, {1, 32}, {1, 1024}, {1, 32 * 1024}, {128 * 1024, 256 * 1024}, {target_copy_size, target_copy_size}}; std::mt19937 rng(0); std::uniform_int_distribution size_dist(1, 1000000); for (std::size_t i = 0; i < num_rnd_range_tests; i++) { auto range_begin = size_dist(rng); auto range_end = size_dist(rng); if (range_begin > range_end) { std::swap(range_begin, range_end); } size_ranges.push_back({range_begin, range_end}); } for (const auto &size_range : size_ranges) { // The most granular type being copied. using AtomicCopyT = int64_t; RangeSizeT min_range_size = static_cast(CUB_ROUND_UP_NEAREST(size_range.first, sizeof(AtomicCopyT))); RangeSizeT max_range_size = static_cast( CUB_ROUND_UP_NEAREST(size_range.second, static_cast(sizeof(AtomicCopyT)))); double average_range_size = (min_range_size + max_range_size) / 2.0; RangeOffsetT target_num_ranges = static_cast(target_copy_size / average_range_size); // Run tests with output ranges being consecutive RunTest(target_num_ranges, min_range_size, max_range_size, TestDataGen::CONSECUTIVE); // Run tests with output ranges being randomly shuffled RunTest(target_num_ranges, min_range_size, max_range_size, TestDataGen::RANDOM); } for (const auto &size_range : size_ranges) { // The most granular type being copied. using AtomicCopyT = thrust::tuple; RangeSizeT min_range_size = static_cast(CUB_ROUND_UP_NEAREST(size_range.first, sizeof(AtomicCopyT))); RangeSizeT max_range_size = static_cast( CUB_ROUND_UP_NEAREST(size_range.second, static_cast(sizeof(AtomicCopyT)))); double average_range_size = (min_range_size + max_range_size) / 2.0; RangeOffsetT target_num_ranges = static_cast(target_copy_size / average_range_size); // Run tests with output ranges being consecutive RunTest(target_num_ranges, min_range_size, max_range_size, TestDataGen::CONSECUTIVE); // Run tests with output ranges being randomly shuffled RunTest(target_num_ranges, min_range_size, max_range_size, TestDataGen::RANDOM); } //--------------------------------------------------------------------- // DeviceCopy::Batched test with 64-bit offsets //--------------------------------------------------------------------- using ByteOffset64T = uint64_t; using RangeSize64T = uint64_t; ByteOffset64T large_target_copy_size = static_cast(std::numeric_limits::max()) + (128ULL * 1024ULL * 1024ULL); // Make sure min_range_size is in fact smaller than max range size constexpr RangeOffsetT single_range = 1; // Run tests with output ranges being consecutive RunTest(single_range, large_target_copy_size, large_target_copy_size, TestDataGen::CONSECUTIVE); }