/****************************************************************************** * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "test_util.h" /** * @brief Host-side random data generation */ template void GenerateRandomData( T *rand_out, const std::size_t num_items, const T min_rand_val = std::numeric_limits::min(), const T max_rand_val = std::numeric_limits::max(), const std::uint_fast32_t seed = 320981U, typename std::enable_if::value && (sizeof(T) >= 2)>::type * = nullptr) { // initialize random number generator std::mt19937 rng(seed); std::uniform_int_distribution uni_dist(min_rand_val, max_rand_val); // generate random numbers for (std::size_t i = 0; i < num_items; ++i) { rand_out[i] = uni_dist(rng); } } template void __global__ BaselineBatchMemCpyKernel(InputBufferIt input_buffer_it, OutputBufferIt output_buffer_it, BufferSizeIteratorT buffer_sizes, BufferOffsetT num_buffers) { BufferOffsetT gtid = blockDim.x * blockIdx.x + threadIdx.x; if (gtid >= num_buffers) { return; } for (BufferOffsetT i = 0; i < buffer_sizes[gtid]; i++) { reinterpret_cast(output_buffer_it[gtid])[i] = reinterpret_cast(input_buffer_it[gtid])[i]; } } template void InvokeBaselineBatchMemcpy(InputBufferIt input_buffer_it, OutputBufferIt output_buffer_it, BufferSizeIteratorT buffer_sizes, uint32_t num_buffers) { constexpr uint32_t block_threads = 128U; uint32_t num_blocks = (num_buffers + block_threads - 1) / block_threads; BaselineBatchMemCpyKernel<<>>(input_buffer_it, output_buffer_it, buffer_sizes, num_buffers); } template void __global__ BaselineBatchMemCpyPerBlockKernel(InputBufferIt input_buffer_it, OutputBufferIt output_buffer_it, BufferSizeIteratorT buffer_sizes, BufferOffsetT num_buffers) { BufferOffsetT gbid = blockIdx.x; if (gbid >= num_buffers) { return; } for (BufferOffsetT i = threadIdx.x; i < buffer_sizes[gbid] / 8; i += blockDim.x) { reinterpret_cast(output_buffer_it[gbid])[i] = reinterpret_cast(input_buffer_it[gbid])[i]; } } /** * @brief Used for generating a shuffled but cohesive sequence of output-buffer offsets for the * sequence of input-buffers. */ template std::vector GetShuffledBufferOffsets(const std::vector &buffer_sizes, const std::uint_fast32_t seed = 320981U) { BufferOffsetT num_buffers = static_cast(buffer_sizes.size()); // We're remapping the i-th buffer to pmt_idxs[i] std::mt19937 rng(seed); std::vector pmt_idxs(num_buffers); std::iota(pmt_idxs.begin(), pmt_idxs.end(), static_cast(0)); std::shuffle(std::begin(pmt_idxs), std::end(pmt_idxs), rng); // Compute the offsets using the new mapping ByteOffsetT running_offset = {}; std::vector permuted_offsets; permuted_offsets.reserve(num_buffers); for (auto permuted_buffer_idx : pmt_idxs) { permuted_offsets.emplace_back(running_offset); running_offset += buffer_sizes[permuted_buffer_idx]; } // Generate the scatter indexes that identify where each buffer was mapped to std::vector scatter_idxs(num_buffers); for (BufferOffsetT i = 0; i < num_buffers; i++) { scatter_idxs[pmt_idxs[i]] = i; } std::vector new_offsets(num_buffers); for (BufferOffsetT i = 0; i < num_buffers; i++) { new_offsets[i] = permuted_offsets[scatter_idxs[i]]; } return new_offsets; } /** * @brief Function object class template that takes an offset and returns an iterator at the given * offset relative to a fixed base iterator. * * @tparam IteratorT The random-access iterator type to be returned */ template struct OffsetToPtrOp { template __host__ __device__ __forceinline__ IteratorT operator()(T offset) const { return base_it + offset; } IteratorT base_it; }; enum class TestDataGen { // Random offsets into a data segment RANDOM, // Buffers cohesively reside next to each other CONSECUTIVE }; /** * @brief * * @tparam AtomicT The most granular type being copied. All source and destination pointers will be * aligned based on this type, the number of bytes being copied will be an integer multiple of this * type's size * @tparam BufferOffsetT Type used for indexing into the array of buffers * @tparam BufferSizeT Type used for indexing into individual bytes of a buffer (large enough to * cover the max buffer size) * @tparam ByteOffsetT Type used for indexing into bytes over *all* the buffers' sizes */ template void RunTest(BufferOffsetT num_buffers, BufferSizeT min_buffer_size, BufferSizeT max_buffer_size, TestDataGen input_gen, TestDataGen output_gen) { using SrcPtrT = uint8_t *; // Buffer segment data (their offsets and sizes) std::vector h_buffer_sizes(num_buffers); std::vector h_buffer_src_offsets(num_buffers); std::vector h_buffer_dst_offsets(num_buffers); // Device-side resources void *d_in = nullptr; void *d_out = nullptr; ByteOffsetT *d_buffer_src_offsets = nullptr; ByteOffsetT *d_buffer_dst_offsets = nullptr; BufferSizeT *d_buffer_sizes = nullptr; void *d_temp_storage = nullptr; size_t temp_storage_bytes = 0; // Generate the buffer sizes GenerateRandomData(h_buffer_sizes.data(), h_buffer_sizes.size(), min_buffer_size, max_buffer_size); // Make sure buffer sizes are a multiple of the most granular unit (one AtomicT) being copied // (round down) for (BufferOffsetT i = 0; i < num_buffers; i++) { h_buffer_sizes[i] = (h_buffer_sizes[i] / sizeof(AtomicT)) * sizeof(AtomicT); } // Compute the total bytes to be copied ByteOffsetT num_total_bytes = 0; for (BufferOffsetT i = 0; i < num_buffers; i++) { if (input_gen == TestDataGen::CONSECUTIVE) { h_buffer_src_offsets[i] = num_total_bytes; } if (output_gen == TestDataGen::CONSECUTIVE) { h_buffer_dst_offsets[i] = num_total_bytes; } num_total_bytes += h_buffer_sizes[i]; } // Shuffle input buffer source-offsets std::uint_fast32_t shuffle_seed = 320981U; if (input_gen == TestDataGen::RANDOM) { h_buffer_src_offsets = GetShuffledBufferOffsets(h_buffer_sizes, shuffle_seed); shuffle_seed += 42; } // Shuffle input buffer source-offsets if (output_gen == TestDataGen::RANDOM) { h_buffer_dst_offsets = GetShuffledBufferOffsets(h_buffer_sizes, shuffle_seed); } // Get temporary storage requirements CubDebugExit(cub::DeviceMemcpy::Batched(d_temp_storage, temp_storage_bytes, static_cast(nullptr), static_cast(nullptr), d_buffer_sizes, num_buffers)); // Check if there's sufficient device memory to run this test std::size_t total_required_mem = num_total_bytes + // num_total_bytes + // (num_buffers * sizeof(d_buffer_src_offsets[0])) + // (num_buffers * sizeof(d_buffer_dst_offsets[0])) + // (num_buffers * sizeof(d_buffer_sizes[0])) + // temp_storage_bytes; // if (TotalGlobalMem() < total_required_mem) { std::cout << "Skipping the test due to insufficient device memory\n" // << " - Required: " << total_required_mem << " B, available: " << TotalGlobalMem() << " B\n" // << " - Skipped test instance: " // << " -> Min. buffer size: " << min_buffer_size << ", max. buffer size: " << max_buffer_size // << ", num_buffers: " << num_buffers // << ", in_gen: " << ((input_gen == TestDataGen::RANDOM) ? "SHFL" : "CONSECUTIVE") // << ", out_gen: " << ((output_gen == TestDataGen::RANDOM) ? "SHFL" : "CONSECUTIVE"); return; } cudaEvent_t events[2]; cudaEventCreate(&events[0]); cudaEventCreate(&events[1]); cudaStream_t stream; cudaStreamCreate(&stream); // Allocate device memory CubDebugExit(cudaMalloc(&d_in, num_total_bytes)); CubDebugExit(cudaMalloc(&d_out, num_total_bytes)); CubDebugExit(cudaMalloc(&d_buffer_src_offsets, num_buffers * sizeof(d_buffer_src_offsets[0]))); CubDebugExit(cudaMalloc(&d_buffer_dst_offsets, num_buffers * sizeof(d_buffer_dst_offsets[0]))); CubDebugExit(cudaMalloc(&d_buffer_sizes, num_buffers * sizeof(d_buffer_sizes[0]))); CubDebugExit(cudaMalloc(&d_temp_storage, temp_storage_bytes)); // Populate the data source with random data using RandomInitAliasT = uint16_t; std::size_t num_aliased_factor = sizeof(RandomInitAliasT) / sizeof(uint8_t); std::size_t num_aliased_units = CUB_QUOTIENT_CEILING(num_total_bytes, num_aliased_factor); std::unique_ptr h_in(new uint8_t[num_aliased_units * num_aliased_factor]); std::unique_ptr h_out(new uint8_t[num_total_bytes]); std::unique_ptr h_gpu_results(new uint8_t[num_total_bytes]); // Generate random offsets into the random-bits data buffer GenerateRandomData(reinterpret_cast(h_in.get()), num_aliased_units); // Prepare d_buffer_srcs OffsetToPtrOp src_transform_op{static_cast(d_in)}; cub::TransformInputIterator, ByteOffsetT *> d_buffer_srcs( d_buffer_src_offsets, src_transform_op); // Prepare d_buffer_dsts OffsetToPtrOp dst_transform_op{static_cast(d_out)}; cub::TransformInputIterator, ByteOffsetT *> d_buffer_dsts( d_buffer_dst_offsets, dst_transform_op); // Prepare random data segment (which serves for the buffer sources) CubDebugExit(cudaMemcpyAsync(d_in, h_in.get(), num_total_bytes, cudaMemcpyHostToDevice, stream)); // Prepare d_buffer_src_offsets CubDebugExit(cudaMemcpyAsync(d_buffer_src_offsets, h_buffer_src_offsets.data(), h_buffer_src_offsets.size() * sizeof(h_buffer_src_offsets[0]), cudaMemcpyHostToDevice, stream)); // Prepare d_buffer_dst_offsets CubDebugExit(cudaMemcpyAsync(d_buffer_dst_offsets, h_buffer_dst_offsets.data(), h_buffer_dst_offsets.size() * sizeof(h_buffer_dst_offsets[0]), cudaMemcpyHostToDevice, stream)); // Prepare d_buffer_sizes CubDebugExit(cudaMemcpyAsync(d_buffer_sizes, h_buffer_sizes.data(), h_buffer_sizes.size() * sizeof(h_buffer_sizes[0]), cudaMemcpyHostToDevice, stream)); // Record event before algorithm cudaEventRecord(events[0], stream); // Invoke device-side algorithm being under test CubDebugExit(cub::DeviceMemcpy::Batched(d_temp_storage, temp_storage_bytes, d_buffer_srcs, d_buffer_dsts, d_buffer_sizes, num_buffers, stream)); // Record event after algorithm cudaEventRecord(events[1], stream); // Copy back the output buffer CubDebugExit( cudaMemcpyAsync(h_gpu_results.get(), d_out, num_total_bytes, cudaMemcpyDeviceToHost, stream)); // Make sure results have been copied back to the host CubDebugExit(cudaStreamSynchronize(stream)); // CPU-side result generation for verification for (BufferOffsetT i = 0; i < num_buffers; i++) { std::memcpy(h_out.get() + h_buffer_dst_offsets[i], h_in.get() + h_buffer_src_offsets[i], h_buffer_sizes[i]); } float duration = 0; cudaEventElapsedTime(&duration, events[0], events[1]); #ifdef CUB_TEST_BENCHMARK size_t stats_src_offsets = sizeof(ByteOffsetT) * num_buffers; size_t stats_dst_offsets = sizeof(ByteOffsetT) * num_buffers; size_t stats_sizes = sizeof(BufferSizeT) * num_buffers; size_t stats_data_copied = 2 * num_total_bytes; std::cout << "Min. buffer size: " << min_buffer_size << ", max. buffer size: " << max_buffer_size // << ", num_buffers: " << num_buffers // << ", in_gen: " << ((input_gen == TestDataGen::RANDOM) ? "SHFL" : "CONSECUTIVE") // << ", out_gen: " << ((output_gen == TestDataGen::RANDOM) ? "SHFL" : "CONSECUTIVE") // << ", src size: " << stats_src_offsets << ", dst size: " << stats_dst_offsets // << ", sizes size: " << stats_sizes << ", cpy_data_size: " << stats_data_copied // << ", total: " << (stats_src_offsets + stats_dst_offsets + stats_sizes + stats_data_copied) // << ", duration: " << duration // << ", BW: " << ((double)(stats_src_offsets + stats_dst_offsets + stats_sizes + stats_data_copied) / 1000000000.0) / (duration / 1000.0) << "GB/s \n"; #endif for (ByteOffsetT i = 0; i < num_total_bytes; i++) { if (h_gpu_results.get()[i] != h_out.get()[i]) { std::cout << "Mismatch at index " << i << ", CPU vs. GPU: " << static_cast(h_gpu_results.get()[i]) << ", " << static_cast(h_out.get()[i]) << "\n"; } AssertEquals(h_out.get()[i], h_gpu_results.get()[i]); } CubDebugExit(cudaFree(d_in)); CubDebugExit(cudaFree(d_out)); CubDebugExit(cudaFree(d_buffer_src_offsets)); CubDebugExit(cudaFree(d_buffer_dst_offsets)); CubDebugExit(cudaFree(d_buffer_sizes)); CubDebugExit(cudaFree(d_temp_storage)); } template __global__ void TestVectorizedCopyKernel(const void *d_in, void *d_out, ByteOffsetT copy_size) { cub::detail::VectorizedCopy(threadIdx.x, d_out, copy_size, d_in); } struct TupleMemberEqualityOp { template __host__ __device__ __forceinline__ bool operator()(T tuple) { return thrust::get<0>(tuple) == thrust::get<1>(tuple); } }; /** * @brief Tests the VectorizedCopy for various aligned and misaligned input and output pointers. * @tparam VectorT The vector type used for vectorized stores (i.e., one of uint4, uint2, uint32_t) */ template void TestVectorizedCopy() { constexpr uint32_t threads_per_block = 8; std::vector in_offsets{0, 1, sizeof(uint32_t) - 1}; std::vector out_offsets{0, 1, sizeof(VectorT) - 1}; std::vector copy_sizes{0, 1, sizeof(uint32_t), sizeof(VectorT), 2 * threads_per_block * sizeof(VectorT)}; for (auto copy_sizes_it = std::begin(copy_sizes); copy_sizes_it < std::end(copy_sizes); copy_sizes_it++) { for (auto in_offsets_it = std::begin(in_offsets); in_offsets_it < std::end(in_offsets); in_offsets_it++) { for (auto out_offsets_it = std::begin(out_offsets); out_offsets_it < std::end(out_offsets); out_offsets_it++) { std::size_t in_offset = *in_offsets_it; std::size_t out_offset = *out_offsets_it; std::size_t copy_size = *copy_sizes_it; // Prepare data const std::size_t alloc_size_in = in_offset + copy_size; const std::size_t alloc_size_out = out_offset + copy_size; thrust::device_vector data_in(alloc_size_in); thrust::device_vector data_out(alloc_size_out); thrust::sequence(data_in.begin(), data_in.end(), static_cast(0)); thrust::fill_n(data_out.begin(), alloc_size_out, static_cast(0x42)); auto d_in = thrust::raw_pointer_cast(data_in.data()); auto d_out = thrust::raw_pointer_cast(data_out.data()); TestVectorizedCopyKernel <<<1, threads_per_block>>>(d_in + in_offset, d_out + out_offset, static_cast(copy_size)); auto zip_it = thrust::make_zip_iterator(data_in.begin() + in_offset, data_out.begin() + out_offset); bool success = thrust::all_of(zip_it, zip_it + copy_size, TupleMemberEqualityOp{}); AssertTrue(success); } } } } template __global__ void TestBitPackedCounterKernel(uint32_t *bins, uint32_t *increments, uint32_t *counts_out, uint32_t num_items) { using BitPackedCounterT = cub::detail::BitPackedCounter; BitPackedCounterT counter{}; for (uint32_t i = 0; i < num_items; i++) { counter.Add(bins[i], increments[i]); } for (uint32_t i = 0; i < NUM_ITEMS; i++) { counts_out[i] = counter.Get(i); } } /** * @brief Tests BitPackedCounter that's used for computing the histogram of buffer sizes (i.e., * small, medium, large). */ template void TestBitPackedCounter(const std::uint_fast32_t seed = 320981U) { constexpr uint32_t min_increment = 0; constexpr uint32_t max_increment = 4; constexpr double avg_increment = static_cast(min_increment) + (static_cast(max_increment - min_increment) / 2.0); std::uint32_t num_increments = static_cast(static_cast(MAX_ITEM_VALUE * NUM_ITEMS) / avg_increment); // Test input data std::array reference_counters{}; thrust::host_vector h_bins(num_increments); thrust::host_vector h_increments(num_increments); // Generate random test input data GenerateRandomData(thrust::raw_pointer_cast(h_bins.data()), num_increments, 0U, NUM_ITEMS - 1U, seed); GenerateRandomData(thrust::raw_pointer_cast(h_increments.data()), num_increments, min_increment, max_increment, (seed + 17)); // Make sure test data does not overflow any of the counters for (std::size_t i = 0; i < num_increments; i++) { // New increment for this bin would overflow => zero this increment if (reference_counters[h_bins[i]] + h_increments[i] >= MAX_ITEM_VALUE) { h_increments[i] = 0; } else { reference_counters[h_bins[i]] += h_increments[i]; } } // Device memory thrust::device_vector bins_in(num_increments); thrust::device_vector increments_in(num_increments); thrust::device_vector counts_out(NUM_ITEMS); // Initialize device-side test data bins_in = h_bins; increments_in = h_increments; // Memory for GPU-generated results thrust::host_vector host_counts(num_increments); // Reset counters to arbitrary random value thrust::fill(counts_out.begin(), counts_out.end(), 814920U); // Run tests with densely bit-packed counters TestBitPackedCounterKernel <<<1, 1>>>(thrust::raw_pointer_cast(bins_in.data()), thrust::raw_pointer_cast(increments_in.data()), thrust::raw_pointer_cast(counts_out.data()), num_increments); // Result verification host_counts = counts_out; for (uint32_t i = 0; i < NUM_ITEMS; i++) { AssertEquals(reference_counters[i], host_counts[i]); } // Reset counters to arbitrary random value thrust::fill(counts_out.begin(), counts_out.end(), 814920U); // Run tests with bit-packed counters, where bit-count is a power-of-two TestBitPackedCounterKernel <<<1, 1>>>(thrust::raw_pointer_cast(bins_in.data()), thrust::raw_pointer_cast(increments_in.data()), thrust::raw_pointer_cast(counts_out.data()), num_increments); // Result verification host_counts = counts_out; for (uint32_t i = 0; i < NUM_ITEMS; i++) { AssertEquals(reference_counters[i], host_counts[i]); } } int main(int argc, char **argv) { CommandLineArgs args(argc, argv); // Initialize device CubDebugExit(args.DeviceInit()); //--------------------------------------------------------------------- // VectorizedCopy tests //--------------------------------------------------------------------- TestVectorizedCopy(); TestVectorizedCopy(); //--------------------------------------------------------------------- // BitPackedCounter tests //--------------------------------------------------------------------- TestBitPackedCounter<1, 1>(); TestBitPackedCounter<1, (0x01U << 16)>(); TestBitPackedCounter<4, 1>(); TestBitPackedCounter<4, 2>(); TestBitPackedCounter<4, 255>(); TestBitPackedCounter<4, 256>(); TestBitPackedCounter<8, 1024>(); TestBitPackedCounter<32, 1>(); TestBitPackedCounter<32, 256>(); //--------------------------------------------------------------------- // DeviceMemcpy::Batched tests //--------------------------------------------------------------------- // The most granular type being copied. Buffer's will be aligned and their size be an integer // multiple of this type using AtomicCopyT = uint8_t; // Type used for indexing into the array of buffers using BufferOffsetT = uint32_t; // Type used for indexing into individual bytes of a buffer (large enough to cover the max buffer using BufferSizeT = uint32_t; // Type used for indexing into bytes over *all* the buffers' sizes using ByteOffsetT = uint32_t; // Total number of bytes that are targeted to be copied on each run const BufferOffsetT target_copy_size = 64U << 20; // The number of randomly constexpr std::size_t num_rnd_buffer_range_tests = 32; // Each buffer's size will be random within this interval std::vector> buffer_size_ranges = {{0, 1}, {1, 2}, {0, 16}, {1, 32}, {1, 1024}, {1, 32 * 1024}, {128 * 1024, 256 * 1024}, {target_copy_size, target_copy_size}}; std::mt19937 rng(0); std::uniform_int_distribution size_dist(1, 1000000); for (std::size_t i = 0; i < num_rnd_buffer_range_tests; i++) { auto range_begin = size_dist(rng); auto range_end = size_dist(rng); if (range_begin > range_end) { std::swap(range_begin, range_end); } buffer_size_ranges.push_back({range_begin, range_end}); } for (const auto &buffer_size_range : buffer_size_ranges) { BufferSizeT min_buffer_size = static_cast(CUB_ROUND_UP_NEAREST(buffer_size_range.first, sizeof(AtomicCopyT))); BufferSizeT max_buffer_size = static_cast(CUB_ROUND_UP_NEAREST(buffer_size_range.second, static_cast(sizeof(AtomicCopyT)))); double average_buffer_size = (min_buffer_size + max_buffer_size) / 2.0; BufferOffsetT target_num_buffers = static_cast(target_copy_size / average_buffer_size); // Run tests with input buffer being consecutive and output buffers being consecutive RunTest(target_num_buffers, min_buffer_size, max_buffer_size, TestDataGen::CONSECUTIVE, TestDataGen::CONSECUTIVE); // Run tests with input buffer being randomly shuffled and output buffers being randomly // shuffled RunTest(target_num_buffers, min_buffer_size, max_buffer_size, TestDataGen::RANDOM, TestDataGen::RANDOM); } //--------------------------------------------------------------------- // DeviceMemcpy::Batched test with 64-bit offsets //--------------------------------------------------------------------- using ByteOffset64T = uint64_t; using BufferSize64T = uint64_t; ByteOffset64T large_target_copy_size = static_cast(std::numeric_limits::max()) + (128ULL * 1024ULL * 1024ULL); // Make sure min_buffer_size is in fact smaller than max buffer size constexpr BufferOffsetT single_buffer = 1; // Run tests with input buffer being consecutive and output buffers being consecutive RunTest(single_buffer, large_target_copy_size, large_target_copy_size, TestDataGen::CONSECUTIVE, TestDataGen::CONSECUTIVE); }