| /****************************************************************************** | |
| * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. | |
| * | |
| * Redistribution and use in source and binary forms, with or without | |
| * modification, are permitted provided that the following conditions are met: | |
| * * Redistributions of source code must retain the above copyright | |
| * notice, this list of conditions and the following disclaimer. | |
| * * Redistributions in binary form must reproduce the above copyright | |
| * notice, this list of conditions and the following disclaimer in the | |
| * documentation and/or other materials provided with the distribution. | |
| * * Neither the name of the NVIDIA CORPORATION nor the | |
| * names of its contributors may be used to endorse or promote products | |
| * derived from this software without specific prior written permission. | |
| * | |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND | |
| * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
| * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
| * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY | |
| * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES | |
| * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |
| * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND | |
| * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | |
| * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
| * | |
| ******************************************************************************/ | |
| /** | |
| * \file | |
| * cub::AgentBatchMemcpy implements device-wide copying of a batch of device-accessible | |
| * source-buffers to device-accessible destination-buffers. | |
| */ | |
| #pragma once | |
| #include <cub/agent/single_pass_scan_operators.cuh> | |
| #include <cub/block/block_exchange.cuh> | |
| #include <cub/block/block_load.cuh> | |
| #include <cub/block/block_run_length_decode.cuh> | |
| #include <cub/block/block_scan.cuh> | |
| #include <cub/block/block_store.cuh> | |
| #include <cub/util_ptx.cuh> | |
| #include <cub/util_type.cuh> | |
| #include <cuda/std/type_traits> | |
| #include <cstdint> | |
| CUB_NAMESPACE_BEGIN | |
| namespace detail | |
| { | |
| template <bool PTR_IS_FOUR_BYTE_ALIGNED> | |
| __forceinline__ __device__ void LoadVectorAndFunnelShiftR(uint32_t const *aligned_ptr, | |
| uint32_t bit_shift, | |
| uint4 &data_out) | |
| { | |
| data_out = {aligned_ptr[0], aligned_ptr[1], aligned_ptr[2], aligned_ptr[3]}; | |
| if (!PTR_IS_FOUR_BYTE_ALIGNED) | |
| { | |
| uint32_t tail = aligned_ptr[4]; | |
| data_out.x = __funnelshift_r(data_out.x, data_out.y, bit_shift); | |
| data_out.y = __funnelshift_r(data_out.y, data_out.z, bit_shift); | |
| data_out.z = __funnelshift_r(data_out.z, data_out.w, bit_shift); | |
| data_out.w = __funnelshift_r(data_out.w, tail, bit_shift); | |
| } | |
| } | |
| template <bool PTR_IS_FOUR_BYTE_ALIGNED> | |
| __forceinline__ __device__ void LoadVectorAndFunnelShiftR(uint32_t const *aligned_ptr, | |
| uint32_t bit_shift, | |
| uint2 &data_out) | |
| { | |
| data_out = {aligned_ptr[0], aligned_ptr[1]}; | |
| if (!PTR_IS_FOUR_BYTE_ALIGNED) | |
| { | |
| uint32_t tail = aligned_ptr[2]; | |
| data_out.x = __funnelshift_r(data_out.x, data_out.y, bit_shift); | |
| data_out.y = __funnelshift_r(data_out.y, tail, bit_shift); | |
| } | |
| } | |
| template <bool PTR_IS_FOUR_BYTE_ALIGNED> | |
| __forceinline__ __device__ void LoadVectorAndFunnelShiftR(uint32_t const *aligned_ptr, | |
| uint32_t bit_shift, | |
| uint32_t &data_out) | |
| { | |
| data_out = aligned_ptr[0]; | |
| if (!PTR_IS_FOUR_BYTE_ALIGNED) | |
| { | |
| uint32_t tail = aligned_ptr[1]; | |
| data_out = __funnelshift_r(data_out, tail, bit_shift); | |
| } | |
| } | |
| /** | |
| * @brief Loads data from \p ptr into \p data_out without requiring \p ptr to be aligned. | |
| * @note If \p ptr isn't aligned to four bytes, the bytes from the last four-byte aligned address up | |
| * to \p ptr are loaded too (but dropped) and, hence, need to be device-accessible. Similarly, if | |
| * \p ptr isn't aligned to four bytes, the bytes from `(ptr + sizeof(VectorT))` up to the following | |
| * four-byte aligned address are loaded too (but dropped), and, hence, need to be device-accessible. | |
| * | |
| * @tparam VectorT The vector type used for vectorized stores (i.e., one of uint4, uint2, uint32_t) | |
| * @param ptr The pointer from which the data is supposed to be loaded | |
| * @param data_out The vector type that stores the data loaded from \p ptr | |
| */ | |
| template <typename VectorT> | |
| __forceinline__ __device__ void LoadVector(const char *ptr, VectorT &data_out) | |
| { | |
| const uint32_t offset = reinterpret_cast<std::uintptr_t>(ptr) % 4U; | |
| const uint32_t *aligned_ptr = reinterpret_cast<uint32_t const *>(ptr - offset); | |
| constexpr uint32_t bits_per_byte = 8U; | |
| const uint32_t bit_shift = offset * bits_per_byte; | |
| // If `ptr` is aligned to four bytes, we can perform a simple uint32_t-aliased load | |
| if (offset == 0) | |
| { | |
| LoadVectorAndFunnelShiftR<true>(aligned_ptr, bit_shift, data_out); | |
| } | |
| // Otherwise, we need to load extra bytes and perform funnel-shifting | |
| else | |
| { | |
| LoadVectorAndFunnelShiftR<false>(aligned_ptr, bit_shift, data_out); | |
| } | |
| } | |
| /** | |
| * @brief Helper data structure to hold information on the byte range for which we can safely | |
| * perform vectorized copies. | |
| * | |
| * @tparam VectorT The vector type used for vectorized stores (i.e., one of uint4, uint2, uint32_t) | |
| */ | |
| template <typename VectorT> | |
| struct PointerRange | |
| { | |
| VectorT *out_begin; | |
| VectorT *out_end; | |
| const char *in_begin; | |
| const char *in_end; | |
| }; | |
| /** | |
| * @brief Both `out_start_aligned` and `out_end_aligned` are indices into `out_ptr`. | |
| * `out_start_aligned` is the first VectorT-aligned memory location after `out_ptr + 3`. | |
| * `out_end_aligned` is the last VectorT-aligned memory location before `out_end - 4`, where out_end | |
| * corresponds to one past the last byte to be copied. Bytes between `[out_start_aligned, | |
| * out_end_aligned)` will be copied using VectorT. `out_ptr + 3` and `out_end - 4` are used instead | |
| * of `out_ptr` and `out_end` to avoid `LoadVector` reading beyond data boundaries. | |
| * | |
| * @tparam VectorT The vector type used for vectorized stores (i.e., one of uint4, uint2, uint32_t) | |
| * @tparam ByteOffsetT Type used to index the bytes within the buffers | |
| * @param in_begin Pointer to the beginning of the byte range that shall be copied | |
| * @param out_begin Pointer to the beginning of the byte range that shall be copied | |
| * @param num_bytes Number of bytes that shall be copied | |
| * @return The byte range that can safely be copied using vectorized stores of type VectorT | |
| */ | |
| template <typename VectorT, typename ByteOffsetT> | |
| __device__ __forceinline__ PointerRange<VectorT> GetAlignedPtrs(const void *in_begin, | |
| void *out_begin, | |
| ByteOffsetT num_bytes) | |
| { | |
| // Data type size used for vectorized stores | |
| constexpr size_t out_datatype_size = sizeof(VectorT); | |
| // Data type size used for type-aliased loads | |
| constexpr size_t in_datatype_size = sizeof(uint32_t); | |
| // char-aliased ptrs to simplify pointer arithmetic | |
| char *out_ptr = reinterpret_cast<char *>(out_begin); | |
| const char *in_ptr = reinterpret_cast<const char *>(in_begin); | |
| // Number of bytes between the first VectorT-aligned address at or before out_begin and out_begin | |
| const uint32_t alignment_offset = reinterpret_cast<std::uintptr_t>(out_ptr) % out_datatype_size; | |
| // The first VectorT-aligned address before (or at) out_begin | |
| char *out_chars_aligned = reinterpret_cast<char *>(out_ptr - alignment_offset); | |
| // The number of extra bytes preceding `in_ptr` that are loaded but dropped | |
| uint32_t in_extra_bytes = reinterpret_cast<std::uintptr_t>(in_ptr) % in_datatype_size; | |
| // The offset required by `LoadVector`: | |
| // If the input pointer is not aligned, we load data from the last aligned address preceding the | |
| // pointer. That is, loading up to (in_datatype_size-1) bytes before `in_ptr` | |
| uint32_t in_offset_req = in_extra_bytes; | |
| // Bytes after `out_chars_aligned` to the first VectorT-aligned address at or after `out_begin` | |
| uint32_t out_start_aligned = | |
| CUB_QUOTIENT_CEILING(in_offset_req + alignment_offset, out_datatype_size) * out_datatype_size; | |
| // Compute the beginning of the aligned ranges (output and input pointers) | |
| VectorT *out_aligned_begin = reinterpret_cast<VectorT *>(out_chars_aligned + out_start_aligned); | |
| const char *in_aligned_begin = in_ptr + (reinterpret_cast<char *>(out_aligned_begin) - out_ptr); | |
| // If the aligned range is not aligned for the input pointer, we load up to (in_datatype_size-1) | |
| // bytes after the last byte that is copied. That is, we always load four bytes up to the next | |
| // aligned input address at a time. E.g., if the last byte loaded is one byte past the last | |
| // aligned address we'll also load the three bytes after that byte. | |
| uint32_t in_extra_bytes_from_aligned = | |
| (reinterpret_cast<std::uintptr_t>(in_aligned_begin) % in_datatype_size); | |
| uint32_t in_end_padding_req = (in_datatype_size - in_extra_bytes_from_aligned) % in_datatype_size; | |
| // Bytes after `out_chars_aligned` to the last VectorT-aligned | |
| // address at (or before) `out_begin` + `num_bytes` | |
| uint32_t out_end_aligned{}; | |
| if (in_end_padding_req + alignment_offset > num_bytes) | |
| { | |
| out_end_aligned = out_start_aligned; | |
| } | |
| else | |
| { | |
| out_end_aligned = (num_bytes - in_end_padding_req + alignment_offset) / out_datatype_size * | |
| out_datatype_size; | |
| } | |
| VectorT *out_aligned_end = reinterpret_cast<VectorT *>(out_chars_aligned + out_end_aligned); | |
| const char *in_aligned_end = in_ptr + (reinterpret_cast<char *>(out_aligned_end) - out_ptr); | |
| return {out_aligned_begin, out_aligned_end, in_aligned_begin, in_aligned_end}; | |
| } | |
| /** | |
| * @brief Cooperatively copies \p num_bytes from \p src to \p dest using vectorized stores of type | |
| * \p VectorT for addresses within [dest, dest + num_bytes) that are aligned to \p VectorT. A | |
| * byte-wise copy is used for byte-ranges that are not aligned to \p VectorT. | |
| * | |
| * @tparam LOGICAL_WARP_SIZE The number of threads cooperaing to copy the data; all threads within | |
| * [0, `LOGICAL_WARP_SIZE`) must invoke this method with the same arguments | |
| * @tparam VectorT The vector type used for vectorized stores (i.e., one of uint4, uint2, uint32_t) | |
| * @tparam ByteOffsetT Type used to index the bytes within the buffers | |
| * @param thread_rank The thread rank within the group that cooperates to copy the data must be | |
| * within [0, `LOGICAL_WARP_SIZE`) | |
| * @param dest Pointer to the memory location to copy to | |
| * @param num_bytes Number of bytes to copy | |
| * @param src Pointer to the memory location to copy from | |
| */ | |
| template <int LOGICAL_WARP_SIZE, typename VectorT, typename ByteOffsetT> | |
| __device__ __forceinline__ void | |
| VectorizedCopy(int32_t thread_rank, void *dest, ByteOffsetT num_bytes, const void *src) | |
| { | |
| char *out_ptr = reinterpret_cast<char *>(dest); | |
| const char *in_ptr = reinterpret_cast<const char *>(src); | |
| // Gets the byte range that can safely be copied using vectorized stores of type VectorT | |
| auto aligned_range = GetAlignedPtrs<VectorT>(src, dest, num_bytes); | |
| // If byte range for which we can use vectorized copies is empty -> use byte-wise copies | |
| if (aligned_range.out_end <= aligned_range.out_begin) | |
| { | |
| for (ByteOffsetT ichar = thread_rank; ichar < num_bytes; ichar += LOGICAL_WARP_SIZE) | |
| { | |
| out_ptr[ichar] = in_ptr[ichar]; | |
| } | |
| } | |
| else | |
| { | |
| // Copy bytes in range `[dest, aligned_range.out_begin)` | |
| out_ptr += thread_rank; | |
| in_ptr += thread_rank; | |
| while (out_ptr < reinterpret_cast<char *>(aligned_range.out_begin)) | |
| { | |
| *out_ptr = *in_ptr; | |
| out_ptr += LOGICAL_WARP_SIZE; | |
| in_ptr += LOGICAL_WARP_SIZE; | |
| } | |
| // Copy bytes in range `[aligned_range.out_begin, aligned_range.out_end)` | |
| VectorT *aligned_range_begin = aligned_range.out_begin + thread_rank; | |
| const char *in_aligned_begin = aligned_range.in_begin + thread_rank * sizeof(VectorT); | |
| while (aligned_range_begin < aligned_range.out_end) | |
| { | |
| VectorT data_in; | |
| LoadVector(in_aligned_begin, data_in); | |
| *aligned_range_begin = data_in; | |
| in_aligned_begin += sizeof(VectorT) * LOGICAL_WARP_SIZE; | |
| aligned_range_begin += LOGICAL_WARP_SIZE; | |
| } | |
| // Copy bytes in range `[aligned_range.out_end, dest + num_bytes)`. | |
| out_ptr = reinterpret_cast<char *>(aligned_range.out_end) + thread_rank; | |
| in_ptr = aligned_range.in_end + thread_rank; | |
| while (out_ptr < reinterpret_cast<char *>(dest) + num_bytes) | |
| { | |
| *out_ptr = *in_ptr; | |
| out_ptr += LOGICAL_WARP_SIZE; | |
| in_ptr += LOGICAL_WARP_SIZE; | |
| } | |
| } | |
| } | |
| template <bool IsMemcpy, | |
| uint32_t LOGICAL_WARP_SIZE, | |
| typename InputBufferT, | |
| typename OutputBufferT, | |
| typename OffsetT, | |
| typename ::cuda::std::enable_if<IsMemcpy, int>::type = 0> | |
| __device__ __forceinline__ void copy_items(InputBufferT input_buffer, | |
| OutputBufferT output_buffer, | |
| OffsetT num_bytes, | |
| OffsetT offset = 0) | |
| { | |
| VectorizedCopy<LOGICAL_WARP_SIZE, uint4>(threadIdx.x % LOGICAL_WARP_SIZE, | |
| &reinterpret_cast<char *>(output_buffer)[offset], | |
| num_bytes, | |
| &reinterpret_cast<const char *>(input_buffer)[offset]); | |
| } | |
| template <bool IsMemcpy, | |
| uint32_t LOGICAL_WARP_SIZE, | |
| typename InputBufferT, | |
| typename OutputBufferT, | |
| typename OffsetT, | |
| typename ::cuda::std::enable_if<!IsMemcpy, int>::type = 0> | |
| __device__ __forceinline__ void copy_items(InputBufferT input_buffer, | |
| OutputBufferT output_buffer, | |
| OffsetT num_items, | |
| OffsetT offset = 0) | |
| { | |
| output_buffer += offset; | |
| input_buffer += offset; | |
| for (OffsetT i = threadIdx.x % LOGICAL_WARP_SIZE; i < num_items; i += LOGICAL_WARP_SIZE) | |
| { | |
| *(output_buffer + i) = *(input_buffer + i); | |
| } | |
| } | |
| template <bool IsMemcpy, | |
| typename AliasT, | |
| typename InputIt, | |
| typename OffsetT, | |
| typename ::cuda::std::enable_if<IsMemcpy, int>::type = 0> | |
| __device__ __forceinline__ AliasT read_item(InputIt buffer_src, OffsetT offset) | |
| { | |
| return *(reinterpret_cast<const AliasT *>(buffer_src) + offset); | |
| } | |
| template <bool IsMemcpy, | |
| typename AliasT, | |
| typename InputIt, | |
| typename OffsetT, | |
| typename ::cuda::std::enable_if<!IsMemcpy, int>::type = 0> | |
| __device__ __forceinline__ AliasT read_item(InputIt buffer_src, OffsetT offset) | |
| { | |
| return *(buffer_src + offset); | |
| } | |
| template <bool IsMemcpy, | |
| typename AliasT, | |
| typename OutputIt, | |
| typename OffsetT, | |
| typename ::cuda::std::enable_if<IsMemcpy, int>::type = 0> | |
| __device__ __forceinline__ void write_item(OutputIt buffer_dst, OffsetT offset, AliasT value) | |
| { | |
| *(reinterpret_cast<AliasT *>(buffer_dst) + offset) = value; | |
| } | |
| template <bool IsMemcpy, | |
| typename AliasT, | |
| typename OutputIt, | |
| typename OffsetT, | |
| typename ::cuda::std::enable_if<!IsMemcpy, int>::type = 0> | |
| __device__ __forceinline__ void write_item(OutputIt buffer_dst, OffsetT offset, AliasT value) | |
| { | |
| *(buffer_dst + offset) = value; | |
| } | |
| /** | |
| * @brief A helper class that allows threads to maintain multiple counters, where the counter that | |
| * shall be incremented can be addressed dynamically without incurring register spillage. | |
| * | |
| * @tparam NUM_ITEMS The number of counters to allocate | |
| * @tparam MAX_ITEM_VALUE The maximum count that must be supported. | |
| * @tparam PREFER_POW2_BITS Whether the number of bits to dedicate to each counter should be a | |
| * power-of-two. If enabled, this allows replacing integer multiplication with a bit-shift in | |
| * exchange for higher register pressure. | |
| * @tparam BackingUnitT The data type that is used to provide the bits of all the counters that | |
| * shall be allocated. | |
| */ | |
| template <uint32_t NUM_ITEMS, | |
| uint32_t MAX_ITEM_VALUE, | |
| bool PREFER_POW2_BITS, | |
| typename BackingUnitT = uint32_t> | |
| class BitPackedCounter | |
| { | |
| private: | |
| /// The minimum number of bits required to represent all values from [0, MAX_ITEM_VALUE] | |
| static constexpr uint32_t MIN_BITS_PER_ITEM = | |
| (MAX_ITEM_VALUE == 0U) ? 1U : cub::Log2<static_cast<int32_t>(MAX_ITEM_VALUE + 1U)>::VALUE; | |
| /// The number of bits allocated for each item. For pre-Volta, we prefer a power-of-2 here to | |
| /// have the compiler replace costly integer multiplication with bit-shifting. | |
| static constexpr uint32_t BITS_PER_ITEM = | |
| PREFER_POW2_BITS ? (0x01ULL << (cub::Log2<static_cast<int32_t>(MIN_BITS_PER_ITEM)>::VALUE)) | |
| : MIN_BITS_PER_ITEM; | |
| /// The number of bits that each backing data type can store | |
| static constexpr uint32_t NUM_BITS_PER_UNIT = sizeof(BackingUnitT) * 8; | |
| /// The number of items that each backing data type can store | |
| static constexpr uint32_t ITEMS_PER_UNIT = NUM_BITS_PER_UNIT / BITS_PER_ITEM; | |
| /// The number of bits the backing data type is actually making use of | |
| static constexpr uint32_t USED_BITS_PER_UNIT = ITEMS_PER_UNIT * BITS_PER_ITEM; | |
| /// The number of backing data types required to store the given number of items | |
| static constexpr uint32_t NUM_TOTAL_UNITS = CUB_QUOTIENT_CEILING(NUM_ITEMS, ITEMS_PER_UNIT); | |
| /// This is the net number of bit-storage provided by each unit (remainder bits are unused) | |
| static constexpr uint32_t UNIT_MASK = (USED_BITS_PER_UNIT >= (8U * sizeof(uint32_t))) | |
| ? 0xFFFFFFFF | |
| : (0x01U << USED_BITS_PER_UNIT) - 1; | |
| /// This is the bit-mask for each item | |
| static constexpr uint32_t ITEM_MASK = (BITS_PER_ITEM >= (8U * sizeof(uint32_t))) | |
| ? 0xFFFFFFFF | |
| : (0x01U << BITS_PER_ITEM) - 1; | |
| //------------------------------------------------------------------------------ | |
| // ACCESSORS | |
| //------------------------------------------------------------------------------ | |
| public: | |
| __device__ __forceinline__ uint32_t Get(uint32_t index) const | |
| { | |
| const uint32_t target_offset = index * BITS_PER_ITEM; | |
| uint32_t val = 0; | |
| #pragma unroll | |
| for (uint32_t i = 0; i < NUM_TOTAL_UNITS; ++i) | |
| { | |
| // In case the bit-offset of the counter at <index> is larger than the bit range of the | |
| // current unit, the bit_shift amount will be larger than the bits provided by this unit. As | |
| // C++'s bit-shift has undefined behaviour if the bits being shifted exceed the operand width, | |
| // we use the PTX instruction `shr` to make sure behaviour is well-defined. | |
| // Negative bit-shift amounts wrap around in unsigned integer math and are ultimately clamped. | |
| const uint32_t bit_shift = target_offset - i * USED_BITS_PER_UNIT; | |
| val |= detail::LogicShiftRight(data[i], bit_shift) & ITEM_MASK; | |
| } | |
| return val; | |
| } | |
| __device__ __forceinline__ void Add(uint32_t index, uint32_t value) | |
| { | |
| const uint32_t target_offset = index * BITS_PER_ITEM; | |
| #pragma unroll | |
| for (uint32_t i = 0; i < NUM_TOTAL_UNITS; ++i) | |
| { | |
| // In case the bit-offset of the counter at <index> is larger than the bit range of the | |
| // current unit, the bit_shift amount will be larger than the bits provided by this unit. As | |
| // C++'s bit-shift has undefined behaviour if the bits being shifted exceed the operand width, | |
| // we use the PTX instruction `shl` to make sure behaviour is well-defined. | |
| // Negative bit-shift amounts wrap around in unsigned integer math and are ultimately clamped. | |
| const uint32_t bit_shift = target_offset - i * USED_BITS_PER_UNIT; | |
| data[i] += detail::LogicShiftLeft(value, bit_shift) & UNIT_MASK; | |
| } | |
| } | |
| __device__ BitPackedCounter operator+(const BitPackedCounter &rhs) const | |
| { | |
| BitPackedCounter result; | |
| #pragma unroll | |
| for (uint32_t i = 0; i < NUM_TOTAL_UNITS; ++i) | |
| { | |
| result.data[i] = data[i] + rhs.data[i]; | |
| } | |
| return result; | |
| } | |
| //------------------------------------------------------------------------------ | |
| // MEMBER VARIABLES | |
| //------------------------------------------------------------------------------ | |
| private: | |
| BackingUnitT data[NUM_TOTAL_UNITS] = {}; | |
| }; | |
| /** | |
| * Parameterizable tuning policy type for AgentBatchMemcpy | |
| */ | |
| template <uint32_t _BLOCK_THREADS, | |
| uint32_t _BUFFERS_PER_THREAD, | |
| uint32_t _TLEV_BYTES_PER_THREAD, | |
| bool _PREFER_POW2_BITS, | |
| uint32_t _BLOCK_LEVEL_TILE_SIZE> | |
| struct AgentBatchMemcpyPolicy | |
| { | |
| /// Threads per thread block | |
| static constexpr uint32_t BLOCK_THREADS = _BLOCK_THREADS; | |
| /// Items per thread (per tile of input) | |
| static constexpr uint32_t BUFFERS_PER_THREAD = _BUFFERS_PER_THREAD; | |
| /// The number of bytes that each thread will work on with each iteration of reading in bytes | |
| /// from one or more | |
| // source-buffers and writing them out to the respective destination-buffers. | |
| static constexpr uint32_t TLEV_BYTES_PER_THREAD = _TLEV_BYTES_PER_THREAD; | |
| /// Whether the BitPackedCounter should prefer allocating a power-of-2 number of bits per | |
| /// counter | |
| static constexpr uint32_t PREFER_POW2_BITS = _PREFER_POW2_BITS; | |
| /// BLEV tile size granularity | |
| static constexpr uint32_t BLOCK_LEVEL_TILE_SIZE = _BLOCK_LEVEL_TILE_SIZE; | |
| }; | |
| template <typename AgentMemcpySmallBuffersPolicyT, | |
| typename InputBufferIt, | |
| typename OutputBufferIt, | |
| typename BufferSizeIteratorT, | |
| typename BufferOffsetT, | |
| typename BlevBufferSrcsOutItT, | |
| typename BlevBufferDstsOutItT, | |
| typename BlevBufferSizesOutItT, | |
| typename BlevBufferTileOffsetsOutItT, | |
| typename BlockOffsetT, | |
| typename BLevBufferOffsetTileState, | |
| typename BLevBlockOffsetTileState, | |
| bool IsMemcpy> | |
| class AgentBatchMemcpy | |
| { | |
| private: | |
| //--------------------------------------------------------------------- | |
| // CONFIGS / CONSTANTS | |
| //--------------------------------------------------------------------- | |
| // Tuning policy-based configurations | |
| static constexpr uint32_t BLOCK_THREADS = AgentMemcpySmallBuffersPolicyT::BLOCK_THREADS; | |
| static constexpr uint32_t BUFFERS_PER_THREAD = AgentMemcpySmallBuffersPolicyT::BUFFERS_PER_THREAD; | |
| static constexpr uint32_t TLEV_BYTES_PER_THREAD = | |
| AgentMemcpySmallBuffersPolicyT::TLEV_BYTES_PER_THREAD; | |
| static constexpr bool PREFER_POW2_BITS = AgentMemcpySmallBuffersPolicyT::PREFER_POW2_BITS; | |
| static constexpr uint32_t BLOCK_LEVEL_TILE_SIZE = | |
| AgentMemcpySmallBuffersPolicyT::BLOCK_LEVEL_TILE_SIZE; | |
| // Derived configs | |
| static constexpr uint32_t BUFFERS_PER_BLOCK = BUFFERS_PER_THREAD * BLOCK_THREADS; | |
| static constexpr uint32_t TLEV_BUFFERS_PER_THREAD = BUFFERS_PER_THREAD; | |
| static constexpr uint32_t BLEV_BUFFERS_PER_THREAD = BUFFERS_PER_THREAD; | |
| static constexpr uint32_t WARP_LEVEL_THRESHOLD = 128; | |
| static constexpr uint32_t BLOCK_LEVEL_THRESHOLD = 8 * 1024; | |
| static constexpr uint32_t BUFFER_STABLE_PARTITION = false; | |
| // Constants | |
| enum : uint32_t | |
| { | |
| TLEV_SIZE_CLASS = 0, | |
| WLEV_SIZE_CLASS, | |
| BLEV_SIZE_CLASS, | |
| NUM_SIZE_CLASSES, | |
| }; | |
| //--------------------------------------------------------------------- | |
| // TYPE DECLARATIONS | |
| //--------------------------------------------------------------------- | |
| /// Internal load/store type. For byte-wise memcpy, a single-byte type | |
| using AliasT = typename ::cuda::std::conditional< | |
| IsMemcpy, | |
| std::iterator_traits<char *>, | |
| std::iterator_traits<cub::detail::value_t<InputBufferIt>>>::type::value_type; | |
| /// Types of the input and output buffers | |
| using InputBufferT = cub::detail::value_t<InputBufferIt>; | |
| using OutputBufferT = cub::detail::value_t<OutputBufferIt>; | |
| /// Type that has to be sufficiently large to hold any of the buffers' sizes. | |
| /// The BufferSizeIteratorT's value type must be convertible to this type. | |
| using BufferSizeT = cub::detail::value_t<BufferSizeIteratorT>; | |
| /// Type used to index into the tile of buffers that this thread block is assigned to. | |
| using BlockBufferOffsetT = uint16_t; | |
| /// Internal type used to index into the bytes of and represent size of a TLEV buffer | |
| using TLevBufferSizeT = uint16_t; | |
| /** | |
| * @brief Helper struct to simplify BlockExchange within a single four-byte word | |
| */ | |
| struct ZippedTLevByteAssignment | |
| { | |
| // The buffer id within this tile | |
| BlockBufferOffsetT tile_buffer_id; | |
| // Byte-offset within that buffer | |
| TLevBufferSizeT buffer_byte_offset; | |
| }; | |
| /** | |
| * POD to keep track of <buffer_id, buffer_size> pairs after having partitioned this tile's | |
| * buffers by their size. | |
| */ | |
| struct BufferTuple | |
| { | |
| // Size is only valid (and relevant) for buffers that are use thread-level collaboration | |
| TLevBufferSizeT size; | |
| // The buffer id relativ to this tile (i.e., the buffer id within this tile) | |
| BlockBufferOffsetT buffer_id; | |
| }; | |
| // Load buffers in a striped arrangement if we do not want to performa a stable partitioning into | |
| // small, medium, and large buffers, otherwise load them in a blocked arrangement | |
| using BufferLoadT = | |
| BlockLoad<BufferSizeT, | |
| static_cast<int32_t>(BLOCK_THREADS), | |
| static_cast<int32_t>(BUFFERS_PER_THREAD), | |
| BUFFER_STABLE_PARTITION ? BLOCK_LOAD_WARP_TRANSPOSE : BLOCK_LOAD_STRIPED>; | |
| // A vectorized counter that will count the number of buffers that fall into each of the | |
| // size-classes. Where the size class representes the collaboration level that is required to | |
| // process a buffer. The collaboration level being either: | |
| //-> (1) TLEV (thread-level collaboration), requiring one or multiple threads but not a FULL warp | |
| // to collaborate | |
| //-> (2) WLEV (warp-level collaboration), requiring a full warp to collaborate on a buffer | |
| //-> (3) BLEV (block-level collaboration), requiring one or multiple thread blocks to collaborate | |
| // on a buffer */ | |
| using VectorizedSizeClassCounterT = | |
| BitPackedCounter<NUM_SIZE_CLASSES, BUFFERS_PER_BLOCK, PREFER_POW2_BITS>; | |
| // Block-level scan used to compute the write offsets | |
| using BlockSizeClassScanT = | |
| cub::BlockScan<VectorizedSizeClassCounterT, static_cast<int32_t>(BLOCK_THREADS)>; | |
| // | |
| using BlockBLevTileCountScanT = cub::BlockScan<BlockOffsetT, static_cast<int32_t>(BLOCK_THREADS)>; | |
| // Block-level run-length decode algorithm to evenly distribute work of all buffers requiring | |
| // thread-level collaboration | |
| using BlockRunLengthDecodeT = | |
| cub::BlockRunLengthDecode<BlockBufferOffsetT, | |
| static_cast<int32_t>(BLOCK_THREADS), | |
| static_cast<int32_t>(TLEV_BUFFERS_PER_THREAD), | |
| static_cast<int32_t>(TLEV_BYTES_PER_THREAD)>; | |
| using BlockExchangeTLevT = cub::BlockExchange<ZippedTLevByteAssignment, | |
| static_cast<int32_t>(BLOCK_THREADS), | |
| static_cast<int32_t>(TLEV_BYTES_PER_THREAD)>; | |
| using BLevBuffScanPrefixCallbackOpT = | |
| TilePrefixCallbackOp<BufferOffsetT, Sum, BLevBufferOffsetTileState>; | |
| using BLevBlockScanPrefixCallbackOpT = | |
| TilePrefixCallbackOp<BlockOffsetT, Sum, BLevBlockOffsetTileState>; | |
| //----------------------------------------------------------------------------- | |
| // SHARED MEMORY DECLARATIONS | |
| //----------------------------------------------------------------------------- | |
| struct _TempStorage | |
| { | |
| union | |
| { | |
| typename BufferLoadT::TempStorage load_storage; | |
| // Stage 1: histogram over the size classes in preparation for partitioning buffers by size | |
| typename BlockSizeClassScanT::TempStorage size_scan_storage; | |
| // Stage 2: Communicate the number ofer buffers requiring block-level collaboration | |
| typename BLevBuffScanPrefixCallbackOpT::TempStorage buffer_scan_callback; | |
| // Stage 3; batch memcpy buffers that require only thread-level collaboration | |
| struct | |
| { | |
| BufferTuple buffers_by_size_class[BUFFERS_PER_BLOCK]; | |
| // Stage 3.1: Write buffers requiring block-level collaboration to queue | |
| union | |
| { | |
| struct | |
| { | |
| typename BLevBlockScanPrefixCallbackOpT::TempStorage block_scan_callback; | |
| typename BlockBLevTileCountScanT::TempStorage block_scan_storage; | |
| } blev; | |
| // Stage 3.3: run-length decode & block exchange for tlev | |
| // rld_state needs to be persistent across loop iterations (RunLengthDecode calls) and, | |
| // hence, cannot alias block_exchange_storage | |
| struct | |
| { | |
| typename BlockRunLengthDecodeT::TempStorage rld_state; | |
| typename BlockExchangeTLevT::TempStorage block_exchange_storage; | |
| } tlev; | |
| }; | |
| } staged; | |
| }; | |
| BufferOffsetT blev_buffer_offset; | |
| }; | |
| //----------------------------------------------------------------------------- | |
| // PUBLIC TYPE MEMBERS | |
| //----------------------------------------------------------------------------- | |
| public: | |
| struct TempStorage : Uninitialized<_TempStorage> | |
| {}; | |
| //----------------------------------------------------------------------------- | |
| // PRIVATE MEMBER FUNCTIONS | |
| //----------------------------------------------------------------------------- | |
| private: | |
| /// Shared storage reference | |
| _TempStorage &temp_storage; | |
| /** | |
| * @brief Loads this tile's buffers' sizes, without any guards (i.e., out-of-bounds checks) | |
| */ | |
| __device__ __forceinline__ void | |
| LoadBufferSizesFullTile(BufferSizeIteratorT tile_buffer_sizes_it, | |
| BufferSizeT (&buffer_sizes)[BUFFERS_PER_THREAD]) | |
| { | |
| BufferLoadT(temp_storage.load_storage).Load(tile_buffer_sizes_it, buffer_sizes); | |
| } | |
| /** | |
| * @brief Loads this tile's buffers' sizes, making sure to read at most \p num_valid items. | |
| */ | |
| __device__ __forceinline__ void | |
| LoadBufferSizesPartialTile(BufferSizeIteratorT tile_buffer_sizes_it, | |
| BufferSizeT (&buffer_sizes)[BUFFERS_PER_THREAD], | |
| BufferOffsetT num_valid) | |
| { | |
| // Out-of-bounds buffer items are initialized to '0', so those buffers will simply be ignored | |
| // later on | |
| constexpr BufferSizeT OOB_DEFAULT_BUFFER_SIZE = 0U; | |
| BufferLoadT(temp_storage.load_storage) | |
| .Load(tile_buffer_sizes_it, buffer_sizes, num_valid, OOB_DEFAULT_BUFFER_SIZE); | |
| } | |
| /** | |
| * @brief Computes the histogram over the number of buffers belonging to each of the three | |
| * size-classes (TLEV, WLEV, BLEV). | |
| */ | |
| __device__ __forceinline__ VectorizedSizeClassCounterT | |
| GetBufferSizeClassHistogram(const BufferSizeT (&buffer_sizes)[BUFFERS_PER_THREAD]) | |
| { | |
| VectorizedSizeClassCounterT vectorized_counters{}; | |
| #pragma unroll | |
| for (uint32_t i = 0; i < BUFFERS_PER_THREAD; i++) | |
| { | |
| // Whether to increment ANY of the buffer size classes at all | |
| const uint32_t increment = buffer_sizes[i] > 0 ? 1U : 0U; | |
| // Identify the buffer's size class | |
| uint32_t buffer_size_class = 0; | |
| buffer_size_class += buffer_sizes[i] > WARP_LEVEL_THRESHOLD ? 1U : 0U; | |
| buffer_size_class += buffer_sizes[i] > BLOCK_LEVEL_THRESHOLD ? 1U : 0U; | |
| // Increment the count of the respective size class | |
| vectorized_counters.Add(buffer_size_class, increment); | |
| } | |
| return vectorized_counters; | |
| } | |
| /** | |
| * @brief Scatters the buffers into the respective buffer's size-class partition. | |
| */ | |
| __device__ __forceinline__ void | |
| PartitionBuffersBySize(const BufferSizeT (&buffer_sizes)[BUFFERS_PER_THREAD], | |
| VectorizedSizeClassCounterT &vectorized_offsets, | |
| BufferTuple (&buffers_by_size_class)[BUFFERS_PER_BLOCK]) | |
| { | |
| // If we intend to perform a stable partitioning, the thread's buffer are in a blocked | |
| // arrangement, otherwise they are in a striped arrangement | |
| BlockBufferOffsetT buffer_id = BUFFER_STABLE_PARTITION ? (BUFFERS_PER_THREAD * threadIdx.x) | |
| : (threadIdx.x); | |
| constexpr BlockBufferOffsetT BUFFER_STRIDE = BUFFER_STABLE_PARTITION | |
| ? static_cast<BlockBufferOffsetT>(1) | |
| : static_cast<BlockBufferOffsetT>(BLOCK_THREADS); | |
| #pragma unroll | |
| for (uint32_t i = 0; i < BUFFERS_PER_THREAD; i++) | |
| { | |
| if (buffer_sizes[i] > 0) | |
| { | |
| uint32_t buffer_size_class = 0; | |
| buffer_size_class += buffer_sizes[i] > WARP_LEVEL_THRESHOLD ? 1U : 0U; | |
| buffer_size_class += buffer_sizes[i] > BLOCK_LEVEL_THRESHOLD ? 1U : 0U; | |
| const uint32_t write_offset = vectorized_offsets.Get(buffer_size_class); | |
| buffers_by_size_class[write_offset] = {static_cast<TLevBufferSizeT>(buffer_sizes[i]), | |
| buffer_id}; | |
| vectorized_offsets.Add(buffer_size_class, 1U); | |
| } | |
| buffer_id += BUFFER_STRIDE; | |
| } | |
| } | |
| /** | |
| * @brief Read in all the buffers that require block-level collaboration and put them to a queue | |
| * that will get picked up in a separate, subsequent kernel. | |
| */ | |
| __device__ __forceinline__ void EnqueueBLEVBuffers(BufferTuple *buffers_by_size_class, | |
| InputBufferIt tile_buffer_srcs, | |
| OutputBufferIt tile_buffer_dsts, | |
| BufferSizeIteratorT tile_buffer_sizes, | |
| BlockBufferOffsetT num_blev_buffers, | |
| BufferOffsetT tile_buffer_offset, | |
| BufferOffsetT tile_id) | |
| { | |
| BlockOffsetT block_offset[BLEV_BUFFERS_PER_THREAD]; | |
| // Read in the BLEV buffer partition (i.e., the buffers that require block-level collaboration) | |
| uint32_t blev_buffer_offset = threadIdx.x * BLEV_BUFFERS_PER_THREAD; | |
| #pragma unroll | |
| for (uint32_t i = 0; i < BLEV_BUFFERS_PER_THREAD; i++) | |
| { | |
| if (blev_buffer_offset < num_blev_buffers) | |
| { | |
| BlockBufferOffsetT tile_buffer_id = buffers_by_size_class[blev_buffer_offset].buffer_id; | |
| block_offset[i] = CUB_QUOTIENT_CEILING(tile_buffer_sizes[tile_buffer_id], | |
| BLOCK_LEVEL_TILE_SIZE); | |
| } | |
| else | |
| { | |
| // Out-of-bounds buffers are assigned a tile count of '0' | |
| block_offset[i] = 0U; | |
| } | |
| blev_buffer_offset++; | |
| } | |
| if (tile_id == 0) | |
| { | |
| BlockOffsetT block_aggregate; | |
| BlockBLevTileCountScanT(temp_storage.staged.blev.block_scan_storage) | |
| .ExclusiveSum(block_offset, block_offset, block_aggregate); | |
| if (threadIdx.x == 0) | |
| { | |
| blev_block_scan_state.SetInclusive(0, block_aggregate); | |
| } | |
| } | |
| else | |
| { | |
| BLevBlockScanPrefixCallbackOpT blev_tile_prefix_op( | |
| blev_block_scan_state, | |
| temp_storage.staged.blev.block_scan_callback, | |
| Sum(), | |
| tile_id); | |
| BlockBLevTileCountScanT(temp_storage.staged.blev.block_scan_storage) | |
| .ExclusiveSum(block_offset, block_offset, blev_tile_prefix_op); | |
| } | |
| CTA_SYNC(); | |
| // Read in the BLEV buffer partition (i.e., the buffers that require block-level collaboration) | |
| blev_buffer_offset = threadIdx.x * BLEV_BUFFERS_PER_THREAD; | |
| #pragma unroll | |
| for (uint32_t i = 0; i < BLEV_BUFFERS_PER_THREAD; i++) | |
| { | |
| if (blev_buffer_offset < num_blev_buffers) | |
| { | |
| BlockBufferOffsetT tile_buffer_id = buffers_by_size_class[blev_buffer_offset].buffer_id; | |
| blev_buffer_srcs[tile_buffer_offset + blev_buffer_offset] = | |
| tile_buffer_srcs[tile_buffer_id]; | |
| blev_buffer_dsts[tile_buffer_offset + blev_buffer_offset] = | |
| tile_buffer_dsts[tile_buffer_id]; | |
| blev_buffer_sizes[tile_buffer_offset + blev_buffer_offset] = | |
| tile_buffer_sizes[tile_buffer_id]; | |
| blev_buffer_tile_offsets[tile_buffer_offset + blev_buffer_offset] = block_offset[i]; | |
| blev_buffer_offset++; | |
| } | |
| } | |
| } | |
| /** | |
| * @brief Read in all the buffers of this tile that require warp-level collaboration and copy | |
| * their bytes to the corresponding destination buffer | |
| */ | |
| __device__ __forceinline__ void BatchMemcpyWLEVBuffers(BufferTuple *buffers_by_size_class, | |
| InputBufferIt tile_buffer_srcs, | |
| OutputBufferIt tile_buffer_dsts, | |
| BufferSizeIteratorT tile_buffer_sizes, | |
| BlockBufferOffsetT num_wlev_buffers) | |
| { | |
| const int32_t warp_id = threadIdx.x / CUB_PTX_WARP_THREADS; | |
| constexpr uint32_t WARPS_PER_BLOCK = BLOCK_THREADS / CUB_PTX_WARP_THREADS; | |
| for (BlockBufferOffsetT buffer_offset = warp_id; buffer_offset < num_wlev_buffers; | |
| buffer_offset += WARPS_PER_BLOCK) | |
| { | |
| const auto buffer_id = buffers_by_size_class[buffer_offset].buffer_id; | |
| copy_items<IsMemcpy, CUB_PTX_WARP_THREADS, InputBufferT, OutputBufferT, BufferSizeT>( | |
| tile_buffer_srcs[buffer_id], | |
| tile_buffer_dsts[buffer_id], | |
| tile_buffer_sizes[buffer_id]); | |
| } | |
| } | |
| /** | |
| * @brief Read in all the buffers of this tile that require thread-level collaboration and copy | |
| * their bytes to the corresponding destination buffer | |
| */ | |
| __device__ __forceinline__ void BatchMemcpyTLEVBuffers(BufferTuple *buffers_by_size_class, | |
| InputBufferIt tile_buffer_srcs, | |
| OutputBufferIt tile_buffer_dsts, | |
| BlockBufferOffsetT num_tlev_buffers) | |
| { | |
| // Read in the buffers' ids that require thread-level collaboration (where buffer id is the | |
| // buffer within this tile) | |
| BlockBufferOffsetT tlev_buffer_ids[TLEV_BUFFERS_PER_THREAD]; | |
| TLevBufferSizeT tlev_buffer_sizes[TLEV_BUFFERS_PER_THREAD]; | |
| // Currently we do not go over the TLEV buffers in multiple iterations, so we need to make sure | |
| // we are able to be covered for the case that all our buffers are TLEV buffers | |
| static_assert(TLEV_BUFFERS_PER_THREAD >= BUFFERS_PER_THREAD, | |
| "Unsupported confiugraiton: The number of 'thread-level buffers' must be at " | |
| "least as large as the number of overall buffers being processed by each " | |
| "thread."); | |
| // Read in the TLEV buffer partition (i.e., the buffers that require thread-level collaboration) | |
| uint32_t tlev_buffer_offset = threadIdx.x * TLEV_BUFFERS_PER_THREAD; | |
| // Pre-populate the buffer sizes to 0 (i.e. zero-padding towards the end) to ensure | |
| // out-of-bounds TLEV buffers will not be considered | |
| #pragma unroll | |
| for (uint32_t i = 0; i < TLEV_BUFFERS_PER_THREAD; i++) | |
| { | |
| tlev_buffer_sizes[i] = 0; | |
| } | |
| // Assign TLEV buffers in a blocked arrangement (each thread is assigned consecutive TLEV | |
| // buffers) | |
| #pragma unroll | |
| for (uint32_t i = 0; i < TLEV_BUFFERS_PER_THREAD; i++) | |
| { | |
| if (tlev_buffer_offset < num_tlev_buffers) | |
| { | |
| tlev_buffer_ids[i] = buffers_by_size_class[tlev_buffer_offset].buffer_id; | |
| tlev_buffer_sizes[i] = buffers_by_size_class[tlev_buffer_offset].size; | |
| } | |
| tlev_buffer_offset++; | |
| } | |
| // Evenly distribute all the bytes that have to be copied from all the buffers that require | |
| // thread-level collaboration using BlockRunLengthDecode | |
| uint32_t num_total_tlev_bytes = 0U; | |
| BlockRunLengthDecodeT block_run_length_decode(temp_storage.staged.tlev.rld_state, | |
| tlev_buffer_ids, | |
| tlev_buffer_sizes, | |
| num_total_tlev_bytes); | |
| // Run-length decode the buffers' sizes into a window buffer of limited size. This is repeated | |
| // until we were able to cover all the bytes of TLEV buffers | |
| uint32_t decoded_window_offset = 0U; | |
| while (decoded_window_offset < num_total_tlev_bytes) | |
| { | |
| BlockBufferOffsetT buffer_id[TLEV_BYTES_PER_THREAD]; | |
| TLevBufferSizeT buffer_byte_offset[TLEV_BYTES_PER_THREAD]; | |
| // Now we have a balanced assignment: buffer_id[i] will hold the tile's buffer id and | |
| // buffer_byte_offset[i] that buffer's byte that this thread supposed to copy | |
| block_run_length_decode.RunLengthDecode(buffer_id, buffer_byte_offset, decoded_window_offset); | |
| // Zip from SoA to AoS | |
| ZippedTLevByteAssignment zipped_byte_assignment[TLEV_BYTES_PER_THREAD]; | |
| #pragma unroll | |
| for (int32_t i = 0; i < TLEV_BYTES_PER_THREAD; i++) | |
| { | |
| zipped_byte_assignment[i] = {buffer_id[i], buffer_byte_offset[i]}; | |
| } | |
| // Exchange from blocked to striped arrangement for coalesced memory reads and writes | |
| BlockExchangeTLevT(temp_storage.staged.tlev.block_exchange_storage) | |
| .BlockedToStriped(zipped_byte_assignment, zipped_byte_assignment); | |
| // Read in the bytes that this thread is assigned to | |
| constexpr uint32_t WINDOW_SIZE = (TLEV_BYTES_PER_THREAD * BLOCK_THREADS); | |
| const bool is_full_window = decoded_window_offset + WINDOW_SIZE < num_total_tlev_bytes; | |
| if (is_full_window) | |
| { | |
| uint32_t absolute_tlev_byte_offset = decoded_window_offset + threadIdx.x; | |
| AliasT src_byte[TLEV_BYTES_PER_THREAD]; | |
| #pragma unroll | |
| for (int32_t i = 0; i < TLEV_BYTES_PER_THREAD; i++) | |
| { | |
| src_byte[i] = read_item<IsMemcpy, AliasT, InputBufferT>( | |
| tile_buffer_srcs[zipped_byte_assignment[i].tile_buffer_id], | |
| zipped_byte_assignment[i].buffer_byte_offset); | |
| absolute_tlev_byte_offset += BLOCK_THREADS; | |
| } | |
| #pragma unroll | |
| for (int32_t i = 0; i < TLEV_BYTES_PER_THREAD; i++) | |
| { | |
| write_item<IsMemcpy, AliasT, OutputBufferT>( | |
| tile_buffer_dsts[zipped_byte_assignment[i].tile_buffer_id], | |
| zipped_byte_assignment[i].buffer_byte_offset, | |
| src_byte[i]); | |
| } | |
| } | |
| else | |
| { | |
| uint32_t absolute_tlev_byte_offset = decoded_window_offset + threadIdx.x; | |
| #pragma unroll | |
| for (int32_t i = 0; i < TLEV_BYTES_PER_THREAD; i++) | |
| { | |
| if (absolute_tlev_byte_offset < num_total_tlev_bytes) | |
| { | |
| const AliasT src_byte = read_item<IsMemcpy, AliasT, InputBufferT>( | |
| tile_buffer_srcs[zipped_byte_assignment[i].tile_buffer_id], | |
| zipped_byte_assignment[i].buffer_byte_offset); | |
| write_item<IsMemcpy, AliasT, OutputBufferT>( | |
| tile_buffer_dsts[zipped_byte_assignment[i].tile_buffer_id], | |
| zipped_byte_assignment[i].buffer_byte_offset, | |
| src_byte); | |
| } | |
| absolute_tlev_byte_offset += BLOCK_THREADS; | |
| } | |
| } | |
| decoded_window_offset += WINDOW_SIZE; | |
| // Ensure all threads finished collaborative BlockExchange so temporary storage can be reused | |
| // with next iteration | |
| CTA_SYNC(); | |
| } | |
| } | |
| //----------------------------------------------------------------------------- | |
| // PUBLIC MEMBER FUNCTIONS | |
| //----------------------------------------------------------------------------- | |
| public: | |
| __device__ __forceinline__ void ConsumeTile(BufferOffsetT tile_id) | |
| { | |
| // Offset into this tile's buffers | |
| BufferOffsetT buffer_offset = tile_id * BUFFERS_PER_BLOCK; | |
| // Indicates whether all of this tiles items are within bounds | |
| bool is_full_tile = buffer_offset + BUFFERS_PER_BLOCK < num_buffers; | |
| // Load the buffer sizes of this tile's buffers | |
| BufferSizeIteratorT tile_buffer_sizes_it = buffer_sizes_it + buffer_offset; | |
| BufferSizeT buffer_sizes[BUFFERS_PER_THREAD]; | |
| if (is_full_tile) | |
| { | |
| LoadBufferSizesFullTile(tile_buffer_sizes_it, buffer_sizes); | |
| } | |
| else | |
| { | |
| LoadBufferSizesPartialTile(tile_buffer_sizes_it, buffer_sizes, num_buffers - buffer_offset); | |
| } | |
| // Ensure we can repurpose the BlockLoad's temporary storage | |
| CTA_SYNC(); | |
| // Count how many buffers fall into each size-class | |
| VectorizedSizeClassCounterT size_class_histogram = GetBufferSizeClassHistogram(buffer_sizes); | |
| // Compute the prefix sum over the histogram | |
| VectorizedSizeClassCounterT size_class_agg = {}; | |
| BlockSizeClassScanT(temp_storage.size_scan_storage) | |
| .ExclusiveSum(size_class_histogram, size_class_histogram, size_class_agg); | |
| // Ensure we can repurpose the scan's temporary storage for scattering the buffer ids | |
| CTA_SYNC(); | |
| // Factor in the per-size-class counts / offsets | |
| // That is, WLEV buffer offset has to be offset by the TLEV buffer count and BLEV buffer offset | |
| // has to be offset by the TLEV+WLEV buffer count | |
| uint32_t buffer_count = 0U; | |
| for (uint32_t i = 0; i < NUM_SIZE_CLASSES; i++) | |
| { | |
| size_class_histogram.Add(i, buffer_count); | |
| buffer_count += size_class_agg.Get(i); | |
| } | |
| // Signal the number of BLEV buffers we're planning to write out | |
| BufferOffsetT buffer_exclusive_prefix = 0; | |
| if (tile_id == 0) | |
| { | |
| if (threadIdx.x == 0) | |
| { | |
| blev_buffer_scan_state.SetInclusive(tile_id, size_class_agg.Get(BLEV_SIZE_CLASS)); | |
| } | |
| buffer_exclusive_prefix = 0; | |
| } | |
| else | |
| { | |
| BLevBuffScanPrefixCallbackOpT blev_buffer_prefix_op(blev_buffer_scan_state, | |
| temp_storage.buffer_scan_callback, | |
| Sum(), | |
| tile_id); | |
| // Signal our partial prefix and wait for the inclusive prefix of previous tiles | |
| if (threadIdx.x < CUB_PTX_WARP_THREADS) | |
| { | |
| buffer_exclusive_prefix = blev_buffer_prefix_op(size_class_agg.Get(BLEV_SIZE_CLASS)); | |
| } | |
| } | |
| if (threadIdx.x == 0) | |
| { | |
| temp_storage.blev_buffer_offset = buffer_exclusive_prefix; | |
| } | |
| // Ensure the prefix callback has finished using its temporary storage and that it can be reused | |
| // in the next stage | |
| CTA_SYNC(); | |
| // Scatter the buffers into one of the three partitions (TLEV, WLEV, BLEV) depending on their | |
| // size | |
| PartitionBuffersBySize(buffer_sizes, | |
| size_class_histogram, | |
| temp_storage.staged.buffers_by_size_class); | |
| // Ensure all buffers have been partitioned by their size class AND | |
| // ensure that blev_buffer_offset has been written to shared memory | |
| CTA_SYNC(); | |
| // TODO: think about prefetching tile_buffer_{srcs,dsts} into shmem | |
| InputBufferIt tile_buffer_srcs = input_buffer_it + buffer_offset; | |
| OutputBufferIt tile_buffer_dsts = output_buffer_it + buffer_offset; | |
| BufferSizeIteratorT tile_buffer_sizes = buffer_sizes_it + buffer_offset; | |
| // Copy block-level buffers | |
| EnqueueBLEVBuffers( | |
| &temp_storage.staged.buffers_by_size_class[size_class_agg.Get(TLEV_SIZE_CLASS) + | |
| size_class_agg.Get(WLEV_SIZE_CLASS)], | |
| tile_buffer_srcs, | |
| tile_buffer_dsts, | |
| tile_buffer_sizes, | |
| size_class_agg.Get(BLEV_SIZE_CLASS), | |
| temp_storage.blev_buffer_offset, | |
| tile_id); | |
| // Ensure we can repurpose the temporary storage required by EnqueueBLEVBuffers | |
| CTA_SYNC(); | |
| // Copy warp-level buffers | |
| BatchMemcpyWLEVBuffers( | |
| &temp_storage.staged.buffers_by_size_class[size_class_agg.Get(TLEV_SIZE_CLASS)], | |
| tile_buffer_srcs, | |
| tile_buffer_dsts, | |
| tile_buffer_sizes, | |
| size_class_agg.Get(WLEV_SIZE_CLASS)); | |
| // Perform batch memcpy for all the buffers that require thread-level collaboration | |
| uint32_t num_tlev_buffers = size_class_agg.Get(TLEV_SIZE_CLASS); | |
| BatchMemcpyTLEVBuffers(temp_storage.staged.buffers_by_size_class, | |
| tile_buffer_srcs, | |
| tile_buffer_dsts, | |
| num_tlev_buffers); | |
| } | |
| //----------------------------------------------------------------------------- | |
| // CONSTRUCTOR | |
| //----------------------------------------------------------------------------- | |
| __device__ __forceinline__ AgentBatchMemcpy(TempStorage &temp_storage, | |
| InputBufferIt input_buffer_it, | |
| OutputBufferIt output_buffer_it, | |
| BufferSizeIteratorT buffer_sizes_it, | |
| BufferOffsetT num_buffers, | |
| BlevBufferSrcsOutItT blev_buffer_srcs, | |
| BlevBufferDstsOutItT blev_buffer_dsts, | |
| BlevBufferSizesOutItT blev_buffer_sizes, | |
| BlevBufferTileOffsetsOutItT blev_buffer_tile_offsets, | |
| BLevBufferOffsetTileState blev_buffer_scan_state, | |
| BLevBlockOffsetTileState blev_block_scan_state) | |
| : temp_storage(temp_storage.Alias()) | |
| , input_buffer_it(input_buffer_it) | |
| , output_buffer_it(output_buffer_it) | |
| , buffer_sizes_it(buffer_sizes_it) | |
| , num_buffers(num_buffers) | |
| , blev_buffer_srcs(blev_buffer_srcs) | |
| , blev_buffer_dsts(blev_buffer_dsts) | |
| , blev_buffer_sizes(blev_buffer_sizes) | |
| , blev_buffer_tile_offsets(blev_buffer_tile_offsets) | |
| , blev_buffer_scan_state(blev_buffer_scan_state) | |
| , blev_block_scan_state(blev_block_scan_state) | |
| {} | |
| private: | |
| // Iterator providing the pointers to the source memory buffers | |
| InputBufferIt input_buffer_it; | |
| // Iterator providing the pointers to the destination memory buffers | |
| OutputBufferIt output_buffer_it; | |
| // Iterator providing the number of bytes to be copied for each pair of buffers | |
| BufferSizeIteratorT buffer_sizes_it; | |
| // The total number of buffer pairs | |
| BufferOffsetT num_buffers; | |
| // Output iterator to which the source pointers of the BLEV buffers are written | |
| BlevBufferSrcsOutItT blev_buffer_srcs; | |
| // Output iterator to which the destination pointers of the BLEV buffers are written | |
| BlevBufferDstsOutItT blev_buffer_dsts; | |
| // Output iterator to which the number of bytes to be copied of the BLEV buffers are written | |
| BlevBufferSizesOutItT blev_buffer_sizes; | |
| // Output iterator to which the mapping of tiles to BLEV buffers is written | |
| BlevBufferTileOffsetsOutItT blev_buffer_tile_offsets; | |
| // The single-pass prefix scan's tile state used for tracking the prefix sum over the number of | |
| // BLEV buffers | |
| BLevBufferOffsetTileState blev_buffer_scan_state; | |
| // The single-pass prefix scan's tile state used for tracking the prefix sum over tiles of BLEV | |
| // buffers | |
| BLevBlockOffsetTileState blev_block_scan_state; | |
| }; | |
| } // namespace detail | |
| CUB_NAMESPACE_END | |