thrust / install /include /cub /agent /agent_batch_memcpy.cuh

thanks to nvidia ❤

0dc1b04 over 2 years ago

51.7 kB

	/******************************************************************************
	* Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions are met:
	* * Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* * Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	* * Neither the name of the NVIDIA CORPORATION nor the
	* names of its contributors may be used to endorse or promote products
	* derived from this software without specific prior written permission.
	*
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
	* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
	* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
	* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
	* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
	* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*
	******************************************************************************/

	/**
	* \file
	* cub::AgentBatchMemcpy implements device-wide copying of a batch of device-accessible
	* source-buffers to device-accessible destination-buffers.
	*/

	#pragma once

	#include <cub/agent/single_pass_scan_operators.cuh>
	#include <cub/block/block_exchange.cuh>
	#include <cub/block/block_load.cuh>
	#include <cub/block/block_run_length_decode.cuh>
	#include <cub/block/block_scan.cuh>
	#include <cub/block/block_store.cuh>
	#include <cub/util_ptx.cuh>
	#include <cub/util_type.cuh>

	#include <cuda/std/type_traits>

	#include <cstdint>

	CUB_NAMESPACE_BEGIN

	namespace detail
	{
	/**
	 * @brief Reads four consecutive 32-bit words starting at \p aligned_ptr into \p data_out. If
	 * \p PTR_IS_FOUR_BYTE_ALIGNED is false, a fifth word is read as well and each output word is
	 * formed by funnel-shifting a pair of neighbouring words to the right by \p bit_shift bits.
	 *
	 * @tparam PTR_IS_FOUR_BYTE_ALIGNED If true, the loaded words are returned unchanged and
	 * \p bit_shift is not used
	 * @param aligned_ptr Pointer to the first 32-bit word to read
	 * @param bit_shift Shift amount (in bits) handed to `__funnelshift_r`
	 * @param data_out Receives the four resulting words
	 */
	template <bool PTR_IS_FOUR_BYTE_ALIGNED>
	__forceinline__ __device__ void LoadVectorAndFunnelShiftR(uint32_t const *aligned_ptr,
	                                                          uint32_t bit_shift,
	                                                          uint4 &data_out)
	{
	  const uint32_t word0 = aligned_ptr[0];
	  const uint32_t word1 = aligned_ptr[1];
	  const uint32_t word2 = aligned_ptr[2];
	  const uint32_t word3 = aligned_ptr[3];

	  if (PTR_IS_FOUR_BYTE_ALIGNED)
	  {
	    // Nothing to shift: hand the words back as they were loaded
	    data_out.x = word0;
	    data_out.y = word1;
	    data_out.z = word2;
	    data_out.w = word3;
	  }
	  else
	  {
	    // The last output word needs bits from one additional trailing word
	    const uint32_t word4 = aligned_ptr[4];
	    data_out.x           = __funnelshift_r(word0, word1, bit_shift);
	    data_out.y           = __funnelshift_r(word1, word2, bit_shift);
	    data_out.z           = __funnelshift_r(word2, word3, bit_shift);
	    data_out.w           = __funnelshift_r(word3, word4, bit_shift);
	  }
	}

	/**
	 * @brief Reads two consecutive 32-bit words starting at \p aligned_ptr into \p data_out. If
	 * \p PTR_IS_FOUR_BYTE_ALIGNED is false, a third word is read as well and each output word is
	 * formed by funnel-shifting a pair of neighbouring words to the right by \p bit_shift bits.
	 *
	 * @tparam PTR_IS_FOUR_BYTE_ALIGNED If true, the loaded words are returned unchanged and
	 * \p bit_shift is not used
	 * @param aligned_ptr Pointer to the first 32-bit word to read
	 * @param bit_shift Shift amount (in bits) handed to `__funnelshift_r`
	 * @param data_out Receives the two resulting words
	 */
	template <bool PTR_IS_FOUR_BYTE_ALIGNED>
	__forceinline__ __device__ void LoadVectorAndFunnelShiftR(uint32_t const *aligned_ptr,
	                                                          uint32_t bit_shift,
	                                                          uint2 &data_out)
	{
	  const uint32_t word0 = aligned_ptr[0];
	  const uint32_t word1 = aligned_ptr[1];

	  if (PTR_IS_FOUR_BYTE_ALIGNED)
	  {
	    // Nothing to shift: hand the words back as they were loaded
	    data_out.x = word0;
	    data_out.y = word1;
	  }
	  else
	  {
	    // The last output word needs bits from one additional trailing word
	    const uint32_t word2 = aligned_ptr[2];
	    data_out.x           = __funnelshift_r(word0, word1, bit_shift);
	    data_out.y           = __funnelshift_r(word1, word2, bit_shift);
	  }
	}

	/**
	 * @brief Reads one 32-bit word at \p aligned_ptr into \p data_out. If
	 * \p PTR_IS_FOUR_BYTE_ALIGNED is false, the following word is read as well and the output is
	 * formed by funnel-shifting the two words to the right by \p bit_shift bits.
	 *
	 * @tparam PTR_IS_FOUR_BYTE_ALIGNED If true, the loaded word is returned unchanged and
	 * \p bit_shift is not used
	 * @param aligned_ptr Pointer to the first 32-bit word to read
	 * @param bit_shift Shift amount (in bits) handed to `__funnelshift_r`
	 * @param data_out Receives the resulting word
	 */
	template <bool PTR_IS_FOUR_BYTE_ALIGNED>
	__forceinline__ __device__ void LoadVectorAndFunnelShiftR(uint32_t const *aligned_ptr,
	                                                          uint32_t bit_shift,
	                                                          uint32_t &data_out)
	{
	  const uint32_t word0 = aligned_ptr[0];

	  if (PTR_IS_FOUR_BYTE_ALIGNED)
	  {
	    // Nothing to shift: hand the word back as it was loaded
	    data_out = word0;
	  }
	  else
	  {
	    // The output word needs bits from one additional trailing word
	    const uint32_t word1 = aligned_ptr[1];
	    data_out             = __funnelshift_r(word0, word1, bit_shift);
	  }
	}

	/**
	 * @brief Loads `sizeof(VectorT)` bytes starting at \p ptr into \p data_out without requiring
	 * \p ptr to be aligned.
	 * @note If \p ptr isn't aligned to four bytes, the bytes from the last four-byte aligned address
	 * up to \p ptr are loaded too (but dropped) and, hence, need to be device-accessible. Similarly,
	 * if \p ptr isn't aligned to four bytes, the bytes from `(ptr + sizeof(VectorT))` up to the
	 * following four-byte aligned address are loaded too (but dropped), and, hence, need to be
	 * device-accessible.
	 *
	 * @tparam VectorT The vector type to load (i.e., one of uint4, uint2, uint32_t)
	 * @param ptr The pointer from which the data is supposed to be loaded
	 * @param data_out The vector type that stores the data loaded from \p ptr
	 */
	template <typename VectorT>
	__forceinline__ __device__ void LoadVector(const char *ptr, VectorT &data_out)
	{
	  // Number of bytes by which `ptr` exceeds the preceding four-byte aligned address
	  const uint32_t offset = reinterpret_cast<std::uintptr_t>(ptr) % 4U;

	  // Four-byte aligned address at or before `ptr`, aliased as 32-bit words
	  const uint32_t *aligned_ptr = reinterpret_cast<uint32_t const *>(ptr - offset);

	  // Funnel-shift amount that drops the `offset` leading bytes of each loaded word
	  constexpr uint32_t bits_per_byte = 8U;
	  const uint32_t bit_shift         = offset * bits_per_byte;

	  // If `ptr` is aligned to four bytes, we can perform a simple uint32_t-aliased load
	  if (offset == 0)
	  {
	    LoadVectorAndFunnelShiftR<true>(aligned_ptr, bit_shift, data_out);
	  }
	  // Otherwise, we need to load extra bytes and perform funnel-shifting
	  else
	  {
	    LoadVectorAndFunnelShiftR<false>(aligned_ptr, bit_shift, data_out);
	  }
	}

	/**
	* @brief Helper data structure to hold information on the byte range for which we can safely
	* perform vectorized copies.
	*
	* @tparam VectorT The vector type used for vectorized stores (i.e., one of uint4, uint2, uint32_t)
	*/
	template <typename VectorT>
	struct PointerRange
	{
	// First destination element that is written with a vectorized store
	VectorT *out_begin;
	// One past the last destination element that is written with a vectorized store
	VectorT *out_end;
	// Source byte that corresponds to `out_begin`
	const char *in_begin;
	// Source byte that corresponds to `out_end`
	const char *in_end;
	};

	/**
	 * @brief Determines the sub-range of `[out_begin, out_begin + num_bytes)` that can be written
	 * with VectorT-aligned stores while the matching input bytes can be fetched with `LoadVector`.
	 *
	 * Both `out_start_aligned` and `out_end_aligned` are byte offsets relative to the
	 * VectorT-aligned address at or before `out_begin`. The start of the range is pushed forward by
	 * the input pointer's misalignment and the end of the range is pulled back by the padding that
	 * `LoadVector` reads past the last copied byte, so that `LoadVector` does not read beyond the
	 * data boundaries. If the resulting range would be empty, the returned end equals the returned
	 * begin.
	 *
	 * @tparam VectorT The vector type used for vectorized stores (i.e., one of uint4, uint2,
	 * uint32_t)
	 * @tparam ByteOffsetT Type used to index the bytes within the buffers
	 * @param in_begin Pointer to the beginning of the byte range that shall be copied from
	 * @param out_begin Pointer to the beginning of the byte range that shall be copied to
	 * @param num_bytes Number of bytes that shall be copied
	 * @return The byte range that can safely be copied using vectorized stores of type VectorT
	 */
	template <typename VectorT, typename ByteOffsetT>
	__device__ __forceinline__ PointerRange<VectorT> GetAlignedPtrs(const void *in_begin,
	                                                                void *out_begin,
	                                                                ByteOffsetT num_bytes)
	{
	  // Data type size used for vectorized stores
	  constexpr size_t out_datatype_size = sizeof(VectorT);
	  // Data type size used for type-aliased loads
	  constexpr size_t in_datatype_size = sizeof(uint32_t);

	  // char-aliased ptrs to simplify pointer arithmetic
	  char *out_ptr      = reinterpret_cast<char *>(out_begin);
	  const char *in_ptr = reinterpret_cast<const char *>(in_begin);

	  // Number of bytes between the first VectorT-aligned address at or before out_begin and out_begin
	  const uint32_t alignment_offset = reinterpret_cast<std::uintptr_t>(out_ptr) % out_datatype_size;

	  // The first VectorT-aligned address before (or at) out_begin
	  char *out_chars_aligned = reinterpret_cast<char *>(out_ptr - alignment_offset);

	  // The number of extra bytes preceding `in_ptr` that are loaded but dropped
	  uint32_t in_extra_bytes = reinterpret_cast<std::uintptr_t>(in_ptr) % in_datatype_size;

	  // The offset required by `LoadVector`:
	  // If the input pointer is not aligned, we load data from the last aligned address preceding the
	  // pointer. That is, loading up to (in_datatype_size-1) bytes before `in_ptr`
	  uint32_t in_offset_req = in_extra_bytes;

	  // Bytes after `out_chars_aligned` to the first VectorT-aligned address at or after `out_begin`
	  uint32_t out_start_aligned =
	    CUB_QUOTIENT_CEILING(in_offset_req + alignment_offset, out_datatype_size) * out_datatype_size;

	  // Compute the beginning of the aligned ranges (output and input pointers)
	  VectorT *out_aligned_begin   = reinterpret_cast<VectorT *>(out_chars_aligned + out_start_aligned);
	  const char *in_aligned_begin = in_ptr + (reinterpret_cast<char *>(out_aligned_begin) - out_ptr);

	  // If the aligned range is not aligned for the input pointer, we load up to (in_datatype_size-1)
	  // bytes after the last byte that is copied. That is, we always load four bytes up to the next
	  // aligned input address at a time. E.g., if the last byte loaded is one byte past the last
	  // aligned address we'll also load the three bytes after that byte.
	  uint32_t in_extra_bytes_from_aligned =
	    (reinterpret_cast<std::uintptr_t>(in_aligned_begin) % in_datatype_size);
	  uint32_t in_end_padding_req = (in_datatype_size - in_extra_bytes_from_aligned) % in_datatype_size;

	  // Bytes after `out_chars_aligned` to the last VectorT-aligned
	  // address at (or before) `out_begin` + `num_bytes`
	  uint32_t out_end_aligned{};
	  if (in_end_padding_req + alignment_offset > num_bytes)
	  {
	    // Not enough bytes to leave room for the padding: make the aligned range empty
	    out_end_aligned = out_start_aligned;
	  }
	  else
	  {
	    out_end_aligned = (num_bytes - in_end_padding_req + alignment_offset) / out_datatype_size *
	                      out_datatype_size;
	  }

	  VectorT *out_aligned_end   = reinterpret_cast<VectorT *>(out_chars_aligned + out_end_aligned);
	  const char *in_aligned_end = in_ptr + (reinterpret_cast<char *>(out_aligned_end) - out_ptr);

	  return {out_aligned_begin, out_aligned_end, in_aligned_begin, in_aligned_end};
	}

	/**
	 * @brief Cooperatively copies \p num_bytes from \p src to \p dest using vectorized stores of type
	 * \p VectorT for addresses within [dest, dest + num_bytes) that are aligned to \p VectorT. A
	 * byte-wise copy is used for byte-ranges that are not aligned to \p VectorT.
	 *
	 * @tparam LOGICAL_WARP_SIZE The number of threads cooperating to copy the data; all threads
	 * within [0, `LOGICAL_WARP_SIZE`) must invoke this method with the same arguments
	 * @tparam VectorT The vector type used for vectorized stores (i.e., one of uint4, uint2,
	 * uint32_t)
	 * @tparam ByteOffsetT Type used to index the bytes within the buffers
	 * @param thread_rank The thread rank within the group that cooperates to copy the data; must be
	 * within [0, `LOGICAL_WARP_SIZE`)
	 * @param dest Pointer to the memory location to copy to
	 * @param num_bytes Number of bytes to copy
	 * @param src Pointer to the memory location to copy from
	 */
	template <int LOGICAL_WARP_SIZE, typename VectorT, typename ByteOffsetT>
	__device__ __forceinline__ void
	VectorizedCopy(int32_t thread_rank, void *dest, ByteOffsetT num_bytes, const void *src)
	{
	  char *out_ptr      = reinterpret_cast<char *>(dest);
	  const char *in_ptr = reinterpret_cast<const char *>(src);

	  // Gets the byte range that can safely be copied using vectorized stores of type VectorT
	  auto aligned_range = GetAlignedPtrs<VectorT>(src, dest, num_bytes);

	  // If byte range for which we can use vectorized copies is empty -> use byte-wise copies
	  if (aligned_range.out_end <= aligned_range.out_begin)
	  {
	    for (ByteOffsetT ichar = thread_rank; ichar < num_bytes; ichar += LOGICAL_WARP_SIZE)
	    {
	      out_ptr[ichar] = in_ptr[ichar];
	    }
	  }
	  else
	  {
	    // Copy bytes in range `[dest, aligned_range.out_begin)`
	    out_ptr += thread_rank;
	    in_ptr += thread_rank;
	    while (out_ptr < reinterpret_cast<char *>(aligned_range.out_begin))
	    {
	      *out_ptr = *in_ptr;
	      out_ptr += LOGICAL_WARP_SIZE;
	      in_ptr += LOGICAL_WARP_SIZE;
	    }

	    // Copy bytes in range `[aligned_range.out_begin, aligned_range.out_end)`; each thread
	    // handles every LOGICAL_WARP_SIZE-th VectorT element
	    VectorT *aligned_range_begin = aligned_range.out_begin + thread_rank;
	    const char *in_aligned_begin = aligned_range.in_begin + thread_rank * sizeof(VectorT);
	    while (aligned_range_begin < aligned_range.out_end)
	    {
	      VectorT data_in;
	      LoadVector(in_aligned_begin, data_in);
	      *aligned_range_begin = data_in;
	      in_aligned_begin += sizeof(VectorT) * LOGICAL_WARP_SIZE;
	      aligned_range_begin += LOGICAL_WARP_SIZE;
	    }

	    // Copy bytes in range `[aligned_range.out_end, dest + num_bytes)`.
	    out_ptr = reinterpret_cast<char *>(aligned_range.out_end) + thread_rank;
	    in_ptr  = aligned_range.in_end + thread_rank;
	    while (out_ptr < reinterpret_cast<char *>(dest) + num_bytes)
	    {
	      *out_ptr = *in_ptr;
	      out_ptr += LOGICAL_WARP_SIZE;
	      in_ptr += LOGICAL_WARP_SIZE;
	    }
	  }
	}

	/**
	 * @brief Overload selected when \p IsMemcpy is true: treats both buffers as raw bytes and
	 * forwards the copy of \p num_bytes bytes, starting \p offset bytes into each buffer, to
	 * `VectorizedCopy` using `uint4` as the vector type.
	 *
	 * @tparam LOGICAL_WARP_SIZE Number of cooperating threads; the calling thread's rank is
	 * `threadIdx.x % LOGICAL_WARP_SIZE`
	 * @param input_buffer Source buffer, reinterpreted as `const char *`
	 * @param output_buffer Destination buffer, reinterpreted as `char *`
	 * @param num_bytes Number of bytes to copy
	 * @param offset Byte offset applied to both buffers before copying
	 */
	template <bool IsMemcpy,
	          uint32_t LOGICAL_WARP_SIZE,
	          typename InputBufferT,
	          typename OutputBufferT,
	          typename OffsetT,
	          typename ::cuda::std::enable_if<IsMemcpy, int>::type = 0>
	__device__ __forceinline__ void copy_items(InputBufferT input_buffer,
	                                           OutputBufferT output_buffer,
	                                           OffsetT num_bytes,
	                                           OffsetT offset = 0)
	{
	  const int32_t lane_rank = static_cast<int32_t>(threadIdx.x % LOGICAL_WARP_SIZE);
	  char *dst_bytes         = reinterpret_cast<char *>(output_buffer) + offset;
	  const char *src_bytes   = reinterpret_cast<const char *>(input_buffer) + offset;

	  VectorizedCopy<LOGICAL_WARP_SIZE, uint4>(lane_rank, dst_bytes, num_bytes, src_bytes);
	}

	/**
	 * @brief Overload selected when \p IsMemcpy is false: copies \p num_items items element-wise
	 * from \p input_buffer to \p output_buffer, starting \p offset items into each of them. The
	 * calling thread copies the items at positions `threadIdx.x % LOGICAL_WARP_SIZE`,
	 * `+ LOGICAL_WARP_SIZE`, `+ 2 * LOGICAL_WARP_SIZE`, ...
	 *
	 * @tparam LOGICAL_WARP_SIZE Number of cooperating threads (the stride between the items handled
	 * by one thread)
	 * @param input_buffer Iterator to the source items
	 * @param output_buffer Iterator to the destination items
	 * @param num_items Number of items to copy
	 * @param offset Item offset applied to both iterators before copying
	 */
	template <bool IsMemcpy,
	          uint32_t LOGICAL_WARP_SIZE,
	          typename InputBufferT,
	          typename OutputBufferT,
	          typename OffsetT,
	          typename ::cuda::std::enable_if<!IsMemcpy, int>::type = 0>
	__device__ __forceinline__ void copy_items(InputBufferT input_buffer,
	                                           OutputBufferT output_buffer,
	                                           OffsetT num_items,
	                                           OffsetT offset = 0)
	{
	  output_buffer += offset;
	  input_buffer += offset;
	  for (OffsetT i = threadIdx.x % LOGICAL_WARP_SIZE; i < num_items; i += LOGICAL_WARP_SIZE)
	  {
	    // Dereference both iterators: the item itself is copied, not the iterator
	    *(output_buffer + i) = *(input_buffer + i);
	  }
	}

	/**
	 * @brief Overload selected when \p IsMemcpy is true: reinterprets \p buffer_src as a pointer to
	 * `const AliasT` and returns the element \p offset positions past it.
	 *
	 * @tparam AliasT Type through which the source memory is read
	 * @param buffer_src Source buffer
	 * @param offset Index (in units of AliasT) of the element to read
	 * @return The element read from the source buffer
	 */
	template <bool IsMemcpy,
	          typename AliasT,
	          typename InputIt,
	          typename OffsetT,
	          typename ::cuda::std::enable_if<IsMemcpy, int>::type = 0>
	__device__ __forceinline__ AliasT read_item(InputIt buffer_src, OffsetT offset)
	{
	  return *(reinterpret_cast<const AliasT *>(buffer_src) + offset);
	}

	/**
	 * @brief Overload selected when \p IsMemcpy is false: dereferences the iterator \p buffer_src
	 * advanced by \p offset and returns the result converted to `AliasT`.
	 *
	 * @tparam AliasT Return type of the read
	 * @param buffer_src Iterator to the source items
	 * @param offset Number of positions to advance \p buffer_src before dereferencing
	 * @return The item found at `buffer_src + offset`
	 */
	template <bool IsMemcpy,
	          typename AliasT,
	          typename InputIt,
	          typename OffsetT,
	          typename ::cuda::std::enable_if<!IsMemcpy, int>::type = 0>
	__device__ __forceinline__ AliasT read_item(InputIt buffer_src, OffsetT offset)
	{
	  AliasT item = *(buffer_src + offset);
	  return item;
	}

	/**
	 * @brief Overload selected when \p IsMemcpy is true: reinterprets \p buffer_dst as a pointer to
	 * `AliasT` and stores \p value at the element \p offset positions past it.
	 *
	 * @tparam AliasT Type through which the destination memory is written
	 * @param buffer_dst Destination buffer
	 * @param offset Index (in units of AliasT) of the element to write
	 * @param value The value to store
	 */
	template <bool IsMemcpy,
	          typename AliasT,
	          typename OutputIt,
	          typename OffsetT,
	          typename ::cuda::std::enable_if<IsMemcpy, int>::type = 0>
	__device__ __forceinline__ void write_item(OutputIt buffer_dst, OffsetT offset, AliasT value)
	{
	  *(reinterpret_cast<AliasT *>(buffer_dst) + offset) = value;
	}

	/**
	* @brief Overload selected when \p IsMemcpy is false: assigns \p value to the result of
	* dereferencing the iterator \p buffer_dst advanced by \p offset (no byte-aliasing is applied).
	*
	* @param buffer_dst Iterator to the destination items
	* @param offset Number of positions to advance \p buffer_dst before dereferencing
	* @param value The value to assign
	*/
	template <bool IsMemcpy,
	typename AliasT,
	typename OutputIt,
	typename OffsetT,
	typename ::cuda::std::enable_if<!IsMemcpy, int>::type = 0>
	__device__ __forceinline__ void write_item(OutputIt buffer_dst, OffsetT offset, AliasT value)
	{
	*(buffer_dst + offset) = value;
	}

	/**
	 * @brief A helper class that allows threads to maintain multiple counters, where the counter that
	 * shall be incremented can be addressed dynamically without incurring register spillage.
	 *
	 * The counters are packed, `BITS_PER_ITEM` bits each, into an array of `BackingUnitT` units.
	 * `Get` and `Add` visit every unit in a fully unrolled loop and rely on shift amounts that fall
	 * outside a unit producing a zero contribution, so that the unit array is never indexed
	 * dynamically.
	 *
	 * @tparam NUM_ITEMS The number of counters to allocate
	 * @tparam MAX_ITEM_VALUE The maximum count that must be supported.
	 * @tparam PREFER_POW2_BITS Whether the number of bits to dedicate to each counter should be a
	 * power-of-two. If enabled, this allows replacing integer multiplication with a bit-shift in
	 * exchange for higher register pressure.
	 * @tparam BackingUnitT The data type that is used to provide the bits of all the counters that
	 * shall be allocated.
	 */
	template <uint32_t NUM_ITEMS,
	          uint32_t MAX_ITEM_VALUE,
	          bool PREFER_POW2_BITS,
	          typename BackingUnitT = uint32_t>
	class BitPackedCounter
	{
	private:
	  /// The minimum number of bits required to represent all values from [0, MAX_ITEM_VALUE]
	  static constexpr uint32_t MIN_BITS_PER_ITEM =
	    (MAX_ITEM_VALUE == 0U) ? 1U : cub::Log2<static_cast<int32_t>(MAX_ITEM_VALUE + 1U)>::VALUE;

	  /// The number of bits allocated for each item. For pre-Volta, we prefer a power-of-2 here to
	  /// have the compiler replace costly integer multiplication with bit-shifting.
	  static constexpr uint32_t BITS_PER_ITEM =
	    PREFER_POW2_BITS ? (0x01ULL << (cub::Log2<static_cast<int32_t>(MIN_BITS_PER_ITEM)>::VALUE))
	                     : MIN_BITS_PER_ITEM;

	  /// The number of bits that each backing data type can store
	  static constexpr uint32_t NUM_BITS_PER_UNIT = sizeof(BackingUnitT) * 8;

	  /// The number of items that each backing data type can store
	  static constexpr uint32_t ITEMS_PER_UNIT = NUM_BITS_PER_UNIT / BITS_PER_ITEM;

	  /// The number of bits the backing data type is actually making use of
	  static constexpr uint32_t USED_BITS_PER_UNIT = ITEMS_PER_UNIT * BITS_PER_ITEM;

	  /// The number of backing data types required to store the given number of items
	  static constexpr uint32_t NUM_TOTAL_UNITS = CUB_QUOTIENT_CEILING(NUM_ITEMS, ITEMS_PER_UNIT);

	  /// This is the net number of bit-storage provided by each unit (remainder bits are unused)
	  static constexpr uint32_t UNIT_MASK = (USED_BITS_PER_UNIT >= (8U * sizeof(uint32_t)))
	                                          ? 0xFFFFFFFF
	                                          : (0x01U << USED_BITS_PER_UNIT) - 1;
	  /// This is the bit-mask for each item
	  static constexpr uint32_t ITEM_MASK = (BITS_PER_ITEM >= (8U * sizeof(uint32_t)))
	                                          ? 0xFFFFFFFF
	                                          : (0x01U << BITS_PER_ITEM) - 1;

	  //------------------------------------------------------------------------------
	  // ACCESSORS
	  //------------------------------------------------------------------------------
	public:
	  /**
	   * @brief Returns the current value of the counter at position \p index.
	   */
	  __device__ __forceinline__ uint32_t Get(uint32_t index) const
	  {
	    const uint32_t target_offset = index * BITS_PER_ITEM;
	    uint32_t val                 = 0;

	#pragma unroll
	    for (uint32_t i = 0; i < NUM_TOTAL_UNITS; ++i)
	    {
	      // In case the bit-offset of the counter at <index> is larger than the bit range of the
	      // current unit, the bit_shift amount will be larger than the bits provided by this unit. As
	      // C++'s bit-shift has undefined behaviour if the bits being shifted exceed the operand width,
	      // we use the PTX instruction `shr` to make sure behaviour is well-defined.
	      // Negative bit-shift amounts wrap around in unsigned integer math and are ultimately clamped.
	      const uint32_t bit_shift = target_offset - i * USED_BITS_PER_UNIT;
	      val |= detail::LogicShiftRight(data[i], bit_shift) & ITEM_MASK;
	    }
	    return val;
	  }

	  /**
	   * @brief Adds \p value to the counter at position \p index.
	   */
	  __device__ __forceinline__ void Add(uint32_t index, uint32_t value)
	  {
	    const uint32_t target_offset = index * BITS_PER_ITEM;

	#pragma unroll
	    for (uint32_t i = 0; i < NUM_TOTAL_UNITS; ++i)
	    {
	      // In case the bit-offset of the counter at <index> is larger than the bit range of the
	      // current unit, the bit_shift amount will be larger than the bits provided by this unit. As
	      // C++'s bit-shift has undefined behaviour if the bits being shifted exceed the operand width,
	      // we use the PTX instruction `shl` to make sure behaviour is well-defined.
	      // Negative bit-shift amounts wrap around in unsigned integer math and are ultimately clamped.
	      const uint32_t bit_shift = target_offset - i * USED_BITS_PER_UNIT;
	      data[i] += detail::LogicShiftLeft(value, bit_shift) & UNIT_MASK;
	    }
	  }

	  /**
	   * @brief Returns a counter whose backing units are the unit-wise sums of this counter's and
	   * \p rhs's backing units.
	   */
	  __device__ BitPackedCounter operator+(const BitPackedCounter &rhs) const
	  {
	    BitPackedCounter result;
	#pragma unroll
	    for (uint32_t i = 0; i < NUM_TOTAL_UNITS; ++i)
	    {
	      result.data[i] = data[i] + rhs.data[i];
	    }
	    return result;
	  }

	  //------------------------------------------------------------------------------
	  // MEMBER VARIABLES
	  //------------------------------------------------------------------------------
	private:
	  /// Backing storage for all counters; zero-initialized
	  BackingUnitT data[NUM_TOTAL_UNITS] = {};
	};

	/**
	* Parameterizable tuning policy type for AgentBatchMemcpy. Each template argument is re-exported
	* unchanged as a static constant of the same name without the leading underscore.
	*/
	template <uint32_t _BLOCK_THREADS,
	uint32_t _BUFFERS_PER_THREAD,
	uint32_t _TLEV_BYTES_PER_THREAD,
	bool _PREFER_POW2_BITS,
	uint32_t _BLOCK_LEVEL_TILE_SIZE>
	struct AgentBatchMemcpyPolicy
	{
	/// Threads per thread block
	static constexpr uint32_t BLOCK_THREADS = _BLOCK_THREADS;
	/// Items per thread (per tile of input)
	static constexpr uint32_t BUFFERS_PER_THREAD = _BUFFERS_PER_THREAD;
	/// The number of bytes that each thread will work on with each iteration of reading in bytes
	/// from one or more source-buffers and writing them out to the respective
	/// destination-buffers.
	static constexpr uint32_t TLEV_BYTES_PER_THREAD = _TLEV_BYTES_PER_THREAD;
	/// Whether the BitPackedCounter should prefer allocating a power-of-2 number of bits per
	/// counter
	static constexpr uint32_t PREFER_POW2_BITS = _PREFER_POW2_BITS;
	/// BLEV tile size granularity
	static constexpr uint32_t BLOCK_LEVEL_TILE_SIZE = _BLOCK_LEVEL_TILE_SIZE;
	};

	template <typename AgentMemcpySmallBuffersPolicyT,
	typename InputBufferIt,
	typename OutputBufferIt,
	typename BufferSizeIteratorT,
	typename BufferOffsetT,
	typename BlevBufferSrcsOutItT,
	typename BlevBufferDstsOutItT,
	typename BlevBufferSizesOutItT,
	typename BlevBufferTileOffsetsOutItT,
	typename BlockOffsetT,
	typename BLevBufferOffsetTileState,
	typename BLevBlockOffsetTileState,
	bool IsMemcpy>
	class AgentBatchMemcpy
	{
	private:
	//---------------------------------------------------------------------
	// CONFIGS / CONSTANTS
	//---------------------------------------------------------------------
	// Tuning policy-based configurations (taken verbatim from AgentMemcpySmallBuffersPolicyT)
	static constexpr uint32_t BLOCK_THREADS = AgentMemcpySmallBuffersPolicyT::BLOCK_THREADS;
	static constexpr uint32_t BUFFERS_PER_THREAD = AgentMemcpySmallBuffersPolicyT::BUFFERS_PER_THREAD;
	static constexpr uint32_t TLEV_BYTES_PER_THREAD =
	AgentMemcpySmallBuffersPolicyT::TLEV_BYTES_PER_THREAD;
	static constexpr bool PREFER_POW2_BITS = AgentMemcpySmallBuffersPolicyT::PREFER_POW2_BITS;
	static constexpr uint32_t BLOCK_LEVEL_TILE_SIZE =
	AgentMemcpySmallBuffersPolicyT::BLOCK_LEVEL_TILE_SIZE;

	// Derived configs
	// Number of buffers that make up one tile (i.e., that one thread block is assigned)
	static constexpr uint32_t BUFFERS_PER_BLOCK = BUFFERS_PER_THREAD * BLOCK_THREADS;
	static constexpr uint32_t TLEV_BUFFERS_PER_THREAD = BUFFERS_PER_THREAD;
	static constexpr uint32_t BLEV_BUFFERS_PER_THREAD = BUFFERS_PER_THREAD;

	// Size thresholds of the size classes: a buffer whose size is strictly greater than
	// WARP_LEVEL_THRESHOLD is classified as (at least) WLEV and a buffer whose size is strictly
	// greater than BLOCK_LEVEL_THRESHOLD is classified as BLEV
	static constexpr uint32_t WARP_LEVEL_THRESHOLD = 128;
	static constexpr uint32_t BLOCK_LEVEL_THRESHOLD = 8 * 1024;

	// Selects between a blocked (true) and a striped (false) assignment of buffers to threads, see
	// BufferLoadT and PartitionBuffersBySize
	static constexpr uint32_t BUFFER_STABLE_PARTITION = false;

	// Constants
	// Indices of the size classes, as used to address the counters of VectorizedSizeClassCounterT
	enum : uint32_t
	{
	TLEV_SIZE_CLASS = 0,
	WLEV_SIZE_CLASS,
	BLEV_SIZE_CLASS,
	NUM_SIZE_CLASSES,
	};

	//---------------------------------------------------------------------
	// TYPE DECLARATIONS
	//---------------------------------------------------------------------
	/// Internal load/store type. For byte-wise memcpy (IsMemcpy == true) this is the value type of
	/// `char *`, i.e., a single-byte type; otherwise it is the value type of the input buffers
	using AliasT = typename ::cuda::std::conditional<
	IsMemcpy,
	std::iterator_traits<char *>,
	std::iterator_traits<cub::detail::value_t<InputBufferIt>>>::type::value_type;

	/// Types of the input and output buffers
	using InputBufferT = cub::detail::value_t<InputBufferIt>;
	using OutputBufferT = cub::detail::value_t<OutputBufferIt>;

	/// Type that has to be sufficiently large to hold any of the buffers' sizes.
	/// The BufferSizeIteratorT's value type must be convertible to this type.
	using BufferSizeT = cub::detail::value_t<BufferSizeIteratorT>;

	/// Type used to index into the tile of buffers that this thread block is assigned to.
	using BlockBufferOffsetT = uint16_t;

	/// Internal type used to index into the bytes of and represent size of a TLEV buffer
	using TLevBufferSizeT = uint16_t;

	/**
	* @brief Helper struct to simplify BlockExchange within a single four-byte word: it pairs two
	* 16-bit fields and is the item type of BlockExchangeTLevT.
	*/
	struct ZippedTLevByteAssignment
	{
	// The buffer id within this tile
	BlockBufferOffsetT tile_buffer_id;

	// Byte-offset within that buffer
	TLevBufferSizeT buffer_byte_offset;
	};

	/**
	* POD to keep track of <buffer_id, buffer_size> pairs after having partitioned this tile's
	* buffers by their size.
	*/
	struct BufferTuple
	{
	// Size is only valid (and relevant) for buffers that use thread-level collaboration
	TLevBufferSizeT size;

	// The buffer id relative to this tile (i.e., the buffer id within this tile)
	BlockBufferOffsetT buffer_id;
	};

	// Load buffers in a striped arrangement if we do not want to perform a stable partitioning into
	// small, medium, and large buffers, otherwise load them in a blocked arrangement
	using BufferLoadT =
	BlockLoad<BufferSizeT,
	static_cast<int32_t>(BLOCK_THREADS),
	static_cast<int32_t>(BUFFERS_PER_THREAD),
	BUFFER_STABLE_PARTITION ? BLOCK_LOAD_WARP_TRANSPOSE : BLOCK_LOAD_STRIPED>;

	// A vectorized counter that will count the number of buffers that fall into each of the
	// size-classes, where the size class represents the collaboration level that is required to
	// process a buffer. The collaboration level being either:
	//-> (1) TLEV (thread-level collaboration), requiring one or multiple threads but not a FULL warp
	// to collaborate
	//-> (2) WLEV (warp-level collaboration), requiring a full warp to collaborate on a buffer
	//-> (3) BLEV (block-level collaboration), requiring one or multiple thread blocks to collaborate
	// on a buffer
	using VectorizedSizeClassCounterT =
	BitPackedCounter<NUM_SIZE_CLASSES, BUFFERS_PER_BLOCK, PREFER_POW2_BITS>;

	// Block-level scan used to compute the write offsets
	using BlockSizeClassScanT =
	cub::BlockScan<VectorizedSizeClassCounterT, static_cast<int32_t>(BLOCK_THREADS)>;

	// Block-level scan over the per-buffer tile counts of the BLEV buffers (see EnqueueBLEVBuffers)
	using BlockBLevTileCountScanT = cub::BlockScan<BlockOffsetT, static_cast<int32_t>(BLOCK_THREADS)>;

	// Block-level run-length decode algorithm to evenly distribute work of all buffers requiring
	// thread-level collaboration
	using BlockRunLengthDecodeT =
	cub::BlockRunLengthDecode<BlockBufferOffsetT,
	static_cast<int32_t>(BLOCK_THREADS),
	static_cast<int32_t>(TLEV_BUFFERS_PER_THREAD),
	static_cast<int32_t>(TLEV_BYTES_PER_THREAD)>;

	// Block-wide exchange of <buffer id, byte offset> assignments for TLEV buffers
	using BlockExchangeTLevT = cub::BlockExchange<ZippedTLevByteAssignment,
	static_cast<int32_t>(BLOCK_THREADS),
	static_cast<int32_t>(TLEV_BYTES_PER_THREAD)>;

	// Prefix callback functors used by the tile-wide scans over the BLEV buffer counts and the BLEV
	// tile counts, respectively
	using BLevBuffScanPrefixCallbackOpT =
	TilePrefixCallbackOp<BufferOffsetT, Sum, BLevBufferOffsetTileState>;
	using BLevBlockScanPrefixCallbackOpT =
	TilePrefixCallbackOp<BlockOffsetT, Sum, BLevBlockOffsetTileState>;

	//-----------------------------------------------------------------------------
	// SHARED MEMORY DECLARATIONS
	//-----------------------------------------------------------------------------
	// Temporary storage of the agent. The members of the outer union alias each other, so only the
	// storage of one stage may be in use at any time; `blev_buffer_offset` lives outside the union
	// and therefore does not alias any of the stages.
	struct _TempStorage
	{
	union
	{
	// Stage 0: loading this tile's buffer sizes
	typename BufferLoadT::TempStorage load_storage;

	// Stage 1: histogram over the size classes in preparation for partitioning buffers by size
	typename BlockSizeClassScanT::TempStorage size_scan_storage;

	// Stage 2: Communicate the number of buffers requiring block-level collaboration
	typename BLevBuffScanPrefixCallbackOpT::TempStorage buffer_scan_callback;

	// Stage 3: batch memcpy buffers that require only thread-level collaboration
	struct
	{
	// The tile's <size, buffer id> pairs, partitioned by size class
	BufferTuple buffers_by_size_class[BUFFERS_PER_BLOCK];

	// Stage 3.1: Write buffers requiring block-level collaboration to queue
	union
	{
	struct
	{
	typename BLevBlockScanPrefixCallbackOpT::TempStorage block_scan_callback;
	typename BlockBLevTileCountScanT::TempStorage block_scan_storage;
	} blev;

	// Stage 3.3: run-length decode & block exchange for tlev
	// rld_state needs to be persistent across loop iterations (RunLengthDecode calls) and,
	// hence, cannot alias block_exchange_storage
	struct
	{
	typename BlockRunLengthDecodeT::TempStorage rld_state;
	typename BlockExchangeTLevT::TempStorage block_exchange_storage;
	} tlev;
	};
	} staged;
	};
	// NOTE(review): presumably the offset of this tile's first BLEV buffer; its use lies outside
	// this part of the file -- confirm
	BufferOffsetT blev_buffer_offset;
	};

	//-----------------------------------------------------------------------------
	// PUBLIC TYPE MEMBERS
	//-----------------------------------------------------------------------------
	public:
	/// Public storage type of the agent: `_TempStorage` wrapped in `Uninitialized`
	struct TempStorage : Uninitialized<_TempStorage>
	{};

	//-----------------------------------------------------------------------------
	// PRIVATE MEMBER FUNCTIONS
	//-----------------------------------------------------------------------------
	private:
	/// Shared storage reference
	_TempStorage &temp_storage;

	/**
	 * @brief Reads this tile's buffer sizes through `BufferLoadT` without any guards (i.e., no
	 * out-of-bounds checks are performed).
	 *
	 * @param tile_buffer_sizes_it Iterator to the buffer sizes of this tile
	 * @param buffer_sizes Receives the buffer sizes assigned to the calling thread
	 */
	__device__ __forceinline__ void
	LoadBufferSizesFullTile(BufferSizeIteratorT tile_buffer_sizes_it,
	                        BufferSizeT (&buffer_sizes)[BUFFERS_PER_THREAD])
	{
	  BufferLoadT tile_loader(temp_storage.load_storage);
	  tile_loader.Load(tile_buffer_sizes_it, buffer_sizes);
	}

	/**
	 * @brief Reads this tile's buffer sizes through `BufferLoadT`, reading at most \p num_valid
	 * items. Slots beyond \p num_valid are filled with a size of zero.
	 *
	 * @param tile_buffer_sizes_it Iterator to the buffer sizes of this tile
	 * @param buffer_sizes Receives the buffer sizes assigned to the calling thread
	 * @param num_valid Number of valid buffers in this tile
	 */
	__device__ __forceinline__ void
	LoadBufferSizesPartialTile(BufferSizeIteratorT tile_buffer_sizes_it,
	                           BufferSizeT (&buffer_sizes)[BUFFERS_PER_THREAD],
	                           BufferOffsetT num_valid)
	{
	  // A size of '0' marks an out-of-bounds slot; zero-sized buffers are simply ignored later on
	  constexpr BufferSizeT oob_fill_size = 0U;

	  BufferLoadT tile_loader(temp_storage.load_storage);
	  tile_loader.Load(tile_buffer_sizes_it, buffer_sizes, num_valid, oob_fill_size);
	}

	/**
	 * @brief Counts how many of the calling thread's buffers fall into each of the three
	 * size-classes (TLEV, WLEV, BLEV). Buffers of size zero are not counted.
	 *
	 * @param buffer_sizes The sizes of the buffers assigned to the calling thread
	 * @return The per-size-class counts, packed into one VectorizedSizeClassCounterT
	 */
	__device__ __forceinline__ VectorizedSizeClassCounterT
	GetBufferSizeClassHistogram(const BufferSizeT (&buffer_sizes)[BUFFERS_PER_THREAD])
	{
	  VectorizedSizeClassCounterT histogram{};

	#pragma unroll
	  for (uint32_t item = 0; item < BUFFERS_PER_THREAD; item++)
	  {
	    // The size class is the number of thresholds that the buffer's size exceeds
	    const uint32_t above_warp_level  = buffer_sizes[item] > WARP_LEVEL_THRESHOLD ? 1U : 0U;
	    const uint32_t above_block_level = buffer_sizes[item] > BLOCK_LEVEL_THRESHOLD ? 1U : 0U;
	    const uint32_t size_class        = above_warp_level + above_block_level;

	    // Empty buffers contribute a count of zero (the counter is still updated unconditionally)
	    const uint32_t count = buffer_sizes[item] > 0 ? 1U : 0U;
	    histogram.Add(size_class, count);
	  }

	  return histogram;
	}

	/**
	 * @brief Scatters the calling thread's non-empty buffers into the partition of their respective
	 * size class.
	 *
	 * @param buffer_sizes The sizes of the buffers assigned to the calling thread
	 * @param vectorized_offsets Per-size-class write offsets into \p buffers_by_size_class; the
	 * offset of a size class is advanced by one for every buffer written to it
	 * @param buffers_by_size_class Receives the <size, buffer id> pair of every non-empty buffer
	 */
	__device__ __forceinline__ void
	PartitionBuffersBySize(const BufferSizeT (&buffer_sizes)[BUFFERS_PER_THREAD],
	                       VectorizedSizeClassCounterT &vectorized_offsets,
	                       BufferTuple (&buffers_by_size_class)[BUFFERS_PER_BLOCK])
	{
	  // Distance between the ids of two consecutive buffers of a thread: one for a blocked
	  // arrangement (stable partitioning), BLOCK_THREADS for a striped arrangement
	  constexpr BlockBufferOffsetT BUFFER_STRIDE = BUFFER_STABLE_PARTITION
	                                                 ? static_cast<BlockBufferOffsetT>(1)
	                                                 : static_cast<BlockBufferOffsetT>(BLOCK_THREADS);

	  // Id (within this tile) of the thread's first buffer
	  BlockBufferOffsetT buffer_id = BUFFER_STABLE_PARTITION ? (BUFFERS_PER_THREAD * threadIdx.x)
	                                                         : (threadIdx.x);

	#pragma unroll
	  for (uint32_t item = 0; item < BUFFERS_PER_THREAD; item++, buffer_id += BUFFER_STRIDE)
	  {
	    // Empty buffers are not part of any partition
	    if (!(buffer_sizes[item] > 0))
	    {
	      continue;
	    }

	    // The size class is the number of thresholds that the buffer's size exceeds
	    const uint32_t above_warp_level  = buffer_sizes[item] > WARP_LEVEL_THRESHOLD ? 1U : 0U;
	    const uint32_t above_block_level = buffer_sizes[item] > BLOCK_LEVEL_THRESHOLD ? 1U : 0U;
	    const uint32_t size_class        = above_warp_level + above_block_level;

	    // Write the buffer to the next free slot of its size class and advance that slot
	    const uint32_t slot = vectorized_offsets.Get(size_class);
	    buffers_by_size_class[slot] = {static_cast<TLevBufferSizeT>(buffer_sizes[item]), buffer_id};
	    vectorized_offsets.Add(size_class, 1U);
	  }
	}

	/**
	* @brief Read in all the buffers that require block-level collaboration and put them to a queue
	* that will get picked up in a separate, subsequent kernel.
	*
	* @param buffers_by_size_class Entries of the BLEV buffers; entry `j` is read for every
	* `j < num_blev_buffers` (presumably points at the start of the BLEV partition -- verify against
	* the caller)
	* @param tile_buffer_srcs Iterator to this tile's source buffers
	* @param tile_buffer_dsts Iterator to this tile's destination buffers
	* @param tile_buffer_sizes Iterator to this tile's buffer sizes
	* @param num_blev_buffers Number of BLEV buffers of this tile
	* @param tile_buffer_offset Offset into the `blev_buffer_*` outputs at which this tile's BLEV
	* buffers are written
	* @param tile_id The id of this tile; tile 0 seeds the tile-wide scan
	*/
	__device__ __forceinline__ void EnqueueBLEVBuffers(BufferTuple *buffers_by_size_class,
	InputBufferIt tile_buffer_srcs,
	OutputBufferIt tile_buffer_dsts,
	BufferSizeIteratorT tile_buffer_sizes,
	BlockBufferOffsetT num_blev_buffers,
	BufferOffsetT tile_buffer_offset,
	BufferOffsetT tile_id)
	{
	// Per-buffer tile counts on input to the scan, per-buffer tile offsets after the scan
	BlockOffsetT block_offset[BLEV_BUFFERS_PER_THREAD];
	// Read in the BLEV buffer partition (i.e., the buffers that require block-level collaboration)
	// Each thread handles BLEV_BUFFERS_PER_THREAD consecutive entries
	uint32_t blev_buffer_offset = threadIdx.x * BLEV_BUFFERS_PER_THREAD;
	#pragma unroll
	for (uint32_t i = 0; i < BLEV_BUFFERS_PER_THREAD; i++)
	{
	if (blev_buffer_offset < num_blev_buffers)
	{
	BlockBufferOffsetT tile_buffer_id = buffers_by_size_class[blev_buffer_offset].buffer_id;
	// Number of tiles of size BLOCK_LEVEL_TILE_SIZE needed for this buffer (rounded up)
	block_offset[i] = CUB_QUOTIENT_CEILING(tile_buffer_sizes[tile_buffer_id],
	BLOCK_LEVEL_TILE_SIZE);
	}
	else
	{
	// Out-of-bounds buffers are assigned a tile count of '0'
	block_offset[i] = 0U;
	}
	blev_buffer_offset++;
	}

	// Turn the tile counts into exclusive prefix sums
	if (tile_id == 0)
	{
	// The first tile has no predecessor: scan locally and publish the block-wide aggregate as
	// the inclusive value of tile 0
	BlockOffsetT block_aggregate;
	BlockBLevTileCountScanT(temp_storage.staged.blev.block_scan_storage)
	.ExclusiveSum(block_offset, block_offset, block_aggregate);
	if (threadIdx.x == 0)
	{
	blev_block_scan_state.SetInclusive(0, block_aggregate);
	}
	}
	else
	{
	// Subsequent tiles: the prefix callback operator supplies the block-wide prefix for the scan
	BLevBlockScanPrefixCallbackOpT blev_tile_prefix_op(
	blev_block_scan_state,
	temp_storage.staged.blev.block_scan_callback,
	Sum(),
	tile_id);
	BlockBLevTileCountScanT(temp_storage.staged.blev.block_scan_storage)
	.ExclusiveSum(block_offset, block_offset, blev_tile_prefix_op);
	}
	CTA_SYNC();

	// Read in the BLEV buffer partition (i.e., the buffers that require block-level collaboration)
	// and write source, destination, size, and tile offset of each BLEV buffer to the
	// `blev_buffer_*` outputs (members declared outside this part of the file)
	blev_buffer_offset = threadIdx.x * BLEV_BUFFERS_PER_THREAD;
	#pragma unroll
	for (uint32_t i = 0; i < BLEV_BUFFERS_PER_THREAD; i++)
	{
	if (blev_buffer_offset < num_blev_buffers)
	{
	BlockBufferOffsetT tile_buffer_id = buffers_by_size_class[blev_buffer_offset].buffer_id;
	blev_buffer_srcs[tile_buffer_offset + blev_buffer_offset] =
	tile_buffer_srcs[tile_buffer_id];
	blev_buffer_dsts[tile_buffer_offset + blev_buffer_offset] =
	tile_buffer_dsts[tile_buffer_id];
	blev_buffer_sizes[tile_buffer_offset + blev_buffer_offset] =
	tile_buffer_sizes[tile_buffer_id];
	blev_buffer_tile_offsets[tile_buffer_offset + blev_buffer_offset] = block_offset[i];
	// Only advanced while in bounds; once an entry is out of bounds, so are all following ones
	blev_buffer_offset++;
	}
	}
	}

	/**
	 * @brief Copies every buffer of this tile that was assigned to the warp-level (WLEV) size class
	 * to its destination buffer.
	 *
	 * Warp `w` handles the partition entries `w`, `w + WARPS_PER_BLOCK`, `w + 2 * WARPS_PER_BLOCK`,
	 * and so on; each entry is copied with a single `copy_items` call instantiated for
	 * CUB_PTX_WARP_THREADS threads.
	 *
	 * @param buffers_by_size_class Start of the WLEV partition; each entry's `buffer_id` is used to
	 * index the `tile_buffer_*` iterators
	 * @param tile_buffer_srcs Iterator over this tile's source buffers
	 * @param tile_buffer_dsts Iterator over this tile's destination buffers
	 * @param tile_buffer_sizes Iterator over this tile's buffer sizes
	 * @param num_wlev_buffers Number of valid entries in the WLEV partition
	 */
	__device__ __forceinline__ void BatchMemcpyWLEVBuffers(BufferTuple *buffers_by_size_class,
	                                                       InputBufferIt tile_buffer_srcs,
	                                                       OutputBufferIt tile_buffer_dsts,
	                                                       BufferSizeIteratorT tile_buffer_sizes,
	                                                       BlockBufferOffsetT num_wlev_buffers)
	{
	  // Number of warps in the block, i.e. the distance between two entries handled by one warp
	  constexpr uint32_t WARP_STRIDE = BLOCK_THREADS / CUB_PTX_WARP_THREADS;

	  // Index of the warp this thread belongs to; also the first partition entry it works on
	  const int32_t first_entry = threadIdx.x / CUB_PTX_WARP_THREADS;

	  BlockBufferOffsetT entry = first_entry;
	  while (entry < num_wlev_buffers)
	  {
	    const auto tile_buffer_id = buffers_by_size_class[entry].buffer_id;
	    copy_items<IsMemcpy, CUB_PTX_WARP_THREADS, InputBufferT, OutputBufferT, BufferSizeT>(
	      tile_buffer_srcs[tile_buffer_id],
	      tile_buffer_dsts[tile_buffer_id],
	      tile_buffer_sizes[tile_buffer_id]);
	    entry += WARP_STRIDE;
	  }
	}

	/**
	 * @brief Copies every buffer of this tile that was assigned to the thread-level (TLEV) size
	 * class to its destination buffer.
	 *
	 * The ids and sizes of the TLEV buffers are handed to BlockRunLengthDecode as (value,
	 * run-length) pairs. The decoded sequence is consumed in windows of
	 * TLEV_BYTES_PER_THREAD * BLOCK_THREADS items. Each decoded item names one (tile buffer id,
	 * byte offset) pair; the owning thread reads that item from the source buffer and writes it to
	 * the destination buffer.
	 *
	 * Contains block-wide collectives and a block-wide barrier inside the window loop.
	 *
	 * @param buffers_by_size_class Start of the TLEV partition; each entry provides `buffer_id` and
	 * `size`
	 * @param tile_buffer_srcs Iterator over this tile's source buffers
	 * @param tile_buffer_dsts Iterator over this tile's destination buffers
	 * @param num_tlev_buffers Number of valid entries in the TLEV partition
	 */
	__device__ __forceinline__ void BatchMemcpyTLEVBuffers(BufferTuple *buffers_by_size_class,
	                                                       InputBufferIt tile_buffer_srcs,
	                                                       OutputBufferIt tile_buffer_dsts,
	                                                       BlockBufferOffsetT num_tlev_buffers)
	{
	  // Currently we do not go over the TLEV buffers in multiple iterations, so we need to make sure
	  // we are able to be covered for the case that all our buffers are TLEV buffers
	  static_assert(TLEV_BUFFERS_PER_THREAD >= BUFFERS_PER_THREAD,
	                "Unsupported confiugraiton: The number of 'thread-level buffers' must be at "
	                "least as large as the number of overall buffers being processed by each "
	                "thread.");

	  // Ids (buffer index within this tile) and sizes of the TLEV buffers assigned to this thread
	  BlockBufferOffsetT tlev_buffer_ids[TLEV_BUFFERS_PER_THREAD];
	  TLevBufferSizeT tlev_buffer_sizes[TLEV_BUFFERS_PER_THREAD];

	  // Assign TLEV buffers in a blocked arrangement (each thread is assigned consecutive TLEV
	  // buffers). Slots past the end of the partition get a size of 0 so that they contribute no
	  // bytes; their id is zeroed as well so that no indeterminate value is passed to the decoder.
	  uint32_t tlev_buffer_offset = threadIdx.x * TLEV_BUFFERS_PER_THREAD;
	#pragma unroll
	  for (uint32_t i = 0; i < TLEV_BUFFERS_PER_THREAD; i++)
	  {
	    tlev_buffer_ids[i]   = 0;
	    tlev_buffer_sizes[i] = 0;
	    if (tlev_buffer_offset < num_tlev_buffers)
	    {
	      tlev_buffer_ids[i]   = buffers_by_size_class[tlev_buffer_offset].buffer_id;
	      tlev_buffer_sizes[i] = buffers_by_size_class[tlev_buffer_offset].size;
	    }
	    tlev_buffer_offset++;
	  }

	  // Evenly distribute all the bytes that have to be copied from all the buffers that require
	  // thread-level collaboration using BlockRunLengthDecode
	  uint32_t num_total_tlev_bytes = 0U;
	  BlockRunLengthDecodeT block_run_length_decode(temp_storage.staged.tlev.rld_state,
	                                                tlev_buffer_ids,
	                                                tlev_buffer_sizes,
	                                                num_total_tlev_bytes);

	  // Number of items decoded (and copied) by the whole block per iteration
	  constexpr uint32_t WINDOW_SIZE = (TLEV_BYTES_PER_THREAD * BLOCK_THREADS);

	  // Run-length decode the buffers' sizes into a window buffer of limited size. This is repeated
	  // until we were able to cover all the bytes of TLEV buffers
	  uint32_t decoded_window_offset = 0U;
	  while (decoded_window_offset < num_total_tlev_bytes)
	  {
	    BlockBufferOffsetT buffer_id[TLEV_BYTES_PER_THREAD];
	    TLevBufferSizeT buffer_byte_offset[TLEV_BYTES_PER_THREAD];

	    // Now we have a balanced assignment: buffer_id[i] will hold the tile's buffer id and
	    // buffer_byte_offset[i] that buffer's byte that this thread is supposed to copy
	    block_run_length_decode.RunLengthDecode(buffer_id, buffer_byte_offset, decoded_window_offset);

	    // Zip from SoA to AoS
	    ZippedTLevByteAssignment zipped_byte_assignment[TLEV_BYTES_PER_THREAD];
	#pragma unroll
	    for (int32_t i = 0; i < TLEV_BYTES_PER_THREAD; i++)
	    {
	      zipped_byte_assignment[i] = {buffer_id[i], buffer_byte_offset[i]};
	    }

	    // Exchange from blocked to striped arrangement for coalesced memory reads and writes
	    BlockExchangeTLevT(temp_storage.staged.tlev.block_exchange_storage)
	      .BlockedToStriped(zipped_byte_assignment, zipped_byte_assignment);

	    // A window is full when all of its WINDOW_SIZE items lie within the decoded range. This
	    // includes the window that ends exactly at num_total_tlev_bytes, hence '<='.
	    const bool is_full_window = decoded_window_offset + WINDOW_SIZE <= num_total_tlev_bytes;
	    if (is_full_window)
	    {
	      // Every item of this thread is valid: issue all reads first, then all writes
	      AliasT src_byte[TLEV_BYTES_PER_THREAD];
	#pragma unroll
	      for (int32_t i = 0; i < TLEV_BYTES_PER_THREAD; i++)
	      {
	        src_byte[i] = read_item<IsMemcpy, AliasT, InputBufferT>(
	          tile_buffer_srcs[zipped_byte_assignment[i].tile_buffer_id],
	          zipped_byte_assignment[i].buffer_byte_offset);
	      }
	#pragma unroll
	      for (int32_t i = 0; i < TLEV_BYTES_PER_THREAD; i++)
	      {
	        write_item<IsMemcpy, AliasT, OutputBufferT>(
	          tile_buffer_dsts[zipped_byte_assignment[i].tile_buffer_id],
	          zipped_byte_assignment[i].buffer_byte_offset,
	          src_byte[i]);
	      }
	    }
	    else
	    {
	      // Last, partially filled window: in the striped arrangement this thread's i-th item has
	      // the absolute index decoded_window_offset + threadIdx.x + i * BLOCK_THREADS; only items
	      // below num_total_tlev_bytes are copied
	      uint32_t absolute_tlev_byte_offset = decoded_window_offset + threadIdx.x;
	#pragma unroll
	      for (int32_t i = 0; i < TLEV_BYTES_PER_THREAD; i++)
	      {
	        if (absolute_tlev_byte_offset < num_total_tlev_bytes)
	        {
	          const AliasT src_byte = read_item<IsMemcpy, AliasT, InputBufferT>(
	            tile_buffer_srcs[zipped_byte_assignment[i].tile_buffer_id],
	            zipped_byte_assignment[i].buffer_byte_offset);
	          write_item<IsMemcpy, AliasT, OutputBufferT>(
	            tile_buffer_dsts[zipped_byte_assignment[i].tile_buffer_id],
	            zipped_byte_assignment[i].buffer_byte_offset,
	            src_byte);
	        }
	        absolute_tlev_byte_offset += BLOCK_THREADS;
	      }
	    }

	    decoded_window_offset += WINDOW_SIZE;

	    // Ensure all threads finished collaborative BlockExchange so temporary storage can be reused
	    // with next iteration
	    CTA_SYNC();
	  }
	}

	//-----------------------------------------------------------------------------
	// PUBLIC MEMBER FUNCTIONS
	//-----------------------------------------------------------------------------
	public:
	/**
	* @brief Processes the tile of buffers identified by `tile_id`.
	*
	* Steps, in order: load the tile's buffer sizes, count the buffers per size class, turn the
	* counts into per-thread write offsets, publish/obtain the running number of BLEV buffers,
	* partition the buffers by size class, enqueue the BLEV buffers, and finally copy the WLEV and
	* TLEV buffers.
	*
	* The function contains block-wide barriers (CTA_SYNC) and block-wide collectives. The barriers
	* separate stages that reuse the same temporary storage, so their position relative to the
	* surrounding statements matters.
	*
	* @param tile_id Id of the tile to process; the tile starts at buffer
	* `tile_id * BUFFERS_PER_BLOCK`
	*/
	__device__ __forceinline__ void ConsumeTile(BufferOffsetT tile_id)
	{
	// Offset into this tile's buffers
	BufferOffsetT buffer_offset = tile_id * BUFFERS_PER_BLOCK;

	// Indicates whether all of this tile's items are within bounds.
	// Note the strict '<': a tile that ends exactly at num_buffers takes the partial-tile path.
	bool is_full_tile = buffer_offset + BUFFERS_PER_BLOCK < num_buffers;

	// Load the buffer sizes of this tile's buffers
	BufferSizeIteratorT tile_buffer_sizes_it = buffer_sizes_it + buffer_offset;
	BufferSizeT buffer_sizes[BUFFERS_PER_THREAD];
	if (is_full_tile)
	{
	LoadBufferSizesFullTile(tile_buffer_sizes_it, buffer_sizes);
	}
	else
	{
	// The last argument is the number of buffers remaining from this tile's offset onwards
	LoadBufferSizesPartialTile(tile_buffer_sizes_it, buffer_sizes, num_buffers - buffer_offset);
	}

	// Ensure we can repurpose the BlockLoad's temporary storage
	CTA_SYNC();

	// Count how many buffers fall into each size-class
	VectorizedSizeClassCounterT size_class_histogram = GetBufferSizeClassHistogram(buffer_sizes);

	// Compute the prefix sum over the histogram: size_class_histogram is overwritten with this
	// thread's exclusive prefix, size_class_agg receives the block-wide total per size class
	VectorizedSizeClassCounterT size_class_agg = {};
	BlockSizeClassScanT(temp_storage.size_scan_storage)
	.ExclusiveSum(size_class_histogram, size_class_histogram, size_class_agg);

	// Ensure we can repurpose the scan's temporary storage for scattering the buffer ids
	CTA_SYNC();

	// Factor in the per-size-class counts / offsets
	// That is, WLEV buffer offset has to be offset by the TLEV buffer count and BLEV buffer offset
	// has to be offset by the TLEV+WLEV buffer count
	uint32_t buffer_count = 0U;
	for (uint32_t i = 0; i < NUM_SIZE_CLASSES; i++)
	{
	size_class_histogram.Add(i, buffer_count);
	buffer_count += size_class_agg.Get(i);
	}

	// Signal the number of BLEV buffers we're planning to write out
	BufferOffsetT buffer_exclusive_prefix = 0;
	if (tile_id == 0)
	{
	// The first tile has no predecessor: thread 0 publishes this tile's BLEV buffer count as the
	// inclusive value and the exclusive prefix stays 0
	if (threadIdx.x == 0)
	{
	blev_buffer_scan_state.SetInclusive(tile_id, size_class_agg.Get(BLEV_SIZE_CLASS));
	}
	buffer_exclusive_prefix = 0;
	}
	else
	{
	BLevBuffScanPrefixCallbackOpT blev_buffer_prefix_op(blev_buffer_scan_state,
	temp_storage.buffer_scan_callback,
	Sum(),
	tile_id);

	// Signal our partial prefix and wait for the inclusive prefix of previous tiles.
	// Only the first CUB_PTX_WARP_THREADS threads invoke the callback; all other threads keep a
	// prefix of 0.
	if (threadIdx.x < CUB_PTX_WARP_THREADS)
	{
	buffer_exclusive_prefix = blev_buffer_prefix_op(size_class_agg.Get(BLEV_SIZE_CLASS));
	}
	}
	// Thread 0 stores its prefix in temp_storage.blev_buffer_offset; the value is read back
	// further down, after block-wide barriers, when enqueueing the BLEV buffers
	if (threadIdx.x == 0)
	{
	temp_storage.blev_buffer_offset = buffer_exclusive_prefix;
	}

	// Ensure the prefix callback has finished using its temporary storage and that it can be reused
	// in the next stage
	CTA_SYNC();

	// Scatter the buffers into one of the three partitions (TLEV, WLEV, BLEV) depending on their
	// size
	PartitionBuffersBySize(buffer_sizes,
	size_class_histogram,
	temp_storage.staged.buffers_by_size_class);

	// Ensure all buffers have been partitioned by their size class AND
	// ensure that blev_buffer_offset has been written to shared memory
	CTA_SYNC();

	// TODO: think about prefetching tile_buffer_{srcs,dsts} into shmem
	InputBufferIt tile_buffer_srcs = input_buffer_it + buffer_offset;
	OutputBufferIt tile_buffer_dsts = output_buffer_it + buffer_offset;
	BufferSizeIteratorT tile_buffer_sizes = buffer_sizes_it + buffer_offset;

	// Enqueue the block-level buffers (they are not copied here). The BLEV partition starts after
	// the TLEV and WLEV partitions within buffers_by_size_class.
	EnqueueBLEVBuffers(
	&temp_storage.staged.buffers_by_size_class[size_class_agg.Get(TLEV_SIZE_CLASS) +
	size_class_agg.Get(WLEV_SIZE_CLASS)],
	tile_buffer_srcs,
	tile_buffer_dsts,
	tile_buffer_sizes,
	size_class_agg.Get(BLEV_SIZE_CLASS),
	temp_storage.blev_buffer_offset,
	tile_id);

	// Ensure we can repurpose the temporary storage required by EnqueueBLEVBuffers
	CTA_SYNC();

	// Copy warp-level buffers; the WLEV partition starts right after the TLEV partition
	BatchMemcpyWLEVBuffers(
	&temp_storage.staged.buffers_by_size_class[size_class_agg.Get(TLEV_SIZE_CLASS)],
	tile_buffer_srcs,
	tile_buffer_dsts,
	tile_buffer_sizes,
	size_class_agg.Get(WLEV_SIZE_CLASS));

	// Perform batch memcpy for all the buffers that require thread-level collaboration; the TLEV
	// partition starts at the beginning of buffers_by_size_class
	uint32_t num_tlev_buffers = size_class_agg.Get(TLEV_SIZE_CLASS);
	BatchMemcpyTLEVBuffers(temp_storage.staged.buffers_by_size_class,
	tile_buffer_srcs,
	tile_buffer_dsts,
	num_tlev_buffers);
	}

	//-----------------------------------------------------------------------------
	// CONSTRUCTOR
	//-----------------------------------------------------------------------------
	/**
	* @brief Constructs the agent. Every argument is stored in the member of the same name; the
	* `temp_storage` member is initialized from `temp_storage.Alias()`. The constructor body is
	* empty.
	*
	* @param temp_storage Temporary storage whose `Alias()` is bound to the agent
	* @param input_buffer_it Iterator providing the pointers to the source memory buffers
	* @param output_buffer_it Iterator providing the pointers to the destination memory buffers
	* @param buffer_sizes_it Iterator providing the number of bytes to be copied for each pair of
	* buffers
	* @param num_buffers The total number of buffer pairs
	* @param blev_buffer_srcs Output iterator to which the source pointers of the BLEV buffers are
	* written
	* @param blev_buffer_dsts Output iterator to which the destination pointers of the BLEV buffers
	* are written
	* @param blev_buffer_sizes Output iterator to which the number of bytes to be copied of the BLEV
	* buffers are written
	* @param blev_buffer_tile_offsets Output iterator to which the mapping of tiles to BLEV buffers
	* is written
	* @param blev_buffer_scan_state Tile state of the single-pass prefix scan over the number of
	* BLEV buffers
	* @param blev_block_scan_state Tile state of the single-pass prefix scan over tiles of BLEV
	* buffers
	*/
	__device__ __forceinline__ AgentBatchMemcpy(TempStorage &temp_storage,
	InputBufferIt input_buffer_it,
	OutputBufferIt output_buffer_it,
	BufferSizeIteratorT buffer_sizes_it,
	BufferOffsetT num_buffers,
	BlevBufferSrcsOutItT blev_buffer_srcs,
	BlevBufferDstsOutItT blev_buffer_dsts,
	BlevBufferSizesOutItT blev_buffer_sizes,
	BlevBufferTileOffsetsOutItT blev_buffer_tile_offsets,
	BLevBufferOffsetTileState blev_buffer_scan_state,
	BLevBlockOffsetTileState blev_block_scan_state)
	: temp_storage(temp_storage.Alias())
	, input_buffer_it(input_buffer_it)
	, output_buffer_it(output_buffer_it)
	, buffer_sizes_it(buffer_sizes_it)
	, num_buffers(num_buffers)
	, blev_buffer_srcs(blev_buffer_srcs)
	, blev_buffer_dsts(blev_buffer_dsts)
	, blev_buffer_sizes(blev_buffer_sizes)
	, blev_buffer_tile_offsets(blev_buffer_tile_offsets)
	, blev_buffer_scan_state(blev_buffer_scan_state)
	, blev_block_scan_state(blev_block_scan_state)
	{}

	private:
	// Iterator providing the pointers to the source memory buffers
	InputBufferIt input_buffer_it;
	// Iterator providing the pointers to the destination memory buffers
	OutputBufferIt output_buffer_it;
	// Iterator providing the number of bytes to be copied for each pair of buffers
	BufferSizeIteratorT buffer_sizes_it;
	// The total number of buffer pairs
	BufferOffsetT num_buffers;
	// Output iterator to which the source pointers of the BLEV buffers are written
	BlevBufferSrcsOutItT blev_buffer_srcs;
	// Output iterator to which the destination pointers of the BLEV buffers are written
	BlevBufferDstsOutItT blev_buffer_dsts;
	// Output iterator to which the number of bytes to be copied for each BLEV buffer is written
	BlevBufferSizesOutItT blev_buffer_sizes;
	// Output iterator to which the mapping of tiles to BLEV buffers is written (EnqueueBLEVBuffers
	// stores each BLEV buffer's tile offset here)
	BlevBufferTileOffsetsOutItT blev_buffer_tile_offsets;
	// The single-pass prefix scan's tile state used for tracking the prefix sum over the number of
	// BLEV buffers (updated in ConsumeTile)
	BLevBufferOffsetTileState blev_buffer_scan_state;
	// The single-pass prefix scan's tile state used for tracking the prefix sum over tiles of BLEV
	// buffers (updated in EnqueueBLEVBuffers)
	BLevBlockOffsetTileState blev_block_scan_state;
	};

	} // namespace detail

	CUB_NAMESPACE_END