// cub/device/dispatch/dispatch_batch_memcpy.cuh
/******************************************************************************
* Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* cub::DispatchBatchMemcpy provides device-wide, parallel operations for copying data from a number
* of given source buffers to their corresponding destination buffer.
*/
#pragma once
#include <cub/agent/agent_batch_memcpy.cuh>
#include <cub/agent/single_pass_scan_operators.cuh>
#include <cub/config.cuh>
#include <cub/detail/temporary_storage.cuh>
#include <cub/thread/thread_search.cuh>
#include <cub/util_debug.cuh>
#include <cub/util_ptx.cuh>
#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
#include <cuda/std/type_traits>
#include <cstdint>
CUB_NAMESPACE_BEGIN
namespace detail
{
/**
 * Parameterizable tuning policy type for the multi-block kernel that copies "large"
 * buffers, i.e., buffers whose copy is performed collaboratively by one or more full
 * thread blocks (see MultiBlockBatchMemcpyKernel).
 *
 * @tparam _BLOCK_THREADS Number of threads per thread block
 * @tparam _BYTES_PER_THREAD Number of bytes copied by each thread per tile
 */
template <uint32_t _BLOCK_THREADS, uint32_t _BYTES_PER_THREAD>
struct AgentBatchMemcpyLargeBuffersPolicy
{
  /// Threads per thread block
  static constexpr uint32_t BLOCK_THREADS = _BLOCK_THREADS;
  /// The number of bytes each thread copies
  static constexpr uint32_t BYTES_PER_THREAD = _BYTES_PER_THREAD;
};
/**
 * Initialization kernel for tile status initialization (multi-block).
 *
 * Resets the tile status of both single-pass scan tile states so that the subsequent
 * decoupled-lookback prefix scans start from a clean slate. Expected to be launched
 * with enough threads so that each thread can initialize one of the num_tiles entries.
 */
template <typename BufferOffsetScanTileStateT,
          typename BlockOffsetScanTileStateT,
          typename TileOffsetT>
__global__ void InitTileStateKernel(BufferOffsetScanTileStateT buffer_offset_scan_tile_state,
                                    BlockOffsetScanTileStateT block_offset_scan_tile_state,
                                    TileOffsetT num_tiles)
{
  // The two tile-state descriptors are independent of each other, so the order of
  // initialization is irrelevant
  block_offset_scan_tile_state.InitializeStatus(num_tiles);
  buffer_offset_scan_tile_state.InitializeStatus(num_tiles);
}
/**
 * Kernel that copies buffers that need to be copied by at least one (and potentially many) thread
 * blocks.
 *
 * Each thread block repeatedly grabs a tile of TILE_SIZE bytes (striding over tile ids by
 * gridDim.x), binary-searches which block-level buffer that tile falls into, and copies the
 * tile's chunk of that buffer.
 */
template <typename ChainedPolicyT,
          typename BufferOffsetT,
          typename InputBufferIt,
          typename OutputBufferIt,
          typename BufferSizeIteratorT,
          typename BufferTileOffsetItT,
          typename TileT,
          typename TileOffsetT,
          bool IsMemcpy>
__launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT::BLOCK_THREADS))
  __global__ void MultiBlockBatchMemcpyKernel(InputBufferIt input_buffer_it,
                                              OutputBufferIt output_buffer_it,
                                              BufferSizeIteratorT buffer_sizes,
                                              BufferTileOffsetItT buffer_tile_offsets,
                                              TileT buffer_offset_tile,
                                              TileOffsetT last_tile_offset)
{
  using StatusWord    = typename TileT::StatusWord;
  using ActivePolicyT = typename ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT;
  using BufferSizeT   = cub::detail::value_t<BufferSizeIteratorT>;
  /// Internal load/store type. For byte-wise memcpy, a single-byte type
  using AliasT = typename ::cuda::std::conditional<
    IsMemcpy,
    std::iterator_traits<char *>,
    std::iterator_traits<cub::detail::value_t<InputBufferIt>>>::type::value_type;
  /// Types of the input and output buffers
  using InputBufferT  = cub::detail::value_t<InputBufferIt>;
  using OutputBufferT = cub::detail::value_t<OutputBufferIt>;

  constexpr uint32_t BLOCK_THREADS    = ActivePolicyT::BLOCK_THREADS;
  constexpr uint32_t ITEMS_PER_THREAD = ActivePolicyT::BYTES_PER_THREAD;

  // Number of bytes the thread block processes per iteration of the tile loop
  constexpr BufferSizeT TILE_SIZE = static_cast<BufferSizeT>(BLOCK_THREADS * ITEMS_PER_THREAD);

  // The number of buffers requiring block-level collaboration, read from the buffer-offset scan
  // tile state at the given tile index
  BufferOffsetT num_blev_buffers = buffer_offset_tile.LoadValid(last_tile_offset);

  uint32_t tile_id = blockIdx.x;

  // No block-level buffers => we're done here
  if (num_blev_buffers == 0)
  {
    return;
  }

  // While there's still tiles of bytes from block-level buffers to be copied
  do
  {
    __shared__ BufferOffsetT block_buffer_id;

    // Make sure thread 0 does not overwrite the buffer id before other threads have finished with
    // the prior iteration of the loop
    CTA_SYNC();

    // Binary search the buffer that this tile belongs to
    if (threadIdx.x == 0)
      block_buffer_id = UpperBound(buffer_tile_offsets, num_blev_buffers, tile_id) - 1;

    // Make sure thread 0 has written the buffer this thread block is assigned to
    CTA_SYNC();

    const BufferOffsetT buffer_id = block_buffer_id;

    // The relative offset of this tile within the buffer it's assigned to
    BufferSizeT tile_offset_within_buffer =
      static_cast<BufferSizeT>(tile_id - buffer_tile_offsets[buffer_id]) * TILE_SIZE;

    // If the tile has already reached beyond the work of the end of the last buffer.
    // NOTE: tile_id is uniform across the thread block, so all threads take this return together
    // and no thread can be left waiting at the CTA_SYNC above.
    if (buffer_id >= num_blev_buffers - 1 && tile_offset_within_buffer > buffer_sizes[buffer_id])
    {
      return;
    }

    // Tiny remainders are copied without vectorizing loads
    if (buffer_sizes[buffer_id] - tile_offset_within_buffer <= 32)
    {
      // Byte-wise, block-strided copy of the (at most 32-byte) remainder
      BufferSizeT thread_offset = tile_offset_within_buffer + threadIdx.x;
      for (int i = 0; i < ITEMS_PER_THREAD; i++)
      {
        if (thread_offset < buffer_sizes[buffer_id])
        {
          const auto value = read_item<IsMemcpy, AliasT, InputBufferT>(input_buffer_it[buffer_id],
                                                                       thread_offset);
          write_item<IsMemcpy, AliasT, OutputBufferT>(output_buffer_it[buffer_id],
                                                      thread_offset,
                                                      value);
        }
        thread_offset += BLOCK_THREADS;
      }
    }
    else
    {
      // Copy this tile's chunk of the buffer: a full tile, or the buffer's remaining bytes,
      // whichever is smaller
      copy_items<IsMemcpy, BLOCK_THREADS, InputBufferT, OutputBufferT, BufferSizeT>(
        input_buffer_it[buffer_id],
        output_buffer_it[buffer_id],
        (cub::min)(buffer_sizes[buffer_id] - tile_offset_within_buffer, TILE_SIZE),
        tile_offset_within_buffer);
    }

    // Grid-stride to the next tile of bytes
    tile_id += gridDim.x;
  } while (true);
}
/**
 * @brief Kernel that copies data from a batch of given source buffers to their corresponding
 * destination buffer. If a buffer's size is too large to be copied by a single thread block, that
 * buffer is put into a queue of buffers that will get picked up later on, where multiple blocks
 * collaborate on each of these buffers. All other buffers get copied straight away.
 *
 * @param input_buffer_it [in] Iterator providing the pointers to the source memory buffers
 * @param output_buffer_it [in] Iterator providing the pointers to the destination memory buffers
 * @param buffer_sizes [in] Iterator providing the number of bytes to be copied for each pair of
 * buffers
 * @param num_buffers [in] The total number of buffer pairs
 * @param blev_buffer_srcs [out] The source pointers of buffers that require block-level
 * collaboration
 * @param blev_buffer_dsts [out] The destination pointers of buffers that require block-level
 * collaboration
 * @param blev_buffer_sizes [out] The sizes of buffers that require block-level collaboration
 * @param blev_buffer_tile_offsets [out] Prefix sum of the number of thread blocks assigned to
 * each buffer requiring block-level collaboration
 * @param blev_buffer_scan_state [in,out] Tile states for the prefix sum over the count of buffers
 * requiring block-level collaboration (to "stream compact" (aka "select") BLEV-buffers)
 * @param blev_block_scan_state [in,out] Tile states for the prefix sum over the number of thread
 * blocks getting assigned to each buffer that requires block-level collaboration
 */
template <typename ChainedPolicyT,
          typename InputBufferIt,
          typename OutputBufferIt,
          typename BufferSizeIteratorT,
          typename BufferOffsetT,
          typename BlevBufferSrcsOutItT,
          typename BlevBufferDstsOutItT,
          typename BlevBufferSizesOutItT,
          typename BlevBufferTileOffsetsOutItT,
          typename BlockOffsetT,
          typename BLevBufferOffsetTileState,
          typename BLevBlockOffsetTileState,
          bool IsMemcpy>
__launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentSmallBufferPolicyT::BLOCK_THREADS))
  __global__ void BatchMemcpyKernel(InputBufferIt input_buffer_it,
                                    OutputBufferIt output_buffer_it,
                                    BufferSizeIteratorT buffer_sizes,
                                    BufferOffsetT num_buffers,
                                    BlevBufferSrcsOutItT blev_buffer_srcs,
                                    BlevBufferDstsOutItT blev_buffer_dsts,
                                    BlevBufferSizesOutItT blev_buffer_sizes,
                                    BlevBufferTileOffsetsOutItT blev_buffer_tile_offsets,
                                    BLevBufferOffsetTileState blev_buffer_scan_state,
                                    BLevBlockOffsetTileState blev_block_scan_state)
{
  // Internal type used for storing a buffer's size
  using BufferSizeT = cub::detail::value_t<BufferSizeIteratorT>;

  // Tuning policy selected for the current compilation pass' target architecture
  using AgentBatchMemcpyPolicyT = typename ChainedPolicyT::ActivePolicy::AgentSmallBufferPolicyT;

  // Block-level agent implementing the partition-and-copy logic
  using AgentBatchMemcpyT = AgentBatchMemcpy<AgentBatchMemcpyPolicyT,
                                             InputBufferIt,
                                             OutputBufferIt,
                                             BufferSizeIteratorT,
                                             BufferOffsetT,
                                             BlevBufferSrcsOutItT,
                                             BlevBufferDstsOutItT,
                                             BlevBufferSizesOutItT,
                                             BlevBufferTileOffsetsOutItT,
                                             BlockOffsetT,
                                             BLevBufferOffsetTileState,
                                             BLevBlockOffsetTileState,
                                             IsMemcpy>;

  // Shared memory backing the agent's block-wide collectives
  __shared__ typename AgentBatchMemcpyT::TempStorage temp_storage;

  // Instantiate the agent and have it process the tile of input & output buffer pairs that is
  // assigned to this thread block
  AgentBatchMemcpyT agent(temp_storage,
                          input_buffer_it,
                          output_buffer_it,
                          buffer_sizes,
                          num_buffers,
                          blev_buffer_srcs,
                          blev_buffer_dsts,
                          blev_buffer_sizes,
                          blev_buffer_tile_offsets,
                          blev_buffer_scan_state,
                          blev_block_scan_state);
  agent.ConsumeTile(blockIdx.x);
}
// Chained tuning policies for the batch-memcpy dispatch
struct DeviceBatchMemcpyPolicy
{
  // Threads per block for the (small-buffer) BatchMemcpyKernel
  static constexpr uint32_t BLOCK_THREADS = 128U;
  // Number of buffer pairs each thread processes
  static constexpr uint32_t BUFFERS_PER_THREAD = 4U;
  // Bytes per thread for thread-level copies (forwarded to AgentBatchMemcpyPolicy)
  static constexpr uint32_t TLEV_BYTES_PER_THREAD = 8U;
  // Threads per block for the multi-block (large-buffer) kernel
  static constexpr uint32_t LARGE_BUFFER_BLOCK_THREADS = 256U;
  // Bytes copied per thread and tile in the multi-block (large-buffer) kernel
  static constexpr uint32_t LARGE_BUFFER_BYTES_PER_THREAD = 32U;

  /// SM35
  struct Policy350 : ChainedPolicy<350, Policy350, Policy350>
  {
    static constexpr bool PREFER_POW2_BITS = true;
    using AgentSmallBufferPolicyT =
      AgentBatchMemcpyPolicy<BLOCK_THREADS,
                             BUFFERS_PER_THREAD,
                             TLEV_BYTES_PER_THREAD,
                             PREFER_POW2_BITS,
                             LARGE_BUFFER_BLOCK_THREADS * LARGE_BUFFER_BYTES_PER_THREAD>;
    using AgentLargeBufferPolicyT =
      AgentBatchMemcpyLargeBuffersPolicy<LARGE_BUFFER_BLOCK_THREADS, LARGE_BUFFER_BYTES_PER_THREAD>;
  };

  /// SM70
  struct Policy700 : ChainedPolicy<700, Policy700, Policy350>
  {
    // SM70 differs from SM35 only in not preferring power-of-two bit counts
    static constexpr bool PREFER_POW2_BITS = false;
    using AgentSmallBufferPolicyT =
      AgentBatchMemcpyPolicy<BLOCK_THREADS,
                             BUFFERS_PER_THREAD,
                             TLEV_BYTES_PER_THREAD,
                             PREFER_POW2_BITS,
                             LARGE_BUFFER_BLOCK_THREADS * LARGE_BUFFER_BYTES_PER_THREAD>;
    using AgentLargeBufferPolicyT =
      AgentBatchMemcpyLargeBuffersPolicy<LARGE_BUFFER_BLOCK_THREADS, LARGE_BUFFER_BYTES_PER_THREAD>;
  };

  using MaxPolicy = Policy700;
};
/**
 * Dispatch layer for the batched-memcpy algorithm: computes temporary storage requirements,
 * carves up the user-provided storage blob, and launches the three kernels (tile-state init,
 * small-buffer copy + BLEV-buffer selection, multi-block large-buffer copy) in order.
 *
 * @tparam InputBufferIt <b>[inferred]</b> Random-access input iterator type providing the pointers
 * to the source memory buffers
 * @tparam OutputBufferIt <b>[inferred]</b> Random-access input iterator type providing the pointers
 * to the destination memory buffers
 * @tparam BufferSizeIteratorT <b>[inferred]</b> Random-access input iterator type providing the
 * number of bytes to be copied for each pair of buffers
 * @tparam BufferOffsetT Integer type large enough to hold any offset in [0, num_buffers)
 * @tparam BlockOffsetT Integer type large enough to hold any offset in [0,
 * num_thread_blocks_launched)
 */
template <typename InputBufferIt,
          typename OutputBufferIt,
          typename BufferSizeIteratorT,
          typename BufferOffsetT,
          typename BlockOffsetT,
          typename SelectedPolicy = DeviceBatchMemcpyPolicy,
          bool IsMemcpy = true>
struct DispatchBatchMemcpy : SelectedPolicy
{
  //------------------------------------------------------------------------------
  // TYPE ALIASES
  //------------------------------------------------------------------------------
  // Tile state for the single-pass prefix scan to "stream compact" (aka "select") the buffers
  // requiring block-level collaboration
  using BufferPartitionScanTileStateT = typename cub::ScanTileState<BufferOffsetT>;
  // Tile state for the single-pass prefix scan to keep track of how many blocks are assigned to
  // each of the buffers requiring block-level collaboration
  using BufferTileOffsetScanStateT = typename cub::ScanTileState<BlockOffsetT>;
  // Internal type used to keep track of a buffer's size
  using BufferSizeT = cub::detail::value_t<BufferSizeIteratorT>;

  //------------------------------------------------------------------------------
  // Member Variables
  //------------------------------------------------------------------------------
  // User-provided temporary storage blob (nullptr => query-only invocation)
  void *d_temp_storage;
  // In/out: size of the temporary storage blob in bytes
  size_t &temp_storage_bytes;
  InputBufferIt input_buffer_it;
  OutputBufferIt output_buffer_it;
  BufferSizeIteratorT buffer_sizes;
  BufferOffsetT num_buffers;
  cudaStream_t stream;

  //------------------------------------------------------------------------------
  // Constructor
  //------------------------------------------------------------------------------
  CUB_RUNTIME_FUNCTION __forceinline__ DispatchBatchMemcpy(void *d_temp_storage,
                                                           size_t &temp_storage_bytes,
                                                           InputBufferIt input_buffer_it,
                                                           OutputBufferIt output_buffer_it,
                                                           BufferSizeIteratorT buffer_sizes,
                                                           BufferOffsetT num_buffers,
                                                           cudaStream_t stream)
      : d_temp_storage(d_temp_storage)
      , temp_storage_bytes(temp_storage_bytes)
      , input_buffer_it(input_buffer_it)
      , output_buffer_it(output_buffer_it)
      , buffer_sizes(buffer_sizes)
      , num_buffers(num_buffers)
      , stream(stream)
  {}

  //------------------------------------------------------------------------------
  // Chained policy invocation
  //------------------------------------------------------------------------------
  /**
   * @brief Tuning policy invocation. This member function template is getting instantiated for all
   * tuning policies in the tuning policy chain. It is, however, *invoked* for the correct tuning
   * policy only.
   *
   * When d_temp_storage is nullptr, only computes and reports temp_storage_bytes.
   * Otherwise launches, in stream order: (1) InitTileStateKernel, (2) BatchMemcpyKernel,
   * (3) MultiBlockBatchMemcpyKernel.
   */
  template <typename ActivePolicyT>
  CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Invoke()
  {
    using MaxPolicyT = typename DispatchBatchMemcpy::MaxPolicy;

    // Single-pass prefix scan tile states for the prefix-sum over the number of block-level buffers
    using BLevBufferOffsetTileState = cub::ScanTileState<BufferOffsetT>;

    // Single-pass prefix scan tile states for the prefix sum over the number of thread blocks
    // assigned to each of the block-level buffers
    using BLevBlockOffsetTileState = cub::ScanTileState<BlockOffsetT>;

    cudaError error = cudaSuccess;

    // Slot indices into the temporary-storage layout
    enum : uint32_t
    {
      // Memory for the source pointers of the buffers that require block-level collaboration
      MEM_BLEV_BUFFER_SRCS = 0,
      // Memory for the destination pointers of the buffers that require block-level collaboration
      MEM_BLEV_BUFFER_DSTS,
      // Memory for the block-level buffers' sizes
      MEM_BLEV_BUFFER_SIZES,
      // Memory to keep track of the assignment of thread blocks to block-level buffers
      MEM_BLEV_BUFFER_TBLOCK,
      // Memory for the tile states of the prefix sum over the number of buffers that require
      // block-level collaboration
      MEM_BLEV_BUFFER_SCAN_STATE,
      // Memory for the scan tile states of the prefix sum over the number of thread block's
      // assigned up to and including a certain block-level buffer
      MEM_BLEV_BLOCK_SCAN_STATE,
      // Total number of distinct memory allocations in the temporary storage memory BLOB
      MEM_NUM_ALLOCATIONS
    };

    // Number of threads per block for initializing the grid states
    constexpr BlockOffsetT INIT_KERNEL_THREADS = 128U;

    // The number of buffers that get processed per thread block
    constexpr uint32_t TILE_SIZE = ActivePolicyT::AgentSmallBufferPolicyT::BLOCK_THREADS *
                                   ActivePolicyT::AgentSmallBufferPolicyT::BUFFERS_PER_THREAD;

    // The number of thread blocks (or tiles) required to process all of the given buffers
    BlockOffsetT num_tiles = DivideAndRoundUp(num_buffers, TILE_SIZE);

    // For the plain-memcpy case the queued buffer pointers are type-erased (void *); otherwise
    // they keep the iterators' value types
    using BlevBufferSrcsOutT =
      cub::detail::conditional_t<IsMemcpy, void *, cub::detail::value_t<InputBufferIt>>;
    using BlevBufferDstOutT =
      cub::detail::conditional_t<IsMemcpy, void *, cub::detail::value_t<OutputBufferIt>>;
    using BlevBufferSrcsOutItT        = BlevBufferSrcsOutT *;
    using BlevBufferDstsOutItT        = BlevBufferDstOutT *;
    using BlevBufferSizesOutItT       = BufferSizeT *;
    using BlevBufferTileOffsetsOutItT = BlockOffsetT *;

    // Describe the temporary-storage layout and alias one allocation per slot
    temporary_storage::layout<MEM_NUM_ALLOCATIONS> temporary_storage_layout;

    auto blev_buffer_srcs_slot       = temporary_storage_layout.get_slot(MEM_BLEV_BUFFER_SRCS);
    auto blev_buffer_dsts_slot       = temporary_storage_layout.get_slot(MEM_BLEV_BUFFER_DSTS);
    auto blev_buffer_sizes_slot      = temporary_storage_layout.get_slot(MEM_BLEV_BUFFER_SIZES);
    auto blev_buffer_block_slot      = temporary_storage_layout.get_slot(MEM_BLEV_BUFFER_TBLOCK);
    auto blev_buffer_scan_slot       = temporary_storage_layout.get_slot(MEM_BLEV_BUFFER_SCAN_STATE);
    auto blev_buffer_block_scan_slot = temporary_storage_layout.get_slot(MEM_BLEV_BLOCK_SCAN_STATE);

    auto blev_buffer_srcs_alloc =
      blev_buffer_srcs_slot->template create_alias<BlevBufferSrcsOutT>();
    auto blev_buffer_dsts_alloc  = blev_buffer_dsts_slot->template create_alias<BlevBufferDstOutT>();
    auto blev_buffer_sizes_alloc = blev_buffer_sizes_slot->template create_alias<BufferSizeT>();
    auto blev_buffer_block_alloc = blev_buffer_block_slot->template create_alias<BlockOffsetT>();
    auto blev_buffer_scan_alloc  = blev_buffer_scan_slot->template create_alias<uint8_t>();
    auto blev_block_scan_alloc   = blev_buffer_block_scan_slot->template create_alias<uint8_t>();

    // Query the byte requirements of both scan tile states.
    // NOTE(review): num_tiles is narrowed to int32_t here — assumes the tile count fits into a
    // signed 32-bit integer; confirm for very large batches.
    std::size_t buffer_offset_scan_storage = 0;
    std::size_t blev_block_scan_storage    = 0;
    CubDebug(error = BLevBufferOffsetTileState::AllocationSize(static_cast<int32_t>(num_tiles),
                                                               buffer_offset_scan_storage));
    if (error)
    {
      return error;
    }
    CubDebug(error = BLevBlockOffsetTileState::AllocationSize(static_cast<int32_t>(num_tiles),
                                                              blev_block_scan_storage));
    if (error)
    {
      return error;
    }

    // Register each allocation's size with the layout (worst case: every buffer is block-level)
    blev_buffer_srcs_alloc.grow(num_buffers);
    blev_buffer_dsts_alloc.grow(num_buffers);
    blev_buffer_sizes_alloc.grow(num_buffers);
    blev_buffer_block_alloc.grow(num_buffers);
    blev_buffer_scan_alloc.grow(buffer_offset_scan_storage);
    blev_block_scan_alloc.grow(blev_block_scan_storage);

    // Just return if no temporary storage is provided
    if (d_temp_storage == nullptr)
    {
      temp_storage_bytes = temporary_storage_layout.get_size();
      return error;
    }

    // Return if empty problem
    if (num_buffers == 0)
    {
      return error;
    }

    // Alias memory buffers into the storage blob
    if (CubDebug(
          error = temporary_storage_layout.map_to_buffer(d_temp_storage, temp_storage_bytes)))
    {
      return error;
    }

    // Alias into temporary storage allocation
    BlevBufferSrcsOutItT d_blev_src_buffers          = blev_buffer_srcs_alloc.get();
    BlevBufferDstsOutItT d_blev_dst_buffers          = blev_buffer_dsts_alloc.get();
    BlevBufferSizesOutItT d_blev_buffer_sizes        = blev_buffer_sizes_alloc.get();
    BlevBufferTileOffsetsOutItT d_blev_block_offsets = blev_buffer_block_alloc.get();

    // Kernels' grid sizes
    BlockOffsetT init_grid_size         = DivideAndRoundUp(num_tiles, INIT_KERNEL_THREADS);
    BlockOffsetT batch_memcpy_grid_size = num_tiles;

    // Kernels
    auto init_scan_states_kernel =
      InitTileStateKernel<BLevBufferOffsetTileState, BLevBlockOffsetTileState, BlockOffsetT>;
    auto batch_memcpy_non_blev_kernel = BatchMemcpyKernel<MaxPolicyT,
                                                          InputBufferIt,
                                                          OutputBufferIt,
                                                          BufferSizeIteratorT,
                                                          BufferOffsetT,
                                                          BlevBufferSrcsOutItT,
                                                          BlevBufferDstsOutItT,
                                                          BlevBufferSizesOutItT,
                                                          BlevBufferTileOffsetsOutItT,
                                                          BlockOffsetT,
                                                          BLevBufferOffsetTileState,
                                                          BLevBlockOffsetTileState,
                                                          IsMemcpy>;
    auto multi_block_memcpy_kernel = MultiBlockBatchMemcpyKernel<MaxPolicyT,
                                                                 BufferOffsetT,
                                                                 BlevBufferSrcsOutItT,
                                                                 BlevBufferDstsOutItT,
                                                                 BlevBufferSizesOutItT,
                                                                 BlevBufferTileOffsetsOutItT,
                                                                 BLevBufferOffsetTileState,
                                                                 BlockOffsetT,
                                                                 IsMemcpy>;

    constexpr uint32_t BLEV_BLOCK_THREADS = ActivePolicyT::AgentLargeBufferPolicyT::BLOCK_THREADS;

    // Get device ordinal
    int device_ordinal;
    if (CubDebug(error = cudaGetDevice(&device_ordinal)))
    {
      return error;
    }

    // Get SM count
    int sm_count;
    if (CubDebug(error = cudaDeviceGetAttribute(&sm_count,
                                                cudaDevAttrMultiProcessorCount,
                                                device_ordinal)))
    {
      return error;
    }

    // Get SM occupancy for the batch memcpy block-level buffers kernel
    int batch_memcpy_blev_occupancy;
    if (CubDebug(error = MaxSmOccupancy(batch_memcpy_blev_occupancy,
                                        multi_block_memcpy_kernel,
                                        BLEV_BLOCK_THREADS)))
    {
      return error;
    }

    // Size the BLEV grid to saturate the device (occupancy x SMs x subscription factor)
    int batch_memcpy_blev_grid_size =
      static_cast<int>(sm_count * batch_memcpy_blev_occupancy * CUB_SUBSCRIPTION_FACTOR(0));

    // Construct the tile status for the buffer prefix sum
    BLevBufferOffsetTileState buffer_scan_tile_state;
    if (CubDebug(error = buffer_scan_tile_state.Init(static_cast<int32_t>(num_tiles),
                                                     blev_buffer_scan_alloc.get(),
                                                     buffer_offset_scan_storage)))
    {
      return error;
    }

    // Construct the tile status for thread blocks-to-buffer-assignment prefix sum
    BLevBlockOffsetTileState block_scan_tile_state;
    if (CubDebug(error = block_scan_tile_state.Init(static_cast<int32_t>(num_tiles),
                                                    blev_block_scan_alloc.get(),
                                                    blev_block_scan_storage)))
    {
      return error;
    }

#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
    _CubLog("Invoking "
            "InitTileStateKernel<<<%d, %d, 0, %lld>>>()\n",
            static_cast<int>(init_grid_size),
            INIT_KERNEL_THREADS,
            (long long)stream);
#endif

    // Invoke init_kernel to initialize buffer prefix sum-tile descriptors
    error =
      THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size,
                                                              INIT_KERNEL_THREADS,
                                                              0,
                                                              stream)
        .doit(init_scan_states_kernel, buffer_scan_tile_state, block_scan_tile_state, num_tiles);

    // Check for failure to launch
    if (CubDebug(error))
    {
      return error;
    }

    // Sync the stream if specified to flush runtime errors
    error = detail::DebugSyncStream(stream);

    // Check for failure to launch
    if (CubDebug(error))
    {
      return error;
    }

#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
    _CubLog("Invoking "
            "BatchMemcpyKernel<<<%d, %d, 0, %lld>>>()\n",
            static_cast<int>(batch_memcpy_grid_size),
            ActivePolicyT::AgentSmallBufferPolicyT::BLOCK_THREADS,
            (long long)stream);
#endif

    // Invoke kernel to copy small buffers and put the larger ones into a queue that will get picked
    // up by next kernel
    error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
              batch_memcpy_grid_size,
              ActivePolicyT::AgentSmallBufferPolicyT::BLOCK_THREADS,
              0,
              stream)
              .doit(batch_memcpy_non_blev_kernel,
                    input_buffer_it,
                    output_buffer_it,
                    buffer_sizes,
                    num_buffers,
                    d_blev_src_buffers,
                    d_blev_dst_buffers,
                    d_blev_buffer_sizes,
                    d_blev_block_offsets,
                    buffer_scan_tile_state,
                    block_scan_tile_state);

    // Check for failure to launch
    if (CubDebug(error))
    {
      return error;
    }

    // Sync the stream if specified to flush runtime errors
    error = detail::DebugSyncStream(stream);
    if (CubDebug(error))
    {
      return error;
    }

#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
    _CubLog("Invoking "
            "MultiBlockBatchMemcpyKernel<<<%d, %d, 0, %lld>>>()\n",
            static_cast<int>(batch_memcpy_blev_grid_size),
            BLEV_BLOCK_THREADS,
            (long long)stream);
#endif

    // Invoke the kernel that collaboratively copies the queued block-level buffers. The last tile
    // index of the previous kernel's scan (batch_memcpy_grid_size - 1) is passed so the kernel can
    // read the total number of block-level buffers from the tile state.
    error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(batch_memcpy_blev_grid_size,
                                                                    BLEV_BLOCK_THREADS,
                                                                    0,
                                                                    stream)
              .doit(multi_block_memcpy_kernel,
                    d_blev_src_buffers,
                    d_blev_dst_buffers,
                    d_blev_buffer_sizes,
                    d_blev_block_offsets,
                    buffer_scan_tile_state,
                    batch_memcpy_grid_size - 1);

    // Check for failure to launch
    if (CubDebug(error))
    {
      return error;
    }

    // Sync the stream if specified to flush runtime errors
    error = detail::DebugSyncStream(stream);

    return error;
  }

  //------------------------------------------------------------------------------
  // Dispatch entrypoints
  //------------------------------------------------------------------------------
  /**
   * Internal dispatch routine: determines the PTX version and forwards to the matching
   * tuning policy's Invoke() via the chained-policy mechanism.
   */
  CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch(void *d_temp_storage,
                                                                   size_t &temp_storage_bytes,
                                                                   InputBufferIt input_buffer_it,
                                                                   OutputBufferIt output_buffer_it,
                                                                   BufferSizeIteratorT buffer_sizes,
                                                                   BufferOffsetT num_buffers,
                                                                   cudaStream_t stream)
  {
    using MaxPolicyT = typename DispatchBatchMemcpy::MaxPolicy;

    cudaError_t error = cudaSuccess;

    // Get PTX version
    int ptx_version = 0;
    if (CubDebug(error = PtxVersion(ptx_version)))
    {
      return error;
    }

    // Create dispatch functor
    DispatchBatchMemcpy dispatch(d_temp_storage,
                                 temp_storage_bytes,
                                 input_buffer_it,
                                 output_buffer_it,
                                 buffer_sizes,
                                 num_buffers,
                                 stream);

    // Dispatch to chained policy
    if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch)))
    {
      return error;
    }
    return error;
  }
};
} // namespace detail
CUB_NAMESPACE_END