// cub/device/dispatch/dispatch_batch_memcpy.cuh
/******************************************************************************
* Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* cub::DispatchBatchMemcpy provides device-wide, parallel operations for copying data from a number
* of given source buffers to their corresponding destination buffer.
*/
#pragma once
#include <cub/agent/agent_batch_memcpy.cuh>
#include <cub/agent/single_pass_scan_operators.cuh>
#include <cub/config.cuh>
#include <cub/detail/temporary_storage.cuh>
#include <cub/thread/thread_search.cuh>
#include <cub/util_debug.cuh>
#include <cub/util_ptx.cuh>
#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
#include <cuda/std/type_traits>
#include <cstdint>
CUB_NAMESPACE_BEGIN
namespace detail
{
/**
 * Parameterizable tuning policy type for the multi-block kernel that copies "large"
 * buffers, i.e., buffers whose copy is performed collaboratively by one or more full
 * thread blocks (see MultiBlockBatchMemcpyKernel).
 *
 * @tparam _BLOCK_THREADS Number of threads per thread block
 * @tparam _BYTES_PER_THREAD Number of bytes copied by each thread per tile
 */
template <uint32_t _BLOCK_THREADS, uint32_t _BYTES_PER_THREAD>
struct AgentBatchMemcpyLargeBuffersPolicy
{
  /// Threads per thread block
  static constexpr uint32_t BLOCK_THREADS = _BLOCK_THREADS;
  /// The number of bytes each thread copies
  static constexpr uint32_t BYTES_PER_THREAD = _BYTES_PER_THREAD;
};
/**
 * Initialization kernel for tile status initialization (multi-block).
 *
 * Resets the tile status of both single-pass scan tile states so that the subsequent
 * decoupled-lookback prefix scans start from a clean slate. Expected to be launched
 * with enough threads so that each thread can initialize one of the num_tiles entries.
 */
template <typename BufferOffsetScanTileStateT,
          typename BlockOffsetScanTileStateT,
          typename TileOffsetT>
__global__ void InitTileStateKernel(BufferOffsetScanTileStateT buffer_offset_scan_tile_state,
                                    BlockOffsetScanTileStateT block_offset_scan_tile_state,
                                    TileOffsetT num_tiles)
{
  // The two tile-state descriptors are independent of each other, so the order of
  // initialization is irrelevant
  block_offset_scan_tile_state.InitializeStatus(num_tiles);
  buffer_offset_scan_tile_state.InitializeStatus(num_tiles);
}
/**
 * Kernel that copies buffers that need to be copied by at least one (and potentially many) thread
 * blocks.
 *
 * Each thread block repeatedly grabs a tile of TILE_SIZE bytes (striding over tile ids by
 * gridDim.x), binary-searches which block-level buffer that tile falls into, and copies the
 * tile's chunk of that buffer.
 */
template <typename ChainedPolicyT,
          typename BufferOffsetT,
          typename InputBufferIt,
          typename OutputBufferIt,
          typename BufferSizeIteratorT,
          typename BufferTileOffsetItT,
          typename TileT,
          typename TileOffsetT,
          bool IsMemcpy>
__launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT::BLOCK_THREADS))
  __global__ void MultiBlockBatchMemcpyKernel(InputBufferIt input_buffer_it,
                                              OutputBufferIt output_buffer_it,
                                              BufferSizeIteratorT buffer_sizes,
                                              BufferTileOffsetItT buffer_tile_offsets,
                                              TileT buffer_offset_tile,
                                              TileOffsetT last_tile_offset)
{
  using StatusWord    = typename TileT::StatusWord;
  using ActivePolicyT = typename ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT;
  using BufferSizeT   = cub::detail::value_t<BufferSizeIteratorT>;
  /// Internal load/store type. For byte-wise memcpy, a single-byte type
  using AliasT = typename ::cuda::std::conditional<
    IsMemcpy,
    std::iterator_traits<char *>,
    std::iterator_traits<cub::detail::value_t<InputBufferIt>>>::type::value_type;
  /// Types of the input and output buffers
  using InputBufferT  = cub::detail::value_t<InputBufferIt>;
  using OutputBufferT = cub::detail::value_t<OutputBufferIt>;

  constexpr uint32_t BLOCK_THREADS    = ActivePolicyT::BLOCK_THREADS;
  constexpr uint32_t ITEMS_PER_THREAD = ActivePolicyT::BYTES_PER_THREAD;

  // Number of bytes the thread block processes per iteration of the tile loop
  constexpr BufferSizeT TILE_SIZE = static_cast<BufferSizeT>(BLOCK_THREADS * ITEMS_PER_THREAD);

  // The number of buffers requiring block-level collaboration, read from the buffer-offset scan
  // tile state at the given tile index
  BufferOffsetT num_blev_buffers = buffer_offset_tile.LoadValid(last_tile_offset);

  uint32_t tile_id = blockIdx.x;

  // No block-level buffers => we're done here
  if (num_blev_buffers == 0)
  {
    return;
  }

  // While there's still tiles of bytes from block-level buffers to be copied
  do
  {
    __shared__ BufferOffsetT block_buffer_id;

    // Make sure thread 0 does not overwrite the buffer id before other threads have finished with
    // the prior iteration of the loop
    CTA_SYNC();

    // Binary search the buffer that this tile belongs to
    if (threadIdx.x == 0)
      block_buffer_id = UpperBound(buffer_tile_offsets, num_blev_buffers, tile_id) - 1;

    // Make sure thread 0 has written the buffer this thread block is assigned to
    CTA_SYNC();

    const BufferOffsetT buffer_id = block_buffer_id;

    // The relative offset of this tile within the buffer it's assigned to
    BufferSizeT tile_offset_within_buffer =
      static_cast<BufferSizeT>(tile_id - buffer_tile_offsets[buffer_id]) * TILE_SIZE;

    // If the tile has already reached beyond the work of the end of the last buffer.
    // NOTE: tile_id is uniform across the thread block, so all threads take this return together
    // and no thread can be left waiting at the CTA_SYNC above.
    if (buffer_id >= num_blev_buffers - 1 && tile_offset_within_buffer > buffer_sizes[buffer_id])
    {
      return;
    }

    // Tiny remainders are copied without vectorizing loads
    if (buffer_sizes[buffer_id] - tile_offset_within_buffer <= 32)
    {
      // Byte-wise, block-strided copy of the (at most 32-byte) remainder
      BufferSizeT thread_offset = tile_offset_within_buffer + threadIdx.x;
      for (int i = 0; i < ITEMS_PER_THREAD; i++)
      {
        if (thread_offset < buffer_sizes[buffer_id])
        {
          const auto value = read_item<IsMemcpy, AliasT, InputBufferT>(input_buffer_it[buffer_id],
                                                                       thread_offset);
          write_item<IsMemcpy, AliasT, OutputBufferT>(output_buffer_it[buffer_id],
                                                      thread_offset,
                                                      value);
        }
        thread_offset += BLOCK_THREADS;
      }
    }
    else
    {
      // Copy this tile's chunk of the buffer: a full tile, or the buffer's remaining bytes,
      // whichever is smaller
      copy_items<IsMemcpy, BLOCK_THREADS, InputBufferT, OutputBufferT, BufferSizeT>(
        input_buffer_it[buffer_id],
        output_buffer_it[buffer_id],
        (cub::min)(buffer_sizes[buffer_id] - tile_offset_within_buffer, TILE_SIZE),
        tile_offset_within_buffer);
    }

    // Grid-stride to the next tile of bytes
    tile_id += gridDim.x;
  } while (true);
}
/**
 * @brief Kernel that copies data from a batch of given source buffers to their corresponding
 * destination buffer. If a buffer's size is too large to be copied by a single thread block, that
 * buffer is put into a queue of buffers that will get picked up later on, where multiple blocks
 * collaborate on each of these buffers. All other buffers get copied straight away.
 *
 * @param input_buffer_it [in] Iterator providing the pointers to the source memory buffers
 * @param output_buffer_it [in] Iterator providing the pointers to the destination memory buffers
 * @param buffer_sizes [in] Iterator providing the number of bytes to be copied for each pair of
 * buffers
 * @param num_buffers [in] The total number of buffer pairs
 * @param blev_buffer_srcs [out] The source pointers of buffers that require block-level
 * collaboration
 * @param blev_buffer_dsts [out] The destination pointers of buffers that require block-level
 * collaboration
 * @param blev_buffer_sizes [out] The sizes of buffers that require block-level collaboration
 * @param blev_buffer_tile_offsets [out] Prefix sum of the number of thread blocks assigned to
 * each buffer requiring block-level collaboration
 * @param blev_buffer_scan_state [in,out] Tile states for the prefix sum over the count of buffers
 * requiring block-level collaboration (to "stream compact" (aka "select") BLEV-buffers)
 * @param blev_block_scan_state [in,out] Tile states for the prefix sum over the number of thread
 * blocks getting assigned to each buffer that requires block-level collaboration
 */
template <typename ChainedPolicyT,
          typename InputBufferIt,
          typename OutputBufferIt,
          typename BufferSizeIteratorT,
          typename BufferOffsetT,
          typename BlevBufferSrcsOutItT,
          typename BlevBufferDstsOutItT,
          typename BlevBufferSizesOutItT,
          typename BlevBufferTileOffsetsOutItT,
          typename BlockOffsetT,
          typename BLevBufferOffsetTileState,
          typename BLevBlockOffsetTileState,
          bool IsMemcpy>
__launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentSmallBufferPolicyT::BLOCK_THREADS))
  __global__ void BatchMemcpyKernel(InputBufferIt input_buffer_it,
                                    OutputBufferIt output_buffer_it,
                                    BufferSizeIteratorT buffer_sizes,
                                    BufferOffsetT num_buffers,
                                    BlevBufferSrcsOutItT blev_buffer_srcs,
                                    BlevBufferDstsOutItT blev_buffer_dsts,
                                    BlevBufferSizesOutItT blev_buffer_sizes,
                                    BlevBufferTileOffsetsOutItT blev_buffer_tile_offsets,
                                    BLevBufferOffsetTileState blev_buffer_scan_state,
                                    BLevBlockOffsetTileState blev_block_scan_state)
{
  // Internal type used for storing a buffer's size
  using BufferSizeT = cub::detail::value_t<BufferSizeIteratorT>;

  // Tuning policy selected for the current compilation pass' target architecture
  using AgentBatchMemcpyPolicyT = typename ChainedPolicyT::ActivePolicy::AgentSmallBufferPolicyT;

  // Block-level agent implementing the partition-and-copy logic
  using AgentBatchMemcpyT = AgentBatchMemcpy<AgentBatchMemcpyPolicyT,
                                             InputBufferIt,
                                             OutputBufferIt,
                                             BufferSizeIteratorT,
                                             BufferOffsetT,
                                             BlevBufferSrcsOutItT,
                                             BlevBufferDstsOutItT,
                                             BlevBufferSizesOutItT,
                                             BlevBufferTileOffsetsOutItT,
                                             BlockOffsetT,
                                             BLevBufferOffsetTileState,
                                             BLevBlockOffsetTileState,
                                             IsMemcpy>;

  // Shared memory backing the agent's block-wide collectives
  __shared__ typename AgentBatchMemcpyT::TempStorage temp_storage;

  // Instantiate the agent and have it process the tile of input & output buffer pairs that is
  // assigned to this thread block
  AgentBatchMemcpyT agent(temp_storage,
                          input_buffer_it,
                          output_buffer_it,
                          buffer_sizes,
                          num_buffers,
                          blev_buffer_srcs,
                          blev_buffer_dsts,
                          blev_buffer_sizes,
                          blev_buffer_tile_offsets,
                          blev_buffer_scan_state,
                          blev_block_scan_state);
  agent.ConsumeTile(blockIdx.x);
}
// Chained tuning policies for the batch-memcpy dispatch
struct DeviceBatchMemcpyPolicy
{
  // Threads per block for the (small-buffer) BatchMemcpyKernel
  static constexpr uint32_t BLOCK_THREADS = 128U;
  // Number of buffer pairs each thread processes
  static constexpr uint32_t BUFFERS_PER_THREAD = 4U;
  // Bytes per thread for thread-level copies (forwarded to AgentBatchMemcpyPolicy)
  static constexpr uint32_t TLEV_BYTES_PER_THREAD = 8U;
  // Threads per block for the multi-block (large-buffer) kernel
  static constexpr uint32_t LARGE_BUFFER_BLOCK_THREADS = 256U;
  // Bytes copied per thread and tile in the multi-block (large-buffer) kernel
  static constexpr uint32_t LARGE_BUFFER_BYTES_PER_THREAD = 32U;

  /// SM35
  struct Policy350 : ChainedPolicy<350, Policy350, Policy350>
  {
    static constexpr bool PREFER_POW2_BITS = true;
    using AgentSmallBufferPolicyT =
      AgentBatchMemcpyPolicy<BLOCK_THREADS,
                             BUFFERS_PER_THREAD,
                             TLEV_BYTES_PER_THREAD,
                             PREFER_POW2_BITS,
                             LARGE_BUFFER_BLOCK_THREADS * LARGE_BUFFER_BYTES_PER_THREAD>;
    using AgentLargeBufferPolicyT =
      AgentBatchMemcpyLargeBuffersPolicy<LARGE_BUFFER_BLOCK_THREADS, LARGE_BUFFER_BYTES_PER_THREAD>;
  };

  /// SM70
  struct Policy700 : ChainedPolicy<700, Policy700, Policy350>
  {
    // SM70 differs from SM35 only in not preferring power-of-two bit counts
    static constexpr bool PREFER_POW2_BITS = false;
    using AgentSmallBufferPolicyT =
      AgentBatchMemcpyPolicy<BLOCK_THREADS,
                             BUFFERS_PER_THREAD,
                             TLEV_BYTES_PER_THREAD,
                             PREFER_POW2_BITS,
                             LARGE_BUFFER_BLOCK_THREADS * LARGE_BUFFER_BYTES_PER_THREAD>;
    using AgentLargeBufferPolicyT =
      AgentBatchMemcpyLargeBuffersPolicy<LARGE_BUFFER_BLOCK_THREADS, LARGE_BUFFER_BYTES_PER_THREAD>;
  };

  using MaxPolicy = Policy700;
};
/**
 * Dispatch layer for the batched-memcpy algorithm: computes temporary storage requirements,
 * carves up the user-provided storage blob, and launches the three kernels (tile-state init,
 * small-buffer copy + BLEV-buffer selection, multi-block large-buffer copy) in order.
 *
 * @tparam InputBufferIt <b>[inferred]</b> Random-access input iterator type providing the pointers
 * to the source memory buffers
 * @tparam OutputBufferIt <b>[inferred]</b> Random-access input iterator type providing the pointers
 * to the destination memory buffers
 * @tparam BufferSizeIteratorT <b>[inferred]</b> Random-access input iterator type providing the
 * number of bytes to be copied for each pair of buffers
 * @tparam BufferOffsetT Integer type large enough to hold any offset in [0, num_buffers)
 * @tparam BlockOffsetT Integer type large enough to hold any offset in [0,
 * num_thread_blocks_launched)
 */
template <typename InputBufferIt,
          typename OutputBufferIt,
          typename BufferSizeIteratorT,
          typename BufferOffsetT,
          typename BlockOffsetT,
          typename SelectedPolicy = DeviceBatchMemcpyPolicy,
          bool IsMemcpy = true>
struct DispatchBatchMemcpy : SelectedPolicy
{
  //------------------------------------------------------------------------------
  // TYPE ALIASES
  //------------------------------------------------------------------------------
  // Tile state for the single-pass prefix scan to "stream compact" (aka "select") the buffers
  // requiring block-level collaboration
  using BufferPartitionScanTileStateT = typename cub::ScanTileState<BufferOffsetT>;
  // Tile state for the single-pass prefix scan to keep track of how many blocks are assigned to
  // each of the buffers requiring block-level collaboration
  using BufferTileOffsetScanStateT = typename cub::ScanTileState<BlockOffsetT>;
  // Internal type used to keep track of a buffer's size
  using BufferSizeT = cub::detail::value_t<BufferSizeIteratorT>;

  //------------------------------------------------------------------------------
  // Member Variables
  //------------------------------------------------------------------------------
  // User-provided temporary storage blob (nullptr => query-only invocation)
  void *d_temp_storage;
  // In/out: size of the temporary storage blob in bytes
  size_t &temp_storage_bytes;
  InputBufferIt input_buffer_it;
  OutputBufferIt output_buffer_it;
  BufferSizeIteratorT buffer_sizes;
  BufferOffsetT num_buffers;
  cudaStream_t stream;

  //------------------------------------------------------------------------------
  // Constructor
  //------------------------------------------------------------------------------
  CUB_RUNTIME_FUNCTION __forceinline__ DispatchBatchMemcpy(void *d_temp_storage,
                                                           size_t &temp_storage_bytes,
                                                           InputBufferIt input_buffer_it,
                                                           OutputBufferIt output_buffer_it,
                                                           BufferSizeIteratorT buffer_sizes,
                                                           BufferOffsetT num_buffers,
                                                           cudaStream_t stream)
      : d_temp_storage(d_temp_storage)
      , temp_storage_bytes(temp_storage_bytes)
      , input_buffer_it(input_buffer_it)
      , output_buffer_it(output_buffer_it)
      , buffer_sizes(buffer_sizes)
      , num_buffers(num_buffers)
      , stream(stream)
  {}

  //------------------------------------------------------------------------------
  // Chained policy invocation
  //------------------------------------------------------------------------------
  /**
   * @brief Tuning policy invocation. This member function template is getting instantiated for all
   * tuning policies in the tuning policy chain. It is, however, *invoked* for the correct tuning
   * policy only.
   *
   * When d_temp_storage is nullptr, only computes and reports temp_storage_bytes.
   * Otherwise launches, in stream order: (1) InitTileStateKernel, (2) BatchMemcpyKernel,
   * (3) MultiBlockBatchMemcpyKernel.
   */
  template <typename ActivePolicyT>
  CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Invoke()
  {
    using MaxPolicyT = typename DispatchBatchMemcpy::MaxPolicy;

    // Single-pass prefix scan tile states for the prefix-sum over the number of block-level buffers
    using BLevBufferOffsetTileState = cub::ScanTileState<BufferOffsetT>;

    // Single-pass prefix scan tile states for the prefix sum over the number of thread blocks
    // assigned to each of the block-level buffers
    using BLevBlockOffsetTileState = cub::ScanTileState<BlockOffsetT>;

    cudaError error = cudaSuccess;

    // Slot indices into the temporary-storage layout
    enum : uint32_t
    {
      // Memory for the source pointers of the buffers that require block-level collaboration
      MEM_BLEV_BUFFER_SRCS = 0,
      // Memory for the destination pointers of the buffers that require block-level collaboration
      MEM_BLEV_BUFFER_DSTS,
      // Memory for the block-level buffers' sizes
      MEM_BLEV_BUFFER_SIZES,
      // Memory to keep track of the assignment of thread blocks to block-level buffers
      MEM_BLEV_BUFFER_TBLOCK,
      // Memory for the tile states of the prefix sum over the number of buffers that require
      // block-level collaboration
      MEM_BLEV_BUFFER_SCAN_STATE,
      // Memory for the scan tile states of the prefix sum over the number of thread block's
      // assigned up to and including a certain block-level buffer
      MEM_BLEV_BLOCK_SCAN_STATE,
      // Total number of distinct memory allocations in the temporary storage memory BLOB
      MEM_NUM_ALLOCATIONS
    };

    // Number of threads per block for initializing the grid states
    constexpr BlockOffsetT INIT_KERNEL_THREADS = 128U;

    // The number of buffers that get processed per thread block
    constexpr uint32_t TILE_SIZE = ActivePolicyT::AgentSmallBufferPolicyT::BLOCK_THREADS *
                                   ActivePolicyT::AgentSmallBufferPolicyT::BUFFERS_PER_THREAD;

    // The number of thread blocks (or tiles) required to process all of the given buffers
    BlockOffsetT num_tiles = DivideAndRoundUp(num_buffers, TILE_SIZE);

    // For the plain-memcpy case the queued buffer pointers are type-erased (void *); otherwise
    // they keep the iterators' value types
    using BlevBufferSrcsOutT =
      cub::detail::conditional_t<IsMemcpy, void *, cub::detail::value_t<InputBufferIt>>;
    using BlevBufferDstOutT =
      cub::detail::conditional_t<IsMemcpy, void *, cub::detail::value_t<OutputBufferIt>>;
    using BlevBufferSrcsOutItT        = BlevBufferSrcsOutT *;
    using BlevBufferDstsOutItT        = BlevBufferDstOutT *;
    using BlevBufferSizesOutItT       = BufferSizeT *;
    using BlevBufferTileOffsetsOutItT = BlockOffsetT *;

    // Describe the temporary-storage layout and alias one allocation per slot
    temporary_storage::layout<MEM_NUM_ALLOCATIONS> temporary_storage_layout;

    auto blev_buffer_srcs_slot       = temporary_storage_layout.get_slot(MEM_BLEV_BUFFER_SRCS);
    auto blev_buffer_dsts_slot       = temporary_storage_layout.get_slot(MEM_BLEV_BUFFER_DSTS);
    auto blev_buffer_sizes_slot      = temporary_storage_layout.get_slot(MEM_BLEV_BUFFER_SIZES);
    auto blev_buffer_block_slot      = temporary_storage_layout.get_slot(MEM_BLEV_BUFFER_TBLOCK);
    auto blev_buffer_scan_slot       = temporary_storage_layout.get_slot(MEM_BLEV_BUFFER_SCAN_STATE);
    auto blev_buffer_block_scan_slot = temporary_storage_layout.get_slot(MEM_BLEV_BLOCK_SCAN_STATE);

    auto blev_buffer_srcs_alloc =
      blev_buffer_srcs_slot->template create_alias<BlevBufferSrcsOutT>();
    auto blev_buffer_dsts_alloc  = blev_buffer_dsts_slot->template create_alias<BlevBufferDstOutT>();
    auto blev_buffer_sizes_alloc = blev_buffer_sizes_slot->template create_alias<BufferSizeT>();
    auto blev_buffer_block_alloc = blev_buffer_block_slot->template create_alias<BlockOffsetT>();
    auto blev_buffer_scan_alloc  = blev_buffer_scan_slot->template create_alias<uint8_t>();
    auto blev_block_scan_alloc   = blev_buffer_block_scan_slot->template create_alias<uint8_t>();

    // Query the byte requirements of both scan tile states.
    // NOTE(review): num_tiles is narrowed to int32_t here — assumes the tile count fits into a
    // signed 32-bit integer; confirm for very large batches.
    std::size_t buffer_offset_scan_storage = 0;
    std::size_t blev_block_scan_storage    = 0;
    CubDebug(error = BLevBufferOffsetTileState::AllocationSize(static_cast<int32_t>(num_tiles),
                                                               buffer_offset_scan_storage));
    if (error)
    {
      return error;
    }
    CubDebug(error = BLevBlockOffsetTileState::AllocationSize(static_cast<int32_t>(num_tiles),
                                                              blev_block_scan_storage));
    if (error)
    {
      return error;
    }

    // Register each allocation's size with the layout (worst case: every buffer is block-level)
    blev_buffer_srcs_alloc.grow(num_buffers);
    blev_buffer_dsts_alloc.grow(num_buffers);
    blev_buffer_sizes_alloc.grow(num_buffers);
    blev_buffer_block_alloc.grow(num_buffers);
    blev_buffer_scan_alloc.grow(buffer_offset_scan_storage);
    blev_block_scan_alloc.grow(blev_block_scan_storage);

    // Just return if no temporary storage is provided
    if (d_temp_storage == nullptr)
    {
      temp_storage_bytes = temporary_storage_layout.get_size();
      return error;
    }

    // Return if empty problem
    if (num_buffers == 0)
    {
      return error;
    }

    // Alias memory buffers into the storage blob
    if (CubDebug(
          error = temporary_storage_layout.map_to_buffer(d_temp_storage, temp_storage_bytes)))
    {
      return error;
    }

    // Alias into temporary storage allocation
    BlevBufferSrcsOutItT d_blev_src_buffers          = blev_buffer_srcs_alloc.get();
    BlevBufferDstsOutItT d_blev_dst_buffers          = blev_buffer_dsts_alloc.get();
    BlevBufferSizesOutItT d_blev_buffer_sizes        = blev_buffer_sizes_alloc.get();
    BlevBufferTileOffsetsOutItT d_blev_block_offsets = blev_buffer_block_alloc.get();

    // Kernels' grid sizes
    BlockOffsetT init_grid_size         = DivideAndRoundUp(num_tiles, INIT_KERNEL_THREADS);
    BlockOffsetT batch_memcpy_grid_size = num_tiles;

    // Kernels
    auto init_scan_states_kernel =
      InitTileStateKernel<BLevBufferOffsetTileState, BLevBlockOffsetTileState, BlockOffsetT>;
    auto batch_memcpy_non_blev_kernel = BatchMemcpyKernel<MaxPolicyT,
                                                          InputBufferIt,
                                                          OutputBufferIt,
                                                          BufferSizeIteratorT,
                                                          BufferOffsetT,
                                                          BlevBufferSrcsOutItT,
                                                          BlevBufferDstsOutItT,
                                                          BlevBufferSizesOutItT,
                                                          BlevBufferTileOffsetsOutItT,
                                                          BlockOffsetT,
                                                          BLevBufferOffsetTileState,
                                                          BLevBlockOffsetTileState,
                                                          IsMemcpy>;
    auto multi_block_memcpy_kernel = MultiBlockBatchMemcpyKernel<MaxPolicyT,
                                                                 BufferOffsetT,
                                                                 BlevBufferSrcsOutItT,
                                                                 BlevBufferDstsOutItT,
                                                                 BlevBufferSizesOutItT,
                                                                 BlevBufferTileOffsetsOutItT,
                                                                 BLevBufferOffsetTileState,
                                                                 BlockOffsetT,
                                                                 IsMemcpy>;

    constexpr uint32_t BLEV_BLOCK_THREADS = ActivePolicyT::AgentLargeBufferPolicyT::BLOCK_THREADS;

    // Get device ordinal
    int device_ordinal;
    if (CubDebug(error = cudaGetDevice(&device_ordinal)))
    {
      return error;
    }

    // Get SM count
    int sm_count;
    if (CubDebug(error = cudaDeviceGetAttribute(&sm_count,
                                                cudaDevAttrMultiProcessorCount,
                                                device_ordinal)))
    {
      return error;
    }

    // Get SM occupancy for the batch memcpy block-level buffers kernel
    int batch_memcpy_blev_occupancy;
    if (CubDebug(error = MaxSmOccupancy(batch_memcpy_blev_occupancy,
                                        multi_block_memcpy_kernel,
                                        BLEV_BLOCK_THREADS)))
    {
      return error;
    }

    // Size the BLEV grid to saturate the device (occupancy x SMs x subscription factor)
    int batch_memcpy_blev_grid_size =
      static_cast<int>(sm_count * batch_memcpy_blev_occupancy * CUB_SUBSCRIPTION_FACTOR(0));

    // Construct the tile status for the buffer prefix sum
    BLevBufferOffsetTileState buffer_scan_tile_state;
    if (CubDebug(error = buffer_scan_tile_state.Init(static_cast<int32_t>(num_tiles),
                                                     blev_buffer_scan_alloc.get(),
                                                     buffer_offset_scan_storage)))
    {
      return error;
    }

    // Construct the tile status for thread blocks-to-buffer-assignment prefix sum
    BLevBlockOffsetTileState block_scan_tile_state;
    if (CubDebug(error = block_scan_tile_state.Init(static_cast<int32_t>(num_tiles),
                                                    blev_block_scan_alloc.get(),
                                                    blev_block_scan_storage)))
    {
      return error;
    }

#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
    _CubLog("Invoking "
            "InitTileStateKernel<<<%d, %d, 0, %lld>>>()\n",
            static_cast<int>(init_grid_size),
            INIT_KERNEL_THREADS,
            (long long)stream);
#endif

    // Invoke init_kernel to initialize buffer prefix sum-tile descriptors
    error =
      THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size,
                                                              INIT_KERNEL_THREADS,
                                                              0,
                                                              stream)
        .doit(init_scan_states_kernel, buffer_scan_tile_state, block_scan_tile_state, num_tiles);

    // Check for failure to launch
    if (CubDebug(error))
    {
      return error;
    }

    // Sync the stream if specified to flush runtime errors
    error = detail::DebugSyncStream(stream);

    // Check for failure to launch
    if (CubDebug(error))
    {
      return error;
    }

#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
    _CubLog("Invoking "
            "BatchMemcpyKernel<<<%d, %d, 0, %lld>>>()\n",
            static_cast<int>(batch_memcpy_grid_size),
            ActivePolicyT::AgentSmallBufferPolicyT::BLOCK_THREADS,
            (long long)stream);
#endif

    // Invoke kernel to copy small buffers and put the larger ones into a queue that will get picked
    // up by next kernel
    error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
              batch_memcpy_grid_size,
              ActivePolicyT::AgentSmallBufferPolicyT::BLOCK_THREADS,
              0,
              stream)
              .doit(batch_memcpy_non_blev_kernel,
                    input_buffer_it,
                    output_buffer_it,
                    buffer_sizes,
                    num_buffers,
                    d_blev_src_buffers,
                    d_blev_dst_buffers,
                    d_blev_buffer_sizes,
                    d_blev_block_offsets,
                    buffer_scan_tile_state,
                    block_scan_tile_state);

    // Check for failure to launch
    if (CubDebug(error))
    {
      return error;
    }

    // Sync the stream if specified to flush runtime errors
    error = detail::DebugSyncStream(stream);
    if (CubDebug(error))
    {
      return error;
    }

#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
    _CubLog("Invoking "
            "MultiBlockBatchMemcpyKernel<<<%d, %d, 0, %lld>>>()\n",
            static_cast<int>(batch_memcpy_blev_grid_size),
            BLEV_BLOCK_THREADS,
            (long long)stream);
#endif

    // Invoke the kernel that collaboratively copies the queued block-level buffers. The last tile
    // index of the previous kernel's scan (batch_memcpy_grid_size - 1) is passed so the kernel can
    // read the total number of block-level buffers from the tile state.
    error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(batch_memcpy_blev_grid_size,
                                                                    BLEV_BLOCK_THREADS,
                                                                    0,
                                                                    stream)
              .doit(multi_block_memcpy_kernel,
                    d_blev_src_buffers,
                    d_blev_dst_buffers,
                    d_blev_buffer_sizes,
                    d_blev_block_offsets,
                    buffer_scan_tile_state,
                    batch_memcpy_grid_size - 1);

    // Check for failure to launch
    if (CubDebug(error))
    {
      return error;
    }

    // Sync the stream if specified to flush runtime errors
    error = detail::DebugSyncStream(stream);

    return error;
  }

  //------------------------------------------------------------------------------
  // Dispatch entrypoints
  //------------------------------------------------------------------------------
  /**
   * Internal dispatch routine: determines the PTX version and forwards to the matching
   * tuning policy's Invoke() via the chained-policy mechanism.
   */
  CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch(void *d_temp_storage,
                                                                   size_t &temp_storage_bytes,
                                                                   InputBufferIt input_buffer_it,
                                                                   OutputBufferIt output_buffer_it,
                                                                   BufferSizeIteratorT buffer_sizes,
                                                                   BufferOffsetT num_buffers,
                                                                   cudaStream_t stream)
  {
    using MaxPolicyT = typename DispatchBatchMemcpy::MaxPolicy;

    cudaError_t error = cudaSuccess;

    // Get PTX version
    int ptx_version = 0;
    if (CubDebug(error = PtxVersion(ptx_version)))
    {
      return error;
    }

    // Create dispatch functor
    DispatchBatchMemcpy dispatch(d_temp_storage,
                                 temp_storage_bytes,
                                 input_buffer_it,
                                 output_buffer_it,
                                 buffer_sizes,
                                 num_buffers,
                                 stream);

    // Dispatch to chained policy
    if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch)))
    {
      return error;
    }
    return error;
  }
};
} // namespace detail
CUB_NAMESPACE_END