File size: 31,917 Bytes

0dc1b04

/******************************************************************************
 * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * cub::DispatchBatchMemcpy provides device-wide, parallel operations for copying data from a number
 * of given source buffers to their corresponding destination buffer.
 */

#pragma once

#include <cub/agent/agent_batch_memcpy.cuh>
#include <cub/agent/single_pass_scan_operators.cuh>
#include <cub/config.cuh>
#include <cub/detail/temporary_storage.cuh>
#include <cub/thread/thread_search.cuh>
#include <cub/util_debug.cuh>
#include <cub/util_ptx.cuh>

#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

#include <cuda/std/type_traits>

#include <cstdint>

CUB_NAMESPACE_BEGIN

namespace detail
{

/**
 * Parameterizable tuning policy type for AgentBatchMemcpy
 */
template <uint32_t _BLOCK_THREADS, uint32_t _BYTES_PER_THREAD>
struct AgentBatchMemcpyLargeBuffersPolicy
{
  /// Threads per thread block
  static constexpr uint32_t BLOCK_THREADS = _BLOCK_THREADS;
  /// The number of bytes each thread copies
  static constexpr uint32_t BYTES_PER_THREAD = _BYTES_PER_THREAD;
};

/**
 * Initialization kernel for tile status initialization (multi-block)
 */
template <typename BufferOffsetScanTileStateT,
          typename BlockOffsetScanTileStateT,
          typename TileOffsetT>
__global__ void InitTileStateKernel(BufferOffsetScanTileStateT buffer_offset_scan_tile_state,
                                    BlockOffsetScanTileStateT block_offset_scan_tile_state,
                                    TileOffsetT num_tiles)
{
  // Initialize tile status
  buffer_offset_scan_tile_state.InitializeStatus(num_tiles);
  block_offset_scan_tile_state.InitializeStatus(num_tiles);
}

/**
 * Kernel that copies buffers that need to be copied by at least one (and potentially many) thread
 * blocks.
 */
template <typename ChainedPolicyT,
          typename BufferOffsetT,
          typename InputBufferIt,
          typename OutputBufferIt,
          typename BufferSizeIteratorT,
          typename BufferTileOffsetItT,
          typename TileT,
          typename TileOffsetT,
          bool IsMemcpy>
__launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT::BLOCK_THREADS))
  __global__ void MultiBlockBatchMemcpyKernel(InputBufferIt input_buffer_it,
                                              OutputBufferIt output_buffer_it,
                                              BufferSizeIteratorT buffer_sizes,
                                              BufferTileOffsetItT buffer_tile_offsets,
                                              TileT buffer_offset_tile,
                                              TileOffsetT last_tile_offset)
{
  using StatusWord    = typename TileT::StatusWord;
  using ActivePolicyT = typename ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT;
  using BufferSizeT   = cub::detail::value_t<BufferSizeIteratorT>;
  /// Internal load/store type. For byte-wise memcpy, a single-byte type
  using AliasT = typename ::cuda::std::conditional<
    IsMemcpy,
    std::iterator_traits<char *>,
    std::iterator_traits<cub::detail::value_t<InputBufferIt>>>::type::value_type;
  /// Types of the input and output buffers
  using InputBufferT  = cub::detail::value_t<InputBufferIt>;
  using OutputBufferT = cub::detail::value_t<OutputBufferIt>;

  constexpr uint32_t BLOCK_THREADS    = ActivePolicyT::BLOCK_THREADS;
  constexpr uint32_t ITEMS_PER_THREAD = ActivePolicyT::BYTES_PER_THREAD;
  constexpr BufferSizeT TILE_SIZE     = static_cast<BufferSizeT>(BLOCK_THREADS * ITEMS_PER_THREAD);

  BufferOffsetT num_blev_buffers = buffer_offset_tile.LoadValid(last_tile_offset);

  uint32_t tile_id = blockIdx.x;

  // No block-level buffers => we're done here
  if (num_blev_buffers == 0)
  {
    return;
  }

  // While there's still tiles of bytes from block-level buffers to copied
  do
  {
    __shared__ BufferOffsetT block_buffer_id;

    // Make sure thread 0 does not overwrite the buffer id before other threads have finished with
    // the prior iteration of the loop
    CTA_SYNC();

    // Binary search the buffer that this tile belongs to
    if (threadIdx.x == 0)
      block_buffer_id = UpperBound(buffer_tile_offsets, num_blev_buffers, tile_id) - 1;

    // Make sure thread 0 has written the buffer this thread block is assigned to
    CTA_SYNC();

    const BufferOffsetT buffer_id = block_buffer_id;

    // The relative offset of this tile within the buffer it's assigned to
    BufferSizeT tile_offset_within_buffer =
      static_cast<BufferSizeT>(tile_id - buffer_tile_offsets[buffer_id]) * TILE_SIZE;

    // If the tile has already reached beyond the work of the end of the last buffer
    if (buffer_id >= num_blev_buffers - 1 && tile_offset_within_buffer > buffer_sizes[buffer_id])
    {
      return;
    }

    // Tiny remainders are copied without vectorizing laods
    if (buffer_sizes[buffer_id] - tile_offset_within_buffer <= 32)
    {
      BufferSizeT thread_offset = tile_offset_within_buffer + threadIdx.x;
      for (int i = 0; i < ITEMS_PER_THREAD; i++)
      {
        if (thread_offset < buffer_sizes[buffer_id])
        {
          const auto value = read_item<IsMemcpy, AliasT, InputBufferT>(input_buffer_it[buffer_id],
                                                                       thread_offset);
          write_item<IsMemcpy, AliasT, OutputBufferT>(output_buffer_it[buffer_id],
                                                      thread_offset,
                                                      value);
        }
        thread_offset += BLOCK_THREADS;
      }
    }
    else
    {
      copy_items<IsMemcpy, BLOCK_THREADS, InputBufferT, OutputBufferT, BufferSizeT>(
        input_buffer_it[buffer_id],
        output_buffer_it[buffer_id],
        (cub::min)(buffer_sizes[buffer_id] - tile_offset_within_buffer, TILE_SIZE),
        tile_offset_within_buffer);
    }

    tile_id += gridDim.x;
  } while (true);
}

/**
 * @brief Kernel that copies data from a batch of given source buffers to their corresponding
 * destination buffer. If a buffer's size is to large to be copied by a single thread block, that
 * buffer is put into a queue of buffers that will get picked up later on, where multiple blocks
 * collaborate on each of these buffers. All other buffers get copied straight away.
 *
 * @param input_buffer_it [in] Iterator providing the pointers to the source memory buffers
 * @param output_buffer_it [in] Iterator providing the pointers to the destination memory buffers
 * @param buffer_sizes [in] Iterator providing the number of bytes to be copied for each pair of
 * buffers
 * @param num_buffers [in] The total number of buffer pairs
 * @param blev_buffer_srcs [out] The source pointers of buffers that require block-level
 * collaboration
 * @param blev_buffer_dsts [out] The destination pointers of buffers that require block-level
 * collaboration
 * @param blev_buffer_sizes [out] The sizes of buffers that require block-level collaboration
 * @param blev_buffer_scan_state [in,out] Tile states for the prefix sum over the count of buffers
 * requiring block-level collaboration (to "stream compact" (aka "select") BLEV-buffers)
 * @param blev_block_scan_state [in,out] Tile states for the prefix sum over the number of thread
 * blocks getting assigned to each buffer that requires block-level collaboration
 */
template <typename ChainedPolicyT,
          typename InputBufferIt,
          typename OutputBufferIt,
          typename BufferSizeIteratorT,
          typename BufferOffsetT,
          typename BlevBufferSrcsOutItT,
          typename BlevBufferDstsOutItT,
          typename BlevBufferSizesOutItT,
          typename BlevBufferTileOffsetsOutItT,
          typename BlockOffsetT,
          typename BLevBufferOffsetTileState,
          typename BLevBlockOffsetTileState,
          bool IsMemcpy>
__launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentSmallBufferPolicyT::BLOCK_THREADS))
  __global__ void BatchMemcpyKernel(InputBufferIt input_buffer_it,
                                    OutputBufferIt output_buffer_it,
                                    BufferSizeIteratorT buffer_sizes,
                                    BufferOffsetT num_buffers,
                                    BlevBufferSrcsOutItT blev_buffer_srcs,
                                    BlevBufferDstsOutItT blev_buffer_dsts,
                                    BlevBufferSizesOutItT blev_buffer_sizes,
                                    BlevBufferTileOffsetsOutItT blev_buffer_tile_offsets,
                                    BLevBufferOffsetTileState blev_buffer_scan_state,
                                    BLevBlockOffsetTileState blev_block_scan_state)
{
  // Internal type used for storing a buffer's size
  using BufferSizeT = cub::detail::value_t<BufferSizeIteratorT>;

  // Alias the correct tuning policy for the current compilation pass' architecture
  using AgentBatchMemcpyPolicyT = typename ChainedPolicyT::ActivePolicy::AgentSmallBufferPolicyT;

  // Block-level specialization
  using AgentBatchMemcpyT = AgentBatchMemcpy<AgentBatchMemcpyPolicyT,
                                             InputBufferIt,
                                             OutputBufferIt,
                                             BufferSizeIteratorT,
                                             BufferOffsetT,
                                             BlevBufferSrcsOutItT,
                                             BlevBufferDstsOutItT,
                                             BlevBufferSizesOutItT,
                                             BlevBufferTileOffsetsOutItT,
                                             BlockOffsetT,
                                             BLevBufferOffsetTileState,
                                             BLevBlockOffsetTileState,
                                             IsMemcpy>;

  // Shared memory for AgentBatchMemcpy
  __shared__ typename AgentBatchMemcpyT::TempStorage temp_storage;

  // Process this block's tile of input&output buffer pairs
  AgentBatchMemcpyT(temp_storage,
                    input_buffer_it,
                    output_buffer_it,
                    buffer_sizes,
                    num_buffers,
                    blev_buffer_srcs,
                    blev_buffer_dsts,
                    blev_buffer_sizes,
                    blev_buffer_tile_offsets,
                    blev_buffer_scan_state,
                    blev_block_scan_state)
    .ConsumeTile(blockIdx.x);
}

struct DeviceBatchMemcpyPolicy
{
  static constexpr uint32_t BLOCK_THREADS         = 128U;
  static constexpr uint32_t BUFFERS_PER_THREAD    = 4U;
  static constexpr uint32_t TLEV_BYTES_PER_THREAD = 8U;

  static constexpr uint32_t LARGE_BUFFER_BLOCK_THREADS    = 256U;
  static constexpr uint32_t LARGE_BUFFER_BYTES_PER_THREAD = 32U;

  /// SM35
  struct Policy350 : ChainedPolicy<350, Policy350, Policy350>
  {
    static constexpr bool PREFER_POW2_BITS = true;
    using AgentSmallBufferPolicyT =
      AgentBatchMemcpyPolicy<BLOCK_THREADS,
                             BUFFERS_PER_THREAD,
                             TLEV_BYTES_PER_THREAD,
                             PREFER_POW2_BITS,
                             LARGE_BUFFER_BLOCK_THREADS * LARGE_BUFFER_BYTES_PER_THREAD>;

    using AgentLargeBufferPolicyT =
      AgentBatchMemcpyLargeBuffersPolicy<LARGE_BUFFER_BLOCK_THREADS, LARGE_BUFFER_BYTES_PER_THREAD>;
  };

  /// SM70
  struct Policy700 : ChainedPolicy<700, Policy700, Policy350>
  {
    static constexpr bool PREFER_POW2_BITS = false;
    using AgentSmallBufferPolicyT =
      AgentBatchMemcpyPolicy<BLOCK_THREADS,
                             BUFFERS_PER_THREAD,
                             TLEV_BYTES_PER_THREAD,
                             PREFER_POW2_BITS,
                             LARGE_BUFFER_BLOCK_THREADS * LARGE_BUFFER_BYTES_PER_THREAD>;

    using AgentLargeBufferPolicyT =
      AgentBatchMemcpyLargeBuffersPolicy<LARGE_BUFFER_BLOCK_THREADS, LARGE_BUFFER_BYTES_PER_THREAD>;
  };

  using MaxPolicy = Policy700;
};

/**
 * @tparam InputBufferIt <b>[inferred]</b> Random-access input iterator type providing the pointers
 * to the source memory buffers
 * @tparam OutputBufferIt <b>[inferred]</b> Random-access input iterator type providing the pointers
 * to the destination memory buffers
 * @tparam BufferSizeIteratorT <b>[inferred]</b> Random-access input iterator type providing the
 * number of bytes to be copied for each pair of buffers
 * @tparam BufferOffsetT Integer type large enough to hold any offset in [0, num_buffers)
 * @tparam BlockOffsetT Integer type large enough to hold any offset in [0,
 * num_thread_blocks_launched)
 */
template <typename InputBufferIt,
          typename OutputBufferIt,
          typename BufferSizeIteratorT,
          typename BufferOffsetT,
          typename BlockOffsetT,
          typename SelectedPolicy = DeviceBatchMemcpyPolicy,
          bool IsMemcpy           = true>
struct DispatchBatchMemcpy : SelectedPolicy
{
  //------------------------------------------------------------------------------
  // TYPE ALIASES
  //------------------------------------------------------------------------------
  // Tile state for the single-pass prefix scan to "stream compact" (aka "select") the buffers
  // requiring block-level collaboration
  using BufferPartitionScanTileStateT = typename cub::ScanTileState<BufferOffsetT>;

  // Tile state for the single-pass prefix scan to keep track of how many blocks are assigned to
  // each of the buffers requiring block-level collaboration
  using BufferTileOffsetScanStateT = typename cub::ScanTileState<BlockOffsetT>;

  // Internal type used to keep track of a buffer's size
  using BufferSizeT = cub::detail::value_t<BufferSizeIteratorT>;

  //------------------------------------------------------------------------------
  // Member Veriables
  //------------------------------------------------------------------------------
  void *d_temp_storage;
  size_t &temp_storage_bytes;
  InputBufferIt input_buffer_it;
  OutputBufferIt output_buffer_it;
  BufferSizeIteratorT buffer_sizes;
  BufferOffsetT num_buffers;
  cudaStream_t stream;

  //------------------------------------------------------------------------------
  // Constructor
  //------------------------------------------------------------------------------
  CUB_RUNTIME_FUNCTION __forceinline__ DispatchBatchMemcpy(void *d_temp_storage,
                                                           size_t &temp_storage_bytes,
                                                           InputBufferIt input_buffer_it,
                                                           OutputBufferIt output_buffer_it,
                                                           BufferSizeIteratorT buffer_sizes,
                                                           BufferOffsetT num_buffers,
                                                           cudaStream_t stream)
      : d_temp_storage(d_temp_storage)
      , temp_storage_bytes(temp_storage_bytes)
      , input_buffer_it(input_buffer_it)
      , output_buffer_it(output_buffer_it)
      , buffer_sizes(buffer_sizes)
      , num_buffers(num_buffers)
      , stream(stream)
  {}

  //------------------------------------------------------------------------------
  // Chained policy invocation
  //------------------------------------------------------------------------------
  /**
   * @brief Tuning policy invocation. This member function template is getting instantiated for all
   * tuning policies in the tuning policy chain. It is, however, *invoked* for the correct tuning
   * policy only.
   */
  template <typename ActivePolicyT>
  CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Invoke()
  {
    using MaxPolicyT = typename DispatchBatchMemcpy::MaxPolicy;

    // Single-pass prefix scan tile states for the prefix-sum over the number of block-level buffers
    using BLevBufferOffsetTileState = cub::ScanTileState<BufferOffsetT>;

    // Single-pass prefix scan tile states for the prefix sum over the number of thread blocks
    // assigned to each of the block-level buffers
    using BLevBlockOffsetTileState = cub::ScanTileState<BlockOffsetT>;

    cudaError error = cudaSuccess;

    enum : uint32_t
    {
      // Memory for the source pointers of the buffers that require block-level collaboration
      MEM_BLEV_BUFFER_SRCS = 0,
      // Memory for the destination pointers of the buffers that require block-level collaboration
      MEM_BLEV_BUFFER_DSTS,
      // Memory for the block-level buffers' sizes
      MEM_BLEV_BUFFER_SIZES,
      // Memory to keep track of the assignment of thread blocks to block-level buffers
      MEM_BLEV_BUFFER_TBLOCK,
      // Memory for the tile states of the prefix sum over the number of buffers that require
      // block-level collaboration
      MEM_BLEV_BUFFER_SCAN_STATE,
      // Memory for the scan tile states of the prefix sum over the number of thread block's
      // assigned up to and including a certain block-level buffer
      MEM_BLEV_BLOCK_SCAN_STATE,
      // Total number of distinct memory allocations in the temporary storage memory BLOB
      MEM_NUM_ALLOCATIONS
    };

    // Number of threads per block for initializing the grid states
    constexpr BlockOffsetT INIT_KERNEL_THREADS = 128U;

    // The number of buffers that get processed per thread block
    constexpr uint32_t TILE_SIZE = ActivePolicyT::AgentSmallBufferPolicyT::BLOCK_THREADS *
                                   ActivePolicyT::AgentSmallBufferPolicyT::BUFFERS_PER_THREAD;

    // The number of thread blocks (or tiles) required to process all of the given buffers
    BlockOffsetT num_tiles = DivideAndRoundUp(num_buffers, TILE_SIZE);

    using BlevBufferSrcsOutT =
      cub::detail::conditional_t<IsMemcpy, void *, cub::detail::value_t<InputBufferIt>>;
    using BlevBufferDstOutT =
      cub::detail::conditional_t<IsMemcpy, void *, cub::detail::value_t<OutputBufferIt>>;
    using BlevBufferSrcsOutItT        = BlevBufferSrcsOutT *;
    using BlevBufferDstsOutItT        = BlevBufferDstOutT *;
    using BlevBufferSizesOutItT       = BufferSizeT *;
    using BlevBufferTileOffsetsOutItT = BlockOffsetT *;

    temporary_storage::layout<MEM_NUM_ALLOCATIONS> temporary_storage_layout;

    auto blev_buffer_srcs_slot  = temporary_storage_layout.get_slot(MEM_BLEV_BUFFER_SRCS);
    auto blev_buffer_dsts_slot  = temporary_storage_layout.get_slot(MEM_BLEV_BUFFER_DSTS);
    auto blev_buffer_sizes_slot = temporary_storage_layout.get_slot(MEM_BLEV_BUFFER_SIZES);
    auto blev_buffer_block_slot = temporary_storage_layout.get_slot(MEM_BLEV_BUFFER_TBLOCK);
    auto blev_buffer_scan_slot  = temporary_storage_layout.get_slot(MEM_BLEV_BUFFER_SCAN_STATE);
    auto blev_buffer_block_scan_slot = temporary_storage_layout.get_slot(MEM_BLEV_BLOCK_SCAN_STATE);

    auto blev_buffer_srcs_alloc =
      blev_buffer_srcs_slot->template create_alias<BlevBufferSrcsOutT>();
    auto blev_buffer_dsts_alloc = blev_buffer_dsts_slot->template create_alias<BlevBufferDstOutT>();
    auto blev_buffer_sizes_alloc = blev_buffer_sizes_slot->template create_alias<BufferSizeT>();
    auto blev_buffer_block_alloc = blev_buffer_block_slot->template create_alias<BlockOffsetT>();
    auto blev_buffer_scan_alloc  = blev_buffer_scan_slot->template create_alias<uint8_t>();
    auto blev_block_scan_alloc   = blev_buffer_block_scan_slot->template create_alias<uint8_t>();

    std::size_t buffer_offset_scan_storage = 0;
    std::size_t blev_block_scan_storage    = 0;
    CubDebug(error = BLevBufferOffsetTileState::AllocationSize(static_cast<int32_t>(num_tiles),
                                                               buffer_offset_scan_storage));
    if (error)
    {
      return error;
    }
    CubDebug(error = BLevBlockOffsetTileState::AllocationSize(static_cast<int32_t>(num_tiles),
                                                              blev_block_scan_storage));
    if (error)
    {
      return error;
    }

    blev_buffer_srcs_alloc.grow(num_buffers);
    blev_buffer_dsts_alloc.grow(num_buffers);
    blev_buffer_sizes_alloc.grow(num_buffers);
    blev_buffer_block_alloc.grow(num_buffers);
    blev_buffer_scan_alloc.grow(buffer_offset_scan_storage);
    blev_block_scan_alloc.grow(blev_block_scan_storage);

    // Just return if no temporary storage is provided
    if (d_temp_storage == nullptr)
    {
      temp_storage_bytes = temporary_storage_layout.get_size();
      return error;
    }

    // Return if empty problem
    if (num_buffers == 0)
    {
      return error;
    }

    // Alias memory buffers into the storage blob
    if (CubDebug(
          error = temporary_storage_layout.map_to_buffer(d_temp_storage, temp_storage_bytes)))
    {
      return error;
    }

    // Alias into temporary storage allocation
    BlevBufferSrcsOutItT d_blev_src_buffers          = blev_buffer_srcs_alloc.get();
    BlevBufferDstsOutItT d_blev_dst_buffers          = blev_buffer_dsts_alloc.get();
    BlevBufferSizesOutItT d_blev_buffer_sizes        = blev_buffer_sizes_alloc.get();
    BlevBufferTileOffsetsOutItT d_blev_block_offsets = blev_buffer_block_alloc.get();

    // Kernels' grid sizes
    BlockOffsetT init_grid_size         = DivideAndRoundUp(num_tiles, INIT_KERNEL_THREADS);
    BlockOffsetT batch_memcpy_grid_size = num_tiles;

    // Kernels
    auto init_scan_states_kernel =
      InitTileStateKernel<BLevBufferOffsetTileState, BLevBlockOffsetTileState, BlockOffsetT>;
    auto batch_memcpy_non_blev_kernel = BatchMemcpyKernel<MaxPolicyT,
                                                          InputBufferIt,
                                                          OutputBufferIt,
                                                          BufferSizeIteratorT,
                                                          BufferOffsetT,
                                                          BlevBufferSrcsOutItT,
                                                          BlevBufferDstsOutItT,
                                                          BlevBufferSizesOutItT,
                                                          BlevBufferTileOffsetsOutItT,
                                                          BlockOffsetT,
                                                          BLevBufferOffsetTileState,
                                                          BLevBlockOffsetTileState,
                                                          IsMemcpy>;

    auto multi_block_memcpy_kernel = MultiBlockBatchMemcpyKernel<MaxPolicyT,
                                                                 BufferOffsetT,
                                                                 BlevBufferSrcsOutItT,
                                                                 BlevBufferDstsOutItT,
                                                                 BlevBufferSizesOutItT,
                                                                 BlevBufferTileOffsetsOutItT,
                                                                 BLevBufferOffsetTileState,
                                                                 BlockOffsetT,
                                                                 IsMemcpy>;

    constexpr uint32_t BLEV_BLOCK_THREADS = ActivePolicyT::AgentLargeBufferPolicyT::BLOCK_THREADS;

    // Get device ordinal
    int device_ordinal;
    if (CubDebug(error = cudaGetDevice(&device_ordinal)))
    {
      return error;
    }

    // Get SM count
    int sm_count;
    if (CubDebug(error = cudaDeviceGetAttribute(&sm_count,
                                                cudaDevAttrMultiProcessorCount,
                                                device_ordinal)))
    {
      return error;
    }

    // Get SM occupancy for the batch memcpy block-level buffers kernel
    int batch_memcpy_blev_occupancy;
    if (CubDebug(error = MaxSmOccupancy(batch_memcpy_blev_occupancy,
                                        multi_block_memcpy_kernel,
                                        BLEV_BLOCK_THREADS)))
    {
      return error;
    }

    int batch_memcpy_blev_grid_size =
      static_cast<int>(sm_count * batch_memcpy_blev_occupancy * CUB_SUBSCRIPTION_FACTOR(0));

    // Construct the tile status for the buffer prefix sum
    BLevBufferOffsetTileState buffer_scan_tile_state;
    if (CubDebug(error = buffer_scan_tile_state.Init(static_cast<int32_t>(num_tiles),
                                                     blev_buffer_scan_alloc.get(),
                                                     buffer_offset_scan_storage)))
    {
      return error;
    }

    // Construct the tile status for thread blocks-to-buffer-assignment prefix sum
    BLevBlockOffsetTileState block_scan_tile_state;
    if (CubDebug(error = block_scan_tile_state.Init(static_cast<int32_t>(num_tiles),
                                                    blev_block_scan_alloc.get(),
                                                    blev_block_scan_storage)))
    {
      return error;
    }

#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
    _CubLog("Invoking "
            "InitTileStateKernel<<<%d, %d, 0, %lld>>>()\n",
            static_cast<int>(init_grid_size),
            INIT_KERNEL_THREADS,
            (long long)stream);
#endif

    // Invoke init_kernel to initialize buffer prefix sum-tile descriptors
    error =
      THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size,
                                                              INIT_KERNEL_THREADS,
                                                              0,
                                                              stream)
        .doit(init_scan_states_kernel, buffer_scan_tile_state, block_scan_tile_state, num_tiles);

    // Check for failure to launch
    if (CubDebug(error))
    {
      return error;
    }

    // Sync the stream if specified to flush runtime errors
    error = detail::DebugSyncStream(stream);

    // Check for failure to launch
    if (CubDebug(error))
    {
      return error;
    }

#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
    _CubLog("Invoking "
            "BatchMemcpyKernel<<<%d, %d, 0, %lld>>>()\n",
            static_cast<int>(batch_memcpy_grid_size),
            ActivePolicyT::AgentSmallBufferPolicyT::BLOCK_THREADS,
            (long long)stream);
#endif

    // Invoke kernel to copy small buffers and put the larger ones into a queue that will get picked
    // up by next kernel
    error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
              batch_memcpy_grid_size,
              ActivePolicyT::AgentSmallBufferPolicyT::BLOCK_THREADS,
              0,
              stream)
              .doit(batch_memcpy_non_blev_kernel,
                    input_buffer_it,
                    output_buffer_it,
                    buffer_sizes,
                    num_buffers,
                    d_blev_src_buffers,
                    d_blev_dst_buffers,
                    d_blev_buffer_sizes,
                    d_blev_block_offsets,
                    buffer_scan_tile_state,
                    block_scan_tile_state);

    // Check for failure to launch
    if (CubDebug(error))
    {
      return error;
    }

    // Sync the stream if specified to flush runtime errors
    error = detail::DebugSyncStream(stream);
    if (CubDebug(error))
    {
      return error;
    }

#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
    _CubLog("Invoking "
            "MultiBlockBatchMemcpyKernel<<<%d, %d, 0, %lld>>>()\n",
            static_cast<int>(batch_memcpy_blev_grid_size),
            BLEV_BLOCK_THREADS,
            (long long)stream);
#endif

    error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(batch_memcpy_blev_grid_size,
                                                                    BLEV_BLOCK_THREADS,
                                                                    0,
                                                                    stream)
              .doit(multi_block_memcpy_kernel,
                    d_blev_src_buffers,
                    d_blev_dst_buffers,
                    d_blev_buffer_sizes,
                    d_blev_block_offsets,
                    buffer_scan_tile_state,
                    batch_memcpy_grid_size - 1);

    // Check for failure to launch
    if (CubDebug(error))
    {
      return error;
    }

    // Sync the stream if specified to flush runtime errors
    error = detail::DebugSyncStream(stream);

    return error;
  }

  //------------------------------------------------------------------------------
  // Dispatch entrypoints
  //------------------------------------------------------------------------------
  /**
   * Internal dispatch routine
   */
  CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch(void *d_temp_storage,
                                                                   size_t &temp_storage_bytes,
                                                                   InputBufferIt input_buffer_it,
                                                                   OutputBufferIt output_buffer_it,
                                                                   BufferSizeIteratorT buffer_sizes,
                                                                   BufferOffsetT num_buffers,
                                                                   cudaStream_t stream)
  {
    using MaxPolicyT = typename DispatchBatchMemcpy::MaxPolicy;

    cudaError_t error = cudaSuccess;

    // Get PTX version
    int ptx_version = 0;
    if (CubDebug(error = PtxVersion(ptx_version)))
    {
      return error;
    }

    // Create dispatch functor
    DispatchBatchMemcpy dispatch(d_temp_storage,
                                 temp_storage_bytes,
                                 input_buffer_it,
                                 output_buffer_it,
                                 buffer_sizes,
                                 num_buffers,
                                 stream);

    // Dispatch to chained policy
    if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch)))
    {
      return error;
    }
    return error;
  }
};

} // namespace detail

CUB_NAMESPACE_END