File size: 25,942 Bytes

0dc1b04


/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2022, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
 * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * @file cub::DeviceScan provides device-wide, parallel operations for
 *       computing a prefix scan across a sequence of data items residing
 *       within device-accessible memory.
 */

#pragma once

#include <iterator>

#include <cub/agent/agent_scan.cuh>
#include <cub/config.cuh>
#include <cub/grid/grid_queue.cuh>
#include <cub/thread/thread_operators.cuh>
#include <cub/util_debug.cuh>
#include <cub/util_deprecated.cuh>
#include <cub/util_device.cuh>
#include <cub/util_math.cuh>

#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

CUB_NAMESPACE_BEGIN

/******************************************************************************
 * Kernel entry points
 *****************************************************************************/

/**
 * @brief Initialization kernel for tile status initialization (multi-block)
 *
 * @tparam ScanTileStateT
 *   Tile status interface type
 *
 * @param[in] tile_state
 *   Tile status interface
 *
 * @param[in] num_tiles
 *   Number of tiles
 */
template <typename ScanTileStateT>
__global__ void DeviceScanInitKernel(ScanTileStateT tile_state, int num_tiles)
{
  // Initialize tile status
  tile_state.InitializeStatus(num_tiles);
}

/**
 * Initialization kernel for tile status initialization (multi-block)
 *
 * @tparam ScanTileStateT
 *   Tile status interface type
 *
 * @tparam NumSelectedIteratorT
 *   Output iterator type for recording the number of items selected
 *
 * @param[in] tile_state
 *   Tile status interface
 *
 * @param[in] num_tiles
 *   Number of tiles
 *
 * @param[out] d_num_selected_out
 *   Pointer to the total number of items selected
 *   (i.e., length of `d_selected_out`)
 */
template <typename ScanTileStateT, typename NumSelectedIteratorT>
__global__ void DeviceCompactInitKernel(ScanTileStateT tile_state,
                                        int num_tiles,
                                        NumSelectedIteratorT d_num_selected_out)
{
  // Initialize tile status
  tile_state.InitializeStatus(num_tiles);

  // Initialize d_num_selected_out
  if ((blockIdx.x == 0) && (threadIdx.x == 0))
  {
    *d_num_selected_out = 0;
  }
}

/**
 * @brief Scan kernel entry point (multi-block)
 *
 *
 * @tparam ChainedPolicyT
 *   Chained tuning policy
 *
 * @tparam InputIteratorT
 *   Random-access input iterator type for reading scan inputs \iterator
 *
 * @tparam OutputIteratorT
 *   Random-access output iterator type for writing scan outputs \iterator
 *
 * @tparam ScanTileStateT
 *   Tile status interface type
 *
 * @tparam ScanOpT
 *   Binary scan functor type having member
 *   `auto operator()(const T &a, const U &b)`
 *
 * @tparam InitValueT
 *   Initial value to seed the exclusive scan
 *   (cub::NullType for inclusive scans)
 *
 * @tparam OffsetT
 *   Signed integer type for global offsets
 *
 * @paramInput d_in
 *   data
 *
 * @paramOutput d_out
 *   data
 *
 * @paramTile tile_state
 *   status interface
 *
 * @paramThe start_tile
 *   starting tile for the current grid
 *
 * @paramBinary scan_op
 *   scan functor
 *
 * @paramInitial init_value
 *   value to seed the exclusive scan
 *
 * @paramTotal num_items
 *   number of scan items for the entire problem
 */
template <typename ChainedPolicyT,
          typename InputIteratorT,
          typename OutputIteratorT,
          typename ScanTileStateT,
          typename ScanOpT,
          typename InitValueT,
          typename OffsetT,
          typename AccumT>
__launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS))
  __global__ void DeviceScanKernel(InputIteratorT d_in,
                                   OutputIteratorT d_out,
                                   ScanTileStateT tile_state,
                                   int start_tile,
                                   ScanOpT scan_op,
                                   InitValueT init_value,
                                   OffsetT num_items)
{
  using RealInitValueT = typename InitValueT::value_type;
  typedef typename ChainedPolicyT::ActivePolicy::ScanPolicyT ScanPolicyT;

  // Thread block type for scanning input tiles
  typedef AgentScan<ScanPolicyT,
                    InputIteratorT,
                    OutputIteratorT,
                    ScanOpT,
                    RealInitValueT,
                    OffsetT,
                    AccumT>
    AgentScanT;

  // Shared memory for AgentScan
  __shared__ typename AgentScanT::TempStorage temp_storage;

  RealInitValueT real_init_value = init_value;

  // Process tiles
  AgentScanT(temp_storage, d_in, d_out, scan_op, real_init_value)
    .ConsumeRange(num_items, tile_state, start_tile);
}

/******************************************************************************
 * Policy
 ******************************************************************************/

namespace detail
{
namespace scan
{

template <int Threads, int Items, int L2B, int L2W>
struct tuning
{
  static constexpr int threads = Threads;
  static constexpr int items   = Items;

  using delay_constructor = detail::fixed_delay_constructor_t<L2B, L2W>;
};

template <class AccumT,
          bool PrimitiveOp,
          bool PrimitiveAccumulator = Traits<AccumT>::PRIMITIVE,
          std::size_t AccumSize     = sizeof(AccumT)>
struct sm90_tuning
{
  static constexpr int threads = 128;
  static constexpr int items   = 15;

  using delay_constructor = detail::default_delay_constructor_t<AccumT>;
};

// clang-format off
template <class T> struct sm90_tuning<T, true, true, 1> : tuning<192, 22, 168, 1140> {};
template <class T> struct sm90_tuning<T, true, true, 2> : tuning<512, 12, 376, 1125> {};
template <class T> struct sm90_tuning<T, true, true, 4> : tuning<128, 24, 648, 1245> {};
template <class T> struct sm90_tuning<T, true, true, 8> : tuning<224, 24, 632, 1290> {};

template <> struct sm90_tuning<float,  true, true,  sizeof(float)> : tuning<128, 24, 688, 1140> {};
template <> struct sm90_tuning<double, true, true, sizeof(double)> : tuning<224, 24, 576, 1215> {};

#if CUB_IS_INT128_ENABLED 
template <> struct sm90_tuning< __int128_t, true, false,  sizeof(__int128_t)> : tuning<576, 21, 860, 630> {};
template <> struct sm90_tuning<__uint128_t, true, false, sizeof(__uint128_t)> : tuning<576, 21, 860, 630> {};
#endif
// clang-format on

} // namespace scan
} // namespace detail

template <typename AccumT, typename ScanOpT = Sum> 
struct DeviceScanPolicy
{
  // For large values, use timesliced loads/stores to fit shared memory.
  static constexpr bool LargeValues = sizeof(AccumT) > 128;
  static constexpr BlockLoadAlgorithm ScanTransposedLoad =
    LargeValues ? BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED
                : BLOCK_LOAD_WARP_TRANSPOSE;
  static constexpr BlockStoreAlgorithm ScanTransposedStore =
    LargeValues ? BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED
                : BLOCK_STORE_WARP_TRANSPOSE;

  template <int NOMINAL_BLOCK_THREADS_4B,
            int NOMINAL_ITEMS_PER_THREAD_4B,
            typename ComputeT,
            BlockLoadAlgorithm LOAD_ALGORITHM,
            CacheLoadModifier LOAD_MODIFIER,
            BlockStoreAlgorithm STORE_ALGORITHM,
            BlockScanAlgorithm SCAN_ALGORITHM,
            typename DelayConstructorT>
  using policy_t =
    AgentScanPolicy<NOMINAL_BLOCK_THREADS_4B,
                    NOMINAL_ITEMS_PER_THREAD_4B,
                    ComputeT,
                    LOAD_ALGORITHM,
                    LOAD_MODIFIER,
                    STORE_ALGORITHM,
                    SCAN_ALGORITHM,
                    MemBoundScaling<NOMINAL_BLOCK_THREADS_4B, NOMINAL_ITEMS_PER_THREAD_4B, ComputeT>,
                    DelayConstructorT>;

  /// SM350
  struct Policy350 : ChainedPolicy<350, Policy350, Policy350>
  {
    // GTX Titan: 29.5B items/s (232.4 GB/s) @ 48M 32-bit T
    using ScanPolicyT = policy_t<128,
                                 12, ///< Threads per block, items per thread
                                 AccumT,
                                 BLOCK_LOAD_DIRECT,
                                 LOAD_CA,
                                 BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED,
                                 BLOCK_SCAN_RAKING,
                                 detail::default_delay_constructor_t<AccumT>>;
  };

  /// SM520
  struct Policy520 : ChainedPolicy<520, Policy520, Policy350>
  {
    // Titan X: 32.47B items/s @ 48M 32-bit T
    using ScanPolicyT = policy_t<128,
                                 12, ///< Threads per block, items per thread
                                 AccumT,
                                 BLOCK_LOAD_DIRECT,
                                 LOAD_CA,
                                 ScanTransposedStore,
                                 BLOCK_SCAN_WARP_SCANS,
                                 detail::default_delay_constructor_t<AccumT>>;
  };

  /// SM600
  struct Policy600 : ChainedPolicy<600, Policy600, Policy520>
  {
    using ScanPolicyT = policy_t<128,
                                 15, ///< Threads per block, items per thread
                                 AccumT,
                                 ScanTransposedLoad,
                                 LOAD_DEFAULT,
                                 ScanTransposedStore,
                                 BLOCK_SCAN_WARP_SCANS,
                                 detail::default_delay_constructor_t<AccumT>>;
  };

  /// SM900
  struct Policy900 : ChainedPolicy<900, Policy900, Policy600>
  {
    using tuning = detail::scan::sm90_tuning<AccumT, detail::basic_binary_op_t<ScanOpT>::value>;

    using ScanPolicyT = policy_t<tuning::threads,
                                 tuning::items,
                                 AccumT,
                                 ScanTransposedLoad,
                                 LOAD_DEFAULT,
                                 ScanTransposedStore,
                                 BLOCK_SCAN_WARP_SCANS,
                                 typename tuning::delay_constructor>;
  };

  using MaxPolicy = Policy900;
};

/******************************************************************************
 * Dispatch
 ******************************************************************************/

/**
 * @brief Utility class for dispatching the appropriately-tuned kernels for
 *        DeviceScan
 *
 * @tparam InputIteratorT
 *   Random-access input iterator type for reading scan inputs \iterator
 *
 * @tparam OutputIteratorT
 *   Random-access output iterator type for writing scan outputs \iterator
 *
 * @tparam ScanOpT
 *   Binary scan functor type having member 
 *   `auto operator()(const T &a, const U &b)`
 *
 * @tparam InitValueT
 *   The init_value element type for ScanOpT (cub::NullType for inclusive scans)
 *
 * @tparam OffsetT
 *   Signed integer type for global offsets
 *
 */
template <typename InputIteratorT,
          typename OutputIteratorT,
          typename ScanOpT,
          typename InitValueT,
          typename OffsetT,
          typename AccumT = 
            detail::accumulator_t<
              ScanOpT, 
              cub::detail::conditional_t<
                std::is_same<InitValueT, NullType>::value,
                cub::detail::value_t<InputIteratorT>,
                typename InitValueT::value_type>,
              cub::detail::value_t<InputIteratorT>>,
          typename SelectedPolicy = DeviceScanPolicy<AccumT, ScanOpT>>
struct DispatchScan : SelectedPolicy
{
  //---------------------------------------------------------------------
  // Constants and Types
  //---------------------------------------------------------------------

  static constexpr int INIT_KERNEL_THREADS = 128;

  // The input value type
  using InputT = cub::detail::value_t<InputIteratorT>;

  /// Device-accessible allocation of temporary storage.  When NULL, the
  /// required allocation size is written to \p temp_storage_bytes and no work
  /// is done.
  void *d_temp_storage;

  /// Reference to size in bytes of \p d_temp_storage allocation
  size_t &temp_storage_bytes;

  /// Iterator to the input sequence of data items
  InputIteratorT d_in;

  /// Iterator to the output sequence of data items
  OutputIteratorT d_out;

  /// Binary scan functor
  ScanOpT scan_op;

  /// Initial value to seed the exclusive scan
  InitValueT init_value;

  /// Total number of input items (i.e., the length of \p d_in)
  OffsetT num_items;

  /// CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
  cudaStream_t stream;

  int ptx_version;

  /**
   *
   * @param[in] d_temp_storage
   *   Device-accessible allocation of temporary storage. When `nullptr`, the
   *   required allocation size is written to `temp_storage_bytes` and no
   *   work is done.
   *
   * @param[in,out] temp_storage_bytes
   *   Reference to size in bytes of `d_temp_storage` allocation
   *
   * @param[in] d_in
   *   Iterator to the input sequence of data items
   *
   * @param[out] d_out
   *   Iterator to the output sequence of data items
   *
   * @param[in] num_items
   *   Total number of input items (i.e., the length of `d_in`)
   *
   * @param[in] scan_op
   *   Binary scan functor
   *
   * @param[in] init_value
   *   Initial value to seed the exclusive scan
   *
   * @param[in] stream
   *   **[optional]** CUDA stream to launch kernels within.
   *   Default is stream<sub>0</sub>.
   */
  CUB_RUNTIME_FUNCTION __forceinline__ DispatchScan(void *d_temp_storage,
                                                    size_t &temp_storage_bytes,
                                                    InputIteratorT d_in,
                                                    OutputIteratorT d_out,
                                                    OffsetT num_items,
                                                    ScanOpT scan_op,
                                                    InitValueT init_value,
                                                    cudaStream_t stream,
                                                    int ptx_version)
      : d_temp_storage(d_temp_storage)
      , temp_storage_bytes(temp_storage_bytes)
      , d_in(d_in)
      , d_out(d_out)
      , scan_op(scan_op)
      , init_value(init_value)
      , num_items(num_items)
      , stream(stream)
      , ptx_version(ptx_version)
  {}

  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
  CUB_RUNTIME_FUNCTION __forceinline__ DispatchScan(void *d_temp_storage,
                                                    size_t &temp_storage_bytes,
                                                    InputIteratorT d_in,
                                                    OutputIteratorT d_out,
                                                    OffsetT num_items,
                                                    ScanOpT scan_op,
                                                    InitValueT init_value,
                                                    cudaStream_t stream,
                                                    bool debug_synchronous,
                                                    int ptx_version)
      : d_temp_storage(d_temp_storage)
      , temp_storage_bytes(temp_storage_bytes)
      , d_in(d_in)
      , d_out(d_out)
      , scan_op(scan_op)
      , init_value(init_value)
      , num_items(num_items)
      , stream(stream)
      , ptx_version(ptx_version)
  {
    CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG
  }

  template <typename ActivePolicyT, typename InitKernel, typename ScanKernel>
  CUB_RUNTIME_FUNCTION __host__ __forceinline__ cudaError_t
  Invoke(InitKernel init_kernel, ScanKernel scan_kernel)
  {
    typedef typename ActivePolicyT::ScanPolicyT Policy;
    typedef typename cub::ScanTileState<AccumT> ScanTileStateT;

    // `LOAD_LDG` makes in-place execution UB and doesn't lead to better
    // performance.
    static_assert(Policy::LOAD_MODIFIER != CacheLoadModifier::LOAD_LDG,
                  "The memory consistency model does not apply to texture "
                  "accesses");

    cudaError error = cudaSuccess;
    do
    {
      // Get device ordinal
      int device_ordinal;
      if (CubDebug(error = cudaGetDevice(&device_ordinal)))
      {
        break;
      }

      // Number of input tiles
      int tile_size = Policy::BLOCK_THREADS * Policy::ITEMS_PER_THREAD;
      int num_tiles =
        static_cast<int>(cub::DivideAndRoundUp(num_items, tile_size));

      // Specify temporary storage allocation requirements
      size_t allocation_sizes[1];
      if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles,
                                                          allocation_sizes[0])))
      {
        break; // bytes needed for tile status descriptors
      }

      // Compute allocation pointers into the single storage blob (or compute
      // the necessary size of the blob)
      void *allocations[1] = {};
      if (CubDebug(error = AliasTemporaries(d_temp_storage,
                                            temp_storage_bytes,
                                            allocations,
                                            allocation_sizes)))
      {
        break;
      }

      if (d_temp_storage == NULL)
      {
        // Return if the caller is simply requesting the size of the storage
        // allocation
        break;
      }

      // Return if empty problem
      if (num_items == 0)
      {
        break;
      }

      // Construct the tile status interface
      ScanTileStateT tile_state;
      if (CubDebug(error = tile_state.Init(num_tiles,
                                           allocations[0],
                                           allocation_sizes[0])))
      {
        break;
      }

      // Log init_kernel configuration
      int init_grid_size = cub::DivideAndRoundUp(num_tiles,
                                                 INIT_KERNEL_THREADS);

      #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
      _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n",
              init_grid_size,
              INIT_KERNEL_THREADS,
              (long long)stream);
      #endif

      // Invoke init_kernel to initialize tile descriptors
      THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
        init_grid_size,
        INIT_KERNEL_THREADS,
        0,
        stream)
        .doit(init_kernel, tile_state, num_tiles);

      // Check for failure to launch
      if (CubDebug(error = cudaPeekAtLastError()))
      {
        break;
      }

      // Sync the stream if specified to flush runtime errors
      error = detail::DebugSyncStream(stream);
      if (CubDebug(error))
      {
        break;
      }

      // Get SM occupancy for scan_kernel
      int scan_sm_occupancy;
      if (CubDebug(error = MaxSmOccupancy(scan_sm_occupancy, // out
                                          scan_kernel,
                                          Policy::BLOCK_THREADS)))
      {
        break;
      }

      // Get max x-dimension of grid
      int max_dim_x;
      if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x,
                                                  cudaDevAttrMaxGridDimX,
                                                  device_ordinal)))
      {
        break;
      }

      // Run grids in epochs (in case number of tiles exceeds max x-dimension
      int scan_grid_size = CUB_MIN(num_tiles, max_dim_x);
      for (int start_tile = 0; start_tile < num_tiles;
           start_tile += scan_grid_size)
      {
        // Log scan_kernel configuration
        #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
          _CubLog("Invoking %d scan_kernel<<<%d, %d, 0, %lld>>>(), %d items "
                  "per thread, %d SM occupancy\n",
                  start_tile,
                  scan_grid_size,
                  Policy::BLOCK_THREADS,
                  (long long)stream,
                  Policy::ITEMS_PER_THREAD,
                  scan_sm_occupancy);
        #endif

        // Invoke scan_kernel
        THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
          scan_grid_size,
          Policy::BLOCK_THREADS,
          0,
          stream)
          .doit(scan_kernel,
                d_in,
                d_out,
                tile_state,
                start_tile,
                scan_op,
                init_value,
                num_items);

        // Check for failure to launch
        if (CubDebug(error = cudaPeekAtLastError()))
        {
          break;
        }

        // Sync the stream if specified to flush runtime errors
        error = detail::DebugSyncStream(stream);
        if (CubDebug(error))
        {
          break;
        }
      }
    } while (0);

    return error;
  }

  template <typename ActivePolicyT>
  CUB_RUNTIME_FUNCTION __host__ __forceinline__ cudaError_t Invoke()
  {
    typedef typename DispatchScan::MaxPolicy MaxPolicyT;
    typedef typename cub::ScanTileState<AccumT> ScanTileStateT;
    // Ensure kernels are instantiated.
    return Invoke<ActivePolicyT>(DeviceScanInitKernel<ScanTileStateT>,
                                 DeviceScanKernel<MaxPolicyT,
                                                  InputIteratorT,
                                                  OutputIteratorT,
                                                  ScanTileStateT,
                                                  ScanOpT,
                                                  InitValueT,
                                                  OffsetT,
                                                  AccumT>);
  }

  /**
   * @brief Internal dispatch routine
   *
   * @param[in] d_temp_storage
   *   Device-accessible allocation of temporary storage. When `nullptr`, the
   *   required allocation size is written to `temp_storage_bytes` and no
   *   work is done.
   *
   * @param[in,out] temp_storage_bytes
   *   Reference to size in bytes of `d_temp_storage` allocation
   *
   * @param[in] d_in
   *   Iterator to the input sequence of data items
   *
   * @param[out] d_out
   *   Iterator to the output sequence of data items
   *
   * @param[in] scan_op
   *   Binary scan functor
   *
   * @param[in] init_value
   *   Initial value to seed the exclusive scan
   *
   * @param[in] num_items
   *   Total number of input items (i.e., the length of `d_in`)
   *
   * @param[in] stream
   *   **[optional]** CUDA stream to launch kernels within.
   *   Default is stream<sub>0</sub>.
   *
   */
  CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t
  Dispatch(void *d_temp_storage,
           size_t &temp_storage_bytes,
           InputIteratorT d_in,
           OutputIteratorT d_out,
           ScanOpT scan_op,
           InitValueT init_value,
           OffsetT num_items,
           cudaStream_t stream)
  {
    typedef typename DispatchScan::MaxPolicy MaxPolicyT;

    cudaError_t error;
    do
    {
      // Get PTX version
      int ptx_version = 0;
      if (CubDebug(error = PtxVersion(ptx_version)))
      {
        break;
      }

      // Create dispatch functor
      DispatchScan dispatch(d_temp_storage,
                            temp_storage_bytes,
                            d_in,
                            d_out,
                            num_items,
                            scan_op,
                            init_value,
                            stream,
                            ptx_version);

      // Dispatch to chained policy
      if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch)))
      {
        break;
      }
    } while (0);

    return error;
  }

  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
  CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t
  Dispatch(void *d_temp_storage,
           size_t &temp_storage_bytes,
           InputIteratorT d_in,
           OutputIteratorT d_out,
           ScanOpT scan_op,
           InitValueT init_value,
           OffsetT num_items,
           cudaStream_t stream,
           bool debug_synchronous)
  {
    CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

    return Dispatch(d_temp_storage,
                    temp_storage_bytes,
                    d_in,
                    d_out,
                    scan_op,
                    init_value,
                    num_items,
                    stream);
  }
};

CUB_NAMESPACE_END