File size: 14,322 Bytes

0dc1b04

/******************************************************************************
 * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#pragma once

#include <cub/agent/agent_adjacent_difference.cuh>
#include <cub/config.cuh>
#include <cub/detail/type_traits.cuh>
#include <cub/util_debug.cuh>
#include <cub/util_deprecated.cuh>
#include <cub/util_device.cuh>
#include <cub/util_math.cuh>
#include <cub/util_namespace.cuh>

#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

#include <iterator>

CUB_NAMESPACE_BEGIN


template <typename AgentDifferenceInitT,
          typename InputIteratorT,
          typename InputT,
          typename OffsetT>
void __global__ DeviceAdjacentDifferenceInitKernel(InputIteratorT first,
                                                   InputT *result,
                                                   OffsetT num_tiles,
                                                   int items_per_tile)
{
  const int tile_idx = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
  AgentDifferenceInitT::Process(tile_idx,
                                first,
                                result,
                                num_tiles,
                                items_per_tile);
}

template <typename ChainedPolicyT,
          typename InputIteratorT,
          typename OutputIteratorT,
          typename DifferenceOpT,
          typename OffsetT,
          typename InputT,
          bool MayAlias,
          bool ReadLeft>
void __global__
DeviceAdjacentDifferenceDifferenceKernel(InputIteratorT input,
                                         InputT *first_tile_previous,
                                         OutputIteratorT result,
                                         DifferenceOpT difference_op,
                                         OffsetT num_items)
{
  using ActivePolicyT = 
    typename ChainedPolicyT::ActivePolicy::AdjacentDifferencePolicy;

  // It is OK to introspect the return type or parameter types of the 
  // `operator()` function of `__device__` extended lambda within device code.
  using OutputT = detail::invoke_result_t<DifferenceOpT, InputT, InputT>;

  using Agent = AgentDifference<ActivePolicyT,
                                InputIteratorT,
                                OutputIteratorT,
                                DifferenceOpT,
                                OffsetT,
                                InputT,
                                OutputT,
                                MayAlias,
                                ReadLeft>;

  __shared__ typename Agent::TempStorage storage;

  Agent agent(storage,
              input,
              first_tile_previous,
              result,
              difference_op,
              num_items);

  int tile_idx = static_cast<int>(blockIdx.x);
  OffsetT tile_base  = static_cast<OffsetT>(tile_idx) 
                     * ActivePolicyT::ITEMS_PER_TILE;

  agent.Process(tile_idx, tile_base);
}

template <typename InputIteratorT, bool MayAlias = true>
struct DeviceAdjacentDifferencePolicy
{
  using ValueT = typename std::iterator_traits<InputIteratorT>::value_type;

  //------------------------------------------------------------------------------
  // Architecture-specific tuning policies
  //------------------------------------------------------------------------------

  struct Policy300 : ChainedPolicy<300, Policy300, Policy300>
  {
    using AdjacentDifferencePolicy =
      AgentAdjacentDifferencePolicy<128,
                                    Nominal8BItemsToItems<ValueT>(7),
                                    BLOCK_LOAD_WARP_TRANSPOSE,
                                    LOAD_DEFAULT,
                                    BLOCK_STORE_WARP_TRANSPOSE>;
  };

  struct Policy350 : ChainedPolicy<350, Policy350, Policy300>
  {
    using AdjacentDifferencePolicy =
      AgentAdjacentDifferencePolicy<128,
                                    Nominal8BItemsToItems<ValueT>(7),
                                    BLOCK_LOAD_WARP_TRANSPOSE,
                                    MayAlias ? LOAD_CA : LOAD_LDG,
                                    BLOCK_STORE_WARP_TRANSPOSE>;
  };

  using MaxPolicy = Policy350;
};

template <typename InputIteratorT,
          typename OutputIteratorT,
          typename DifferenceOpT,
          typename OffsetT,
          bool MayAlias,
          bool ReadLeft,
          typename SelectedPolicy =
            DeviceAdjacentDifferencePolicy<InputIteratorT, MayAlias>>
struct DispatchAdjacentDifference : public SelectedPolicy
{
  using InputT = typename std::iterator_traits<InputIteratorT>::value_type;

  void *d_temp_storage;
  std::size_t &temp_storage_bytes;
  InputIteratorT d_input;
  OutputIteratorT d_output;
  OffsetT num_items;
  DifferenceOpT difference_op;
  cudaStream_t stream;

  CUB_RUNTIME_FUNCTION __forceinline__
  DispatchAdjacentDifference(void *d_temp_storage,
                             std::size_t &temp_storage_bytes,
                             InputIteratorT d_input,
                             OutputIteratorT d_output,
                             OffsetT num_items,
                             DifferenceOpT difference_op,
                             cudaStream_t stream)
      : d_temp_storage(d_temp_storage)
      , temp_storage_bytes(temp_storage_bytes)
      , d_input(d_input)
      , d_output(d_output)
      , num_items(num_items)
      , difference_op(difference_op)
      , stream(stream)
  {}

  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
  CUB_DEPRECATED CUB_RUNTIME_FUNCTION __forceinline__
  DispatchAdjacentDifference(void *d_temp_storage,
                             std::size_t &temp_storage_bytes,
                             InputIteratorT d_input,
                             OutputIteratorT d_output,
                             OffsetT num_items,
                             DifferenceOpT difference_op,
                             cudaStream_t stream,
                             bool debug_synchronous)
      : d_temp_storage(d_temp_storage)
      , temp_storage_bytes(temp_storage_bytes)
      , d_input(d_input)
      , d_output(d_output)
      , num_items(num_items)
      , difference_op(difference_op)
      , stream(stream)
  {
    CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG
  }

  /// Invocation
  template <typename ActivePolicyT>
  CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Invoke()
  {
    using AdjacentDifferencePolicyT =
      typename ActivePolicyT::AdjacentDifferencePolicy;

    using MaxPolicyT = typename DispatchAdjacentDifference::MaxPolicy;

    cudaError error = cudaSuccess;

    do
    {
      const int tile_size = AdjacentDifferencePolicyT::ITEMS_PER_TILE;
      const int num_tiles =
        static_cast<int>(DivideAndRoundUp(num_items, tile_size));

      std::size_t first_tile_previous_size = MayAlias * num_tiles *
                                             sizeof(InputT);

      void *allocations[1]            = {nullptr};
      std::size_t allocation_sizes[1] = {MayAlias * first_tile_previous_size};

      if (CubDebug(error = AliasTemporaries(d_temp_storage,
                                            temp_storage_bytes,
                                            allocations,
                                            allocation_sizes)))
      {
        break;
      }

      if (d_temp_storage == nullptr)
      {
        // Return if the caller is simply requesting the size of the storage
        // allocation

        if (temp_storage_bytes == 0)
        {
          temp_storage_bytes = 1;
        }

        break;
      }

      if (num_items == OffsetT{})
      {
        break;
      }

      auto first_tile_previous = reinterpret_cast<InputT*>(allocations[0]);

      if (MayAlias)
      {
        using AgentDifferenceInitT =
          AgentDifferenceInit<InputIteratorT, InputT, OffsetT, ReadLeft>;

        const int init_block_size = AgentDifferenceInitT::BLOCK_THREADS;
        const int init_grid_size = DivideAndRoundUp(num_tiles, init_block_size);

        #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
        _CubLog("Invoking DeviceAdjacentDifferenceInitKernel"
                "<<<%d, %d, 0, %lld>>>()\n",
                init_grid_size,
                init_block_size,
                reinterpret_cast<long long>(stream));
        #endif

        THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size,
                                                                init_block_size,
                                                                0,
                                                                stream)
          .doit(DeviceAdjacentDifferenceInitKernel<AgentDifferenceInitT,
                                                   InputIteratorT,
                                                   InputT,
                                                   OffsetT>,
                d_input,
                first_tile_previous,
                num_tiles,
                tile_size);

        error = detail::DebugSyncStream(stream);

        if (CubDebug(error))
        {
          break;
        }

        // Check for failure to launch
        if (CubDebug(error = cudaPeekAtLastError()))
        {
          break;
        }
      }

      #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
      _CubLog("Invoking DeviceAdjacentDifferenceDifferenceKernel"
              "<<<%d, %d, 0, %lld>>>()\n",
              num_tiles,
              AdjacentDifferencePolicyT::BLOCK_THREADS,
              reinterpret_cast<long long>(stream));
      #endif

      THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
        num_tiles,
        AdjacentDifferencePolicyT::BLOCK_THREADS,
        0,
        stream)
        .doit(DeviceAdjacentDifferenceDifferenceKernel<MaxPolicyT,
                                                       InputIteratorT,
                                                       OutputIteratorT,
                                                       DifferenceOpT,
                                                       OffsetT,
                                                       InputT,
                                                       MayAlias,
                                                       ReadLeft>,
              d_input,
              first_tile_previous,
              d_output,
              difference_op,
              num_items);

      error = detail::DebugSyncStream(stream);
      
      if (CubDebug(error))
      {
        break;
      }

      // Check for failure to launch
      if (CubDebug(error = cudaPeekAtLastError()))
      {
        break;
      }
    } while (0);

    return error;
  }

  CUB_RUNTIME_FUNCTION
  static cudaError_t Dispatch(void *d_temp_storage,
                              std::size_t &temp_storage_bytes,
                              InputIteratorT d_input,
                              OutputIteratorT d_output,
                              OffsetT num_items,
                              DifferenceOpT difference_op,
                              cudaStream_t stream)
  {
    using MaxPolicyT = typename DispatchAdjacentDifference::MaxPolicy;

    cudaError error = cudaSuccess;
    do
    {
      // Get PTX version
      int ptx_version = 0;
      if (CubDebug(error = PtxVersion(ptx_version)))
      {
        break;
      }

      // Create dispatch functor
      DispatchAdjacentDifference dispatch(d_temp_storage,
                                          temp_storage_bytes,
                                          d_input,
                                          d_output,
                                          num_items,
                                          difference_op,
                                          stream);

      // Dispatch to chained policy
      if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch)))
      {
        break;
      }
    } while (0);

    return error;
  }

  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
  CUB_RUNTIME_FUNCTION
  static cudaError_t Dispatch(void *d_temp_storage,
                              std::size_t &temp_storage_bytes,
                              InputIteratorT d_input,
                              OutputIteratorT d_output,
                              OffsetT num_items,
                              DifferenceOpT difference_op,
                              cudaStream_t stream,
                              bool debug_synchronous)
  {
    CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

    return Dispatch(d_temp_storage,
                    temp_storage_bytes,
                    d_input,
                    d_output,
                    num_items,
                    difference_op,
                    stream);
  }
};


CUB_NAMESPACE_END