// File size: 4,880 Bytes
//
// 8ae5fc5

/******************************************************************************
 * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#include <cub/device/device_scan.cuh>

#include <thrust/device_vector.h>

#include <iostream>

/**
 * @brief Device-side initialization of a tile status descriptor.
 *
 * Every thread of the launch forwards to `tile_state.InitializeStatus`, passing
 * along the number of tiles. How the work is split between threads is decided
 * inside `InitializeStatus`, not here.
 *
 * @tparam ScanTileStateT  Tile status type; it must provide
 *                         `InitializeStatus(int)` callable from device code.
 *
 * @param tile_state       Tile status descriptor, taken by value.
 * @param blocks_in_grid   Number of tiles, handed to `InitializeStatus`
 *                         unchanged.
 */
template <typename ScanTileStateT>
__global__ void init_kernel(ScanTileStateT tile_state, int blocks_in_grid)
{
  tile_state.InitializeStatus(blocks_in_grid);
}

/**
 * @brief Example kernel in which every thread block acts as one tile of a sum
 *        scan and obtains its running prefix through a decoupled look-back.
 *
 * The aggregate of a tile is simply its `blockIdx.x`. Tile 0 has no
 * predecessors, so its thread 0 publishes the inclusive value directly with
 * `SetInclusive`. In every other tile the first warp invokes the
 * `cub::TilePrefixCallbackOp` functor to obtain the exclusive prefix, and
 * thread 0 combines it with the tile aggregate. Thread 0 of each block prints
 * the result with device-side `printf`.
 *
 * @tparam MessageT   Type of the value exchanged between tiles. The `%d`
 *                    conversions used for these values match `int`, which is
 *                    the instantiation requested by `main`.
 *
 * @param tile_state  Tile status descriptor shared by all blocks of the grid.
 *                    NOTE(review): presumably it has to be initialized (see
 *                    `init_kernel`) for at least `gridDim.x` tiles before this
 *                    kernel starts - confirm against the CUB documentation.
 *
 * NOTE(review): the look-back is executed by threads 0..31 only, so the launch
 * presumably needs `blockDim.x >= 32` - confirm against the CUB documentation.
 */
template <class MessageT>
__global__ void decoupled_look_back_kernel(cub::ScanTileState<MessageT> tile_state)
{
  using scan_op_t         = cub::Sum;
  using scan_tile_state_t = cub::ScanTileState<MessageT>;
  using tile_prefix_op    = cub::TilePrefixCallbackOp<MessageT, scan_op_t, scan_tile_state_t>;
  using temp_storage_t    = typename tile_prefix_op::TempStorage;

  // Scratch space required by the prefix functor, placed in shared memory
  __shared__ temp_storage_t temp_storage;

  scan_op_t scan_op{};
  const unsigned int threads_in_warp = 32;
  const unsigned int tid             = threadIdx.x;

  // Construct prefix op; it also tells us which tile this block represents
  tile_prefix_op prefix(tile_state, temp_storage, scan_op);
  const unsigned int tile_idx = prefix.GetTileIdx();

  // The "computed" block aggregate of this example is just the block index
  MessageT block_aggregate = blockIdx.x;

  if (tile_idx == 0)
  {
    // There are no blocks to look back to, immediately set the inclusive state
    if (tid == 0)
    {
      tile_state.SetInclusive(tile_idx, block_aggregate);
      // `tile_idx` is unsigned, hence `%u`
      printf("tile %u: inclusive = %d\n", tile_idx, block_aggregate);
    }
  }
  else
  {
    // Only the first warp in the block can perform the look back
    const unsigned int warp_id = tid / threads_in_warp;

    if (warp_id == 0)
    {
      // Perform the decoupled look-back
      // Invocation of the prefix will block until the look-back is complete.
      MessageT exclusive_prefix = prefix(block_aggregate);

      if (tid == 0)
      {
        // Inclusive value = everything before this tile combined with the tile itself
        MessageT inclusive_prefix = scan_op(exclusive_prefix, block_aggregate);
        printf("tile %u: exclusive = %d inclusive = %d\n",
               tile_idx,
               exclusive_prefix,
               inclusive_prefix);
      }
    }
  }
}

/**
 * @brief Host driver of the decoupled look-back example.
 *
 * Queries and allocates the storage of a `cub::ScanTileState<MessageT>`,
 * initializes it on the device with `init_kernel`, launches
 * `decoupled_look_back_kernel` with one block per tile and waits for the
 * device to finish.
 *
 * Every CUDA/CUB status is checked. On the first failure a message is written
 * to `std::cerr` and the function returns without running the remaining steps.
 *
 * @tparam MessageT       Type of the value exchanged between tiles.
 *
 * @param blocks_in_grid  Number of tiles, i.e. thread blocks of the look-back
 *                        kernel. Values `<= 0` make the function return
 *                        immediately, because there is no grid to launch.
 */
template <class MessageT>
void decoupled_look_back_example(int blocks_in_grid)
{
  using scan_tile_state_t = cub::ScanTileState<MessageT>;

  // Without tiles there is nothing to launch (and an empty grid is not launchable)
  if (blocks_in_grid <= 0)
  {
    return;
  }

  // Reports a failed step on stderr; yields true only for cudaSuccess
  const auto succeeded = [](cudaError_t status, const char *what) -> bool {
    if (status != cudaSuccess)
    {
      std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
      return false;
    }
    return true;
  };

  // Query temporary storage requirements
  std::size_t temp_storage_bytes{};
  if (!succeeded(scan_tile_state_t::AllocationSize(blocks_in_grid, temp_storage_bytes),
                 "ScanTileState::AllocationSize"))
  {
    return;
  }

  // Allocate temporary storage (released automatically by the vector)
  thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
  std::uint8_t *d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

  // Bind the descriptor to the storage, then initialize the storage on the device
  scan_tile_state_t tile_status;
  if (!succeeded(tile_status.Init(blocks_in_grid, d_temp_storage, temp_storage_bytes),
                 "ScanTileState::Init"))
  {
    return;
  }

  const unsigned int threads_in_init_block = 256;
  const unsigned int blocks_in_init_grid   = cub::DivideAndRoundUp(blocks_in_grid,
                                                                 threads_in_init_block);
  init_kernel<<<blocks_in_init_grid, threads_in_init_block>>>(tile_status, blocks_in_grid);

  // Kernel launches return nothing; configuration errors are fetched explicitly
  if (!succeeded(cudaGetLastError(), "init_kernel launch"))
  {
    return;
  }

  // Launch decoupled look-back, one block per tile
  const unsigned int threads_in_block = 256;
  decoupled_look_back_kernel<<<blocks_in_grid, threads_in_block>>>(tile_status);
  if (!succeeded(cudaGetLastError(), "decoupled_look_back_kernel launch"))
  {
    return;
  }

  // Wait for the kernels to finish; errors raised during execution show up here
  if (!succeeded(cudaDeviceSynchronize(), "cudaDeviceSynchronize"))
  {
    return;
  }
}

/// Program entry: runs the example with `int` messages on a grid of 14 blocks.
int main()
{
  constexpr int blocks_in_grid = 14;

  decoupled_look_back_example<int>(blocks_in_grid);

  return 0;
}