/******************************************************************************
* Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#include <cub/device/device_scan.cuh>
#include <thrust/device_vector.h>
#include <iostream>
/**
 * @brief Prepares the tile-state descriptors before the look-back kernel runs.
 *
 * The kernel simply forwards the tile count to
 * `ScanTileStateT::InitializeStatus`; all of the actual work happens there.
 * NOTE(review): the host side launches roughly one thread per tile, which
 * presumably matches how `InitializeStatus` distributes the descriptors over
 * threads - confirm against the CUB documentation.
 *
 * @tparam ScanTileStateT Tile-state type exposing `InitializeStatus(int)`
 * @param tile_state      Tile-state object bound to device storage
 * @param blocks_in_grid  Number of tiles whose status has to be initialized
 */
template <typename ScanTileStateT>
__global__ void init_kernel(ScanTileStateT tile_state, int blocks_in_grid)
{
  // One tile per thread block of the look-back kernel
  const int tile_count = blocks_in_grid;

  tile_state.InitializeStatus(tile_count);
}
/**
 * @brief Demonstrates a decoupled look-back across thread blocks (tiles).
 *
 * Every block contributes its own block index as the tile aggregate. Tile 0
 * has no predecessors and publishes its inclusive value directly through
 * `tile_state.SetInclusive`. In every other tile the first 32 threads invoke
 * the `cub::TilePrefixCallbackOp` functor to obtain the exclusive prefix, and
 * thread 0 prints it together with the inclusive prefix.
 *
 * Launch layout: 1D grid with one block per tile. Threads outside the first
 * 32 of a block do no work. Shared memory: one statically allocated
 * `TilePrefixCallbackOp::TempStorage` per block.
 *
 * @tparam MessageT  Value type combined with `cub::Sum`. The format strings
 *                   below print it with `%d`.
 * @param tile_state Tile-state object already bound to device storage.
 *                   NOTE(review): presumably expected to have been initialized
 *                   by `init_kernel` beforehand - confirm.
 */
template <class MessageT>
__global__ void decoupled_look_back_kernel(cub::ScanTileState<MessageT> tile_state)
{
  using sum_op_t          = cub::Sum;
  using tile_state_t      = cub::ScanTileState<MessageT>;
  using look_back_op_t    = cub::TilePrefixCallbackOp<MessageT, sum_op_t, tile_state_t>;
  using look_back_store_t = typename look_back_op_t::TempStorage;

  constexpr unsigned int warp_threads = 32;

  // Scratch space required by the prefix callback, shared by the block
  __shared__ look_back_store_t look_back_storage;

  sum_op_t sum_op{};

  // The callback derives the tile index on its own when none is passed in
  look_back_op_t look_back(tile_state, look_back_storage, sum_op);

  const unsigned int thread_rank = threadIdx.x;
  const unsigned int tile        = look_back.GetTileIdx();

  // The value this tile contributes to the running sum is its block index
  MessageT tile_aggregate = blockIdx.x;

  // First tile: nothing precedes it, so the inclusive value is known right away
  if (tile == 0)
  {
    if (thread_rank == 0)
    {
      tile_state.SetInclusive(tile, tile_aggregate);
      printf("tile %d: inclusive = %d\n", tile, tile_aggregate);
    }
    return;
  }

  // Only the first warp of the block takes part in the look-back
  if (thread_rank / warp_threads != 0)
  {
    return;
  }

  // The call blocks until the look-back over the preceding tiles has finished
  MessageT exclusive = look_back(tile_aggregate);

  // A single thread reports the result for the whole tile
  if (thread_rank != 0)
  {
    return;
  }

  MessageT inclusive = sum_op(exclusive, tile_aggregate);
  printf("tile %d: exclusive = %d inclusive = %d\n", tile, exclusive, inclusive);
}
/**
 * @brief Host-side driver of the decoupled look-back demonstration.
 *
 * Queries the amount of storage `cub::ScanTileState<MessageT>` needs for
 * `blocks_in_grid` tiles, allocates it in a `thrust::device_vector`, binds the
 * tile state to that storage, launches `init_kernel` to initialize the tile
 * descriptors and finally launches `decoupled_look_back_kernel` with one
 * thread block per tile, waiting for it to complete.
 *
 * Every CUDA status (CUB calls, both kernel launches and the final
 * synchronization) is checked. On the first failure a diagnostic is written to
 * `std::cerr` and the function returns without running the remaining steps.
 *
 * @tparam MessageT       Value type carried through the look-back
 * @param blocks_in_grid  Number of thread blocks (tiles) to launch. A
 *                        non-positive value makes the function a no-op.
 */
template <class MessageT>
void decoupled_look_back_example(int blocks_in_grid)
{
  using scan_tile_state_t = cub::ScanTileState<MessageT>;

  // A grid without blocks is not a valid launch configuration: nothing to do
  if (blocks_in_grid <= 0)
  {
    return;
  }

  // Reports a failed CUDA call on stderr; yields true when `status` is an error
  const auto failed = [](cudaError_t status, const char *what) -> bool {
    if (status == cudaSuccess)
    {
      return false;
    }
    std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
    return true;
  };

  // Query temporary storage requirements
  std::size_t temp_storage_bytes{};
  if (failed(scan_tile_state_t::AllocationSize(blocks_in_grid, temp_storage_bytes),
             "ScanTileState::AllocationSize"))
  {
    return;
  }

  // Allocate temporary storage (released automatically when the vector goes out of scope)
  thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
  std::uint8_t *d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

  // Bind the tile state to the temporary storage
  scan_tile_state_t tile_status;
  if (failed(tile_status.Init(blocks_in_grid, d_temp_storage, temp_storage_bytes),
             "ScanTileState::Init"))
  {
    return;
  }

  // Initialize the tile descriptors on the device
  const unsigned int threads_in_init_block = 256;
  const unsigned int blocks_in_init_grid = cub::DivideAndRoundUp(blocks_in_grid,
                                                                 threads_in_init_block);
  init_kernel<<<blocks_in_init_grid, threads_in_init_block>>>(tile_status, blocks_in_grid);

  // Kernel launches do not return a status; launch errors surface here
  if (failed(cudaGetLastError(), "init_kernel launch"))
  {
    return;
  }

  // Launch decoupled look-back, one thread block per tile
  const unsigned int threads_in_block = 256;
  decoupled_look_back_kernel<<<blocks_in_grid, threads_in_block>>>(tile_status);
  if (failed(cudaGetLastError(), "decoupled_look_back_kernel launch"))
  {
    return;
  }

  // Wait for the kernels to finish; asynchronous execution errors surface here
  if (failed(cudaDeviceSynchronize(), "cudaDeviceSynchronize"))
  {
    return;
  }
}
// Entry point: runs the decoupled look-back example for `int` messages on a
// grid of 14 thread blocks.
int main()
{
  constexpr int blocks_in_grid = 14;

  decoupled_look_back_example<int>(blocks_in_grid);

  return 0;
}