/****************************************************************************** * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #include #include #include template __global__ void init_kernel(ScanTileStateT tile_state, int blocks_in_grid) { tile_state.InitializeStatus(blocks_in_grid); } template __global__ void decoupled_look_back_kernel(cub::ScanTileState tile_state) { using scan_op_t = cub::Sum; using scan_tile_state_t = cub::ScanTileState; using tile_prefix_op = cub::TilePrefixCallbackOp; using temp_storage_t = typename tile_prefix_op::TempStorage; // Allocate temp storage in shared memory __shared__ temp_storage_t temp_storage; scan_op_t scan_op{}; const unsigned int threads_in_warp = 32; const unsigned int tid = threadIdx.x; // Construct prefix op tile_prefix_op prefix(tile_state, temp_storage, scan_op); const unsigned int tile_idx = prefix.GetTileIdx(); // Compute block aggregate MessageT block_aggregate = blockIdx.x; if (tile_idx == 0) { // There are no blocks to look back to, immediately set the inclusive state if (tid == 0) { tile_state.SetInclusive(tile_idx, block_aggregate); printf("tile %d: inclusive = %d\n", tile_idx, block_aggregate); } } else { // Only the first warp in the block can perform the look back const unsigned int warp_id = tid / threads_in_warp; if (warp_id == 0) { // Perform the decoupled look-back // Invocation of the prefix will block until the look-back is complete. MessageT exclusive_prefix = prefix(block_aggregate); if (tid == 0) { MessageT inclusive_prefix = scan_op(exclusive_prefix, block_aggregate); printf("tile %d: exclusive = %d inclusive = %d\n", tile_idx, exclusive_prefix, inclusive_prefix); } } } } template void decoupled_look_back_example(int blocks_in_grid) { using scan_tile_state_t = cub::ScanTileState; // Query temporary storage requirements std::size_t temp_storage_bytes{}; scan_tile_state_t::AllocationSize(blocks_in_grid, temp_storage_bytes); // Allocate temporary storage thrust::device_vector temp_storage(temp_storage_bytes); std::uint8_t *d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); // Initialize temporary storage scan_tile_state_t tile_status; tile_status.Init(blocks_in_grid, d_temp_storage, temp_storage_bytes); const unsigned int threads_in_init_block = 256; const unsigned int blocks_in_init_grid = cub::DivideAndRoundUp(blocks_in_grid, threads_in_init_block); init_kernel<<>>(tile_status, blocks_in_grid); // Launch decoupled look-back const unsigned int threads_in_block = 256; decoupled_look_back_kernel<<>>(tile_status); // Wait for kernel to finish cudaDeviceSynchronize(); } int main() { decoupled_look_back_example(14); }