| /****************************************************************************** |
| * Copyright (c) 2011, Duane Merrill. All rights reserved. |
| * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are met: |
| * * Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * * Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in the |
| * documentation and/or other materials provided with the distribution. |
| * * Neither the name of the NVIDIA CORPORATION nor the |
| * names of its contributors may be used to endorse or promote products |
| * derived from this software without specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
| * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY |
| * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
| * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
| * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| * |
| ******************************************************************************/ |
| |
| /** |
| * \file |
| * cub::GridEvenShare is a descriptor utility for distributing input among CUDA thread blocks in an "even-share" fashion. Each thread block gets roughly the same number of fixed-size work units (grains). |
| */ |
| |
| |
| #pragma once |
| |
| #include "../config.cuh" |
| #include "../util_namespace.cuh" |
| #include "../util_macro.cuh" |
| #include "../util_type.cuh" |
| #include "grid_mapping.cuh" |
| |
| /// Optional outer namespace(s) |
| CUB_NS_PREFIX |
| |
| /// CUB namespace |
| namespace cub { |
| |
| |
| /** |
| * \addtogroup GridModule |
| * @{ |
| */ |
| |
| |
| /** |
| * \brief GridEvenShare is a descriptor utility for distributing input among |
| * CUDA thread blocks in an "even-share" fashion. Each thread block gets roughly |
| * the same number of input tiles. |
| * |
| * \par Overview |
| * Each thread block is assigned a consecutive sequence of input tiles. To help |
| * preserve alignment and eliminate the overhead of guarded loads for all but the |
 * last thread block, GridEvenShare assigns one of three different amounts of
| * work to a given thread block: "big", "normal", or "last". The "big" workloads |
| * are one scheduling grain larger than "normal". The "last" work unit for the |
| * last thread block may be partially-full if the input is not an even multiple of |
| * the scheduling grain size. |
| * |
| * \par |
| * Before invoking a child grid, a parent thread will typically construct an |
| * instance of GridEvenShare. The instance can be passed to child thread blocks |
| * which can initialize their per-thread block offsets using \p BlockInit(). |
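 *
 * \par Snippet
 * The code below is a minimal sketch of this pattern, not a verbatim excerpt
 * from the library: the kernel \p ExampleKernel, the device pointer \p d_in,
 * and the quantities \p BLOCK_THREADS, \p TILE_ITEMS, \p num_items, and
 * \p max_grid_size are illustrative assumptions.
 * \code
 * #include <cub/cub.cuh>
 *
 * template <int BLOCK_THREADS, int TILE_ITEMS>
 * __global__ void ExampleKernel(int *d_in, cub::GridEvenShare<int> even_share)
 * {
 *     // Compute this thread block's [block_offset, block_end) range of input,
 *     // using a "raking" mapping of consecutive tiles
 *     even_share.BlockInit<TILE_ITEMS, cub::GRID_MAPPING_RAKE>();
 *
 *     for (int tile_offset = even_share.block_offset;
 *          tile_offset < even_share.block_end;
 *          tile_offset += even_share.block_stride)
 *     {
 *         // Cooperatively process the tile of (up to) TILE_ITEMS items
 *         // starting at d_in + tile_offset; only the last tile of the
 *         // last thread block can be partially full
 *     }
 * }
 *
 * // Host side (d_in holds num_items ints in device memory):
 * cub::GridEvenShare<int> even_share;
 * even_share.DispatchInit(num_items, max_grid_size, TILE_ITEMS);
 * ExampleKernel<BLOCK_THREADS, TILE_ITEMS>
 *     <<<even_share.grid_size, BLOCK_THREADS>>>(d_in, even_share);
 * \endcode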
| */ |
| template <typename OffsetT> |
| struct GridEvenShare |
| { |
| private: |
| |
| OffsetT total_tiles; |
| int big_shares; |
| OffsetT big_share_items; |
| OffsetT normal_share_items; |
| OffsetT normal_base_offset; |
| |
| public: |
| |
| /// Total number of input items |
| OffsetT num_items; |
| |
| /// Grid size in thread blocks |
| int grid_size; |
| |
| /// OffsetT into input marking the beginning of the owning thread block's segment of input tiles |
| OffsetT block_offset; |
| |
    /// OffsetT into input marking the end (one-past) of the owning thread block's segment of input tiles
| OffsetT block_end; |
| |
| /// Stride between input tiles |
| OffsetT block_stride; |
| |
| |
| /** |
| * \brief Constructor. |
| */ |
| __host__ __device__ __forceinline__ GridEvenShare() : |
| total_tiles(0), |
| big_shares(0), |
| big_share_items(0), |
| normal_share_items(0), |
| normal_base_offset(0), |
| num_items(0), |
| grid_size(0), |
| block_offset(0), |
| block_end(0), |
| block_stride(0) |
| {} |
| |
|
|
| /** |
     * \brief Dispatch initializer. To be called prior to kernel launch.
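     *
     * \par
     * For example (assuming \p num_items > 0, so that the resulting grid size
     * is nonzero): with \p num_items = 1000, \p tile_items = 128, and
     * \p max_grid_size = 3, there are 8 total tiles and the grid size is 3.
     * Thread blocks 0 and 1 each receive a "big" share of 3 tiles (384 items),
     * and thread block 2 receives a "normal" share of 2 tiles, clamped to the
     * 232 remaining items.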
| */ |
| __host__ __device__ __forceinline__ void DispatchInit( |
| OffsetT num_items, ///< Total number of input items |
        int max_grid_size, ///< Maximum grid size allowable (actual grid size may be less if not warranted by the number of input items)
| int tile_items) ///< Number of data items per input tile |
| { |
| this->block_offset = num_items; // Initialize past-the-end |
| this->block_end = num_items; // Initialize past-the-end |
| this->num_items = num_items; |
| this->total_tiles = (num_items + tile_items - 1) / tile_items; |
| this->grid_size = CUB_MIN(total_tiles, max_grid_size); |
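        // Note: assumes at least one input item; otherwise grid_size is zero
        // and the division below would be undefined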
| OffsetT avg_tiles_per_block = total_tiles / grid_size; |
| this->big_shares = total_tiles - (avg_tiles_per_block * grid_size); // leftover grains go to big blocks |
| this->normal_share_items = avg_tiles_per_block * tile_items; |
| this->normal_base_offset = big_shares * tile_items; |
| this->big_share_items = normal_share_items + tile_items; |
| } |
| |
|
|
| /** |
| * \brief Initializes ranges for the specified thread block index. Specialized |
| * for a "raking" access pattern in which each thread block is assigned a |
| * consecutive sequence of input tiles. |
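     *
     * \par
     * Continuing the sketch from DispatchInit() (1000 items, 128-item tiles,
     * grid size 3): thread block 0 covers items [0, 384), block 1 covers
     * [384, 768), and block 2 covers [768, 1000), each traversed with a tile
     * stride of \p TILE_ITEMS.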
| */ |
| template <int TILE_ITEMS> |
| __device__ __forceinline__ void BlockInit( |
| int block_id, |
| Int2Type<GRID_MAPPING_RAKE> /*strategy_tag*/) |
| { |
| block_stride = TILE_ITEMS; |
| if (block_id < big_shares) |
| { |
| // This thread block gets a big share of grains (avg_tiles_per_block + 1) |
| block_offset = (block_id * big_share_items); |
| block_end = block_offset + big_share_items; |
| } |
| else if (block_id < total_tiles) |
| { |
| // This thread block gets a normal share of grains (avg_tiles_per_block) |
| block_offset = normal_base_offset + (block_id * normal_share_items); |
| block_end = CUB_MIN(num_items, block_offset + normal_share_items); |
| } |
| // Else default past-the-end |
| } |
| |
|
|
| /** |
     * \brief Initializes ranges for the specified thread block index. Specialized
     * for a "strip mining" access pattern in which the input tiles assigned to
     * each thread block are separated by a stride equal to the extent of the grid.
| */ |
| template <int TILE_ITEMS> |
| __device__ __forceinline__ void BlockInit( |
| int block_id, |
| Int2Type<GRID_MAPPING_STRIP_MINE> /*strategy_tag*/) |
| { |
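        // Tiles are interleaved across the grid: thread block b begins at tile b
        // and then strides by grid_size tiles, visiting tiles b, b + grid_size,
        // b + 2 * grid_size, ... until past-the-end of the input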
| block_stride = grid_size * TILE_ITEMS; |
| block_offset = (block_id * TILE_ITEMS); |
| block_end = num_items; |
| } |
| |
|
|
| /** |
     * \brief Block-initialization for the calling thread block (\p blockIdx.x),
     * dispatching on the specified grid-mapping strategy
     * (\p GRID_MAPPING_RAKE or \p GRID_MAPPING_STRIP_MINE).
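     *
     * \par
     * This is the form typically invoked at the start of a kernel. A minimal
     * sketch (\p TILE_ITEMS is assumed to be a compile-time tile size supplied
     * by the kernel):
     * \code
     * even_share.BlockInit<TILE_ITEMS, cub::GRID_MAPPING_RAKE>();
     * \endcode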
| */ |
| template < |
| int TILE_ITEMS, |
| GridMappingStrategy STRATEGY> |
| __device__ __forceinline__ void BlockInit() |
| { |
| BlockInit<TILE_ITEMS>(blockIdx.x, Int2Type<STRATEGY>()); |
| } |
| |
|
|
| /** |
     * \brief Block-initialization over an explicitly-specified range of input,
     * using a "raking" access pattern (consecutive tiles traversed with a
     * stride of \p TILE_ITEMS).
| */ |
| template <int TILE_ITEMS> |
| __device__ __forceinline__ void BlockInit( |
| OffsetT block_offset, ///< [in] Threadblock begin offset (inclusive) |
| OffsetT block_end) ///< [in] Threadblock end offset (exclusive) |
| { |
| this->block_offset = block_offset; |
| this->block_end = block_end; |
| this->block_stride = TILE_ITEMS; |
| } |
| |
|
|
| }; |
|
|
|
|
|
|
|
|
|
|
| /** @} */ // end group GridModule |
| |
| } // CUB namespace |
| CUB_NS_POSTFIX // Optional outer namespace(s) |
| |