| /****************************************************************************** |
| * Copyright (c) 2011, Duane Merrill. All rights reserved. |
| * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are met: |
| * * Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * * Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in the |
| * documentation and/or other materials provided with the distribution. |
| * * Neither the name of the NVIDIA CORPORATION nor the |
| * names of its contributors may be used to endorse or promote products |
| * derived from this software without specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
| * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY |
| * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
| * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
| * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| * |
| ******************************************************************************/ |
| |
| /** |
| * \file |
| * cub::GridEvenShare is a descriptor utility for distributing input among CUDA thread blocks in an "even-share" fashion. Each thread block gets roughly the same number of fixed-size work units (grains). |
| */ |
| |
| |
| #pragma once |
| |
| #include "../config.cuh" |
| #include "../util_namespace.cuh" |
| #include "../util_macro.cuh" |
| #include "../util_type.cuh" |
| #include "grid_mapping.cuh" |
| |
| /// Optional outer namespace(s) |
| CUB_NS_PREFIX |
| |
| /// CUB namespace |
| namespace cub { |
| |
| |
| /** |
| * \addtogroup GridModule |
| * @{ |
| */ |
| |
| |
| /** |
| * \brief GridEvenShare is a descriptor utility for distributing input among |
| * CUDA thread blocks in an "even-share" fashion. Each thread block gets roughly |
| * the same number of input tiles. |
| * |
| * \par Overview |
| * Each thread block is assigned a consecutive sequence of input tiles. To help |
| * preserve alignment and eliminate the overhead of guarded loads for all but the |
 * last thread block, GridEvenShare assigns one of three different amounts of
| * work to a given thread block: "big", "normal", or "last". The "big" workloads |
| * are one scheduling grain larger than "normal". The "last" work unit for the |
| * last thread block may be partially-full if the input is not an even multiple of |
| * the scheduling grain size. |
| * |
| * \par |
| * Before invoking a child grid, a parent thread will typically construct an |
| * instance of GridEvenShare. The instance can be passed to child thread blocks |
| * which can initialize their per-thread block offsets using \p BlockInit(). |
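 *
 * \par Snippet
 * The code below is a minimal sketch of this pattern, not a verbatim excerpt
 * from the library: the kernel \p ExampleKernel, the device pointer \p d_in,
 * and the quantities \p BLOCK_THREADS, \p TILE_ITEMS, \p num_items, and
 * \p max_grid_size are illustrative assumptions.
 * \code
 * #include <cub/cub.cuh>
 *
 * template <int BLOCK_THREADS, int TILE_ITEMS>
 * __global__ void ExampleKernel(int *d_in, cub::GridEvenShare<int> even_share)
 * {
 *     // Compute this thread block's [block_offset, block_end) range of input,
 *     // using a "raking" mapping of consecutive tiles
 *     even_share.BlockInit<TILE_ITEMS, cub::GRID_MAPPING_RAKE>();
 *
 *     for (int tile_offset = even_share.block_offset;
 *          tile_offset < even_share.block_end;
 *          tile_offset += even_share.block_stride)
 *     {
 *         // Cooperatively process the tile of (up to) TILE_ITEMS items
 *         // starting at d_in + tile_offset; only the last tile of the
 *         // last thread block can be partially full
 *     }
 * }
 *
 * // Host side (d_in holds num_items ints in device memory):
 * cub::GridEvenShare<int> even_share;
 * even_share.DispatchInit(num_items, max_grid_size, TILE_ITEMS);
 * ExampleKernel<BLOCK_THREADS, TILE_ITEMS>
 *     <<<even_share.grid_size, BLOCK_THREADS>>>(d_in, even_share);
 * \endcode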
| */ |
| template <typename OffsetT> |
| struct GridEvenShare |
| { |
| private: |
| |
| OffsetT total_tiles; |
| int big_shares; |
| OffsetT big_share_items; |
| OffsetT normal_share_items; |
| OffsetT normal_base_offset; |
| |
| public: |
| |
| /// Total number of input items |
| OffsetT num_items; |
| |
| /// Grid size in thread blocks |
| int grid_size; |
| |
| /// OffsetT into input marking the beginning of the owning thread block's segment of input tiles |
| OffsetT block_offset; |
| |
    /// OffsetT into input marking the end (one-past) of the owning thread block's segment of input tiles
| OffsetT block_end; |
| |
| /// Stride between input tiles |
| OffsetT block_stride; |
| |
| |
| /** |
| * \brief Constructor. |
| */ |
| __host__ __device__ __forceinline__ GridEvenShare() : |
| total_tiles(0), |
| big_shares(0), |
| big_share_items(0), |
| normal_share_items(0), |
| normal_base_offset(0), |
| num_items(0), |
| grid_size(0), |
| block_offset(0), |
| block_end(0), |
| block_stride(0) |
| {} |
| |
|
|
| /** |
     * \brief Dispatch initializer. To be called prior to kernel launch.
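     *
     * \par
     * For example (assuming \p num_items > 0, so that the resulting grid size
     * is nonzero): with \p num_items = 1000, \p tile_items = 128, and
     * \p max_grid_size = 3, there are 8 total tiles and the grid size is 3.
     * Thread blocks 0 and 1 each receive a "big" share of 3 tiles (384 items),
     * and thread block 2 receives a "normal" share of 2 tiles, clamped to the
     * 232 remaining items.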
| */ |
| __host__ __device__ __forceinline__ void DispatchInit( |
| OffsetT num_items, ///< Total number of input items |
        int max_grid_size, ///< Maximum grid size allowable (actual grid size may be less if not warranted by the number of input items)
| int tile_items) ///< Number of data items per input tile |
| { |
| this->block_offset = num_items; // Initialize past-the-end |
| this->block_end = num_items; // Initialize past-the-end |
| this->num_items = num_items; |
| this->total_tiles = (num_items + tile_items - 1) / tile_items; |
| this->grid_size = CUB_MIN(total_tiles, max_grid_size); |
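        // Note: assumes at least one input item; otherwise grid_size is zero
        // and the division below would be undefined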
| OffsetT avg_tiles_per_block = total_tiles / grid_size; |
| this->big_shares = total_tiles - (avg_tiles_per_block * grid_size); // leftover grains go to big blocks |
| this->normal_share_items = avg_tiles_per_block * tile_items; |
| this->normal_base_offset = big_shares * tile_items; |
| this->big_share_items = normal_share_items + tile_items; |
| } |
| |
|
|
| /** |
| * \brief Initializes ranges for the specified thread block index. Specialized |
| * for a "raking" access pattern in which each thread block is assigned a |
| * consecutive sequence of input tiles. |
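     *
     * \par
     * Continuing the sketch from DispatchInit() (1000 items, 128-item tiles,
     * grid size 3): thread block 0 covers items [0, 384), block 1 covers
     * [384, 768), and block 2 covers [768, 1000), each traversed with a tile
     * stride of \p TILE_ITEMS.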
| */ |
| template <int TILE_ITEMS> |
| __device__ __forceinline__ void BlockInit( |
| int block_id, |
| Int2Type<GRID_MAPPING_RAKE> /*strategy_tag*/) |
| { |
| block_stride = TILE_ITEMS; |
| if (block_id < big_shares) |
| { |
| // This thread block gets a big share of grains (avg_tiles_per_block + 1) |
| block_offset = (block_id * big_share_items); |
| block_end = block_offset + big_share_items; |
| } |
| else if (block_id < total_tiles) |
| { |
| // This thread block gets a normal share of grains (avg_tiles_per_block) |
| block_offset = normal_base_offset + (block_id * normal_share_items); |
| block_end = CUB_MIN(num_items, block_offset + normal_share_items); |
| } |
| // Else default past-the-end |
| } |
| |
|
|
| /** |
     * \brief Initializes ranges for the specified thread block index. Specialized
     * for a "strip mining" access pattern in which the input tiles assigned to
     * each thread block are separated by a stride equal to the extent of the grid.
| */ |
| template <int TILE_ITEMS> |
| __device__ __forceinline__ void BlockInit( |
| int block_id, |
| Int2Type<GRID_MAPPING_STRIP_MINE> /*strategy_tag*/) |
| { |
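        // Tiles are interleaved across the grid: thread block b begins at tile b
        // and then strides by grid_size tiles, visiting tiles b, b + grid_size,
        // b + 2 * grid_size, ... until past-the-end of the input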
| block_stride = grid_size * TILE_ITEMS; |
| block_offset = (block_id * TILE_ITEMS); |
| block_end = num_items; |
| } |
| |
|
|
| /** |
     * \brief Block-initialization for the calling thread block (\p blockIdx.x),
     * dispatching on the specified grid-mapping strategy
     * (\p GRID_MAPPING_RAKE or \p GRID_MAPPING_STRIP_MINE).
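     *
     * \par
     * This is the form typically invoked at the start of a kernel. A minimal
     * sketch (\p TILE_ITEMS is assumed to be a compile-time tile size supplied
     * by the kernel):
     * \code
     * even_share.BlockInit<TILE_ITEMS, cub::GRID_MAPPING_RAKE>();
     * \endcode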
| */ |
| template < |
| int TILE_ITEMS, |
| GridMappingStrategy STRATEGY> |
| __device__ __forceinline__ void BlockInit() |
| { |
| BlockInit<TILE_ITEMS>(blockIdx.x, Int2Type<STRATEGY>()); |
| } |
| |
|
|
| /** |
     * \brief Block-initialization over an explicitly-specified range of input,
     * using a "raking" access pattern (consecutive tiles traversed with a
     * stride of \p TILE_ITEMS).
| */ |
| template <int TILE_ITEMS> |
| __device__ __forceinline__ void BlockInit( |
| OffsetT block_offset, ///< [in] Threadblock begin offset (inclusive) |
| OffsetT block_end) ///< [in] Threadblock end offset (exclusive) |
| { |
| this->block_offset = block_offset; |
| this->block_end = block_end; |
| this->block_stride = TILE_ITEMS; |
| } |
| |
|
|
| }; |
|
|
|
|
|
|
|
|
|
|
| /** @} */ // end group GridModule |
| |
| } // CUB namespace |
| CUB_NS_POSTFIX // Optional outer namespace(s) |
| |