// thrust/install/include/cub/warp/warp_store.cuh
// Imported from NVIDIA's Thrust/CUB distribution (commit 0dc1b04) — thanks to NVIDIA.
// (Provenance lines converted to comments so this header compiles.)
/******************************************************************************
* Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* @file
* Operations for writing linear segments of data from the CUDA warp
*/
#pragma once
#include <iterator>
#include <type_traits>
#include <cub/block/block_store.cuh>
#include <cub/config.cuh>
#include <cub/util_ptx.cuh>
#include <cub/util_type.cuh>
#include <cub/warp/warp_exchange.cuh>
CUB_NAMESPACE_BEGIN
/**
* @brief cub::WarpStoreAlgorithm enumerates alternative algorithms for
* cub::WarpStore to write a blocked arrangement of items across a CUDA
* warp to a linear segment of memory.
*/
enum WarpStoreAlgorithm
{
/**
* @par Overview
* A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is written
* directly to memory.
*
* @par Performance Considerations
* The utilization of memory transactions (coalescing) decreases as the
* access stride between threads increases (i.e., the number of items per thread).
*/
WARP_STORE_DIRECT,
/**
* @par Overview
* A [<em>striped arrangement</em>](index.html#sec5sec3) of data is written
* directly to memory.
*
* @par Performance Considerations
* The utilization of memory transactions (coalescing) remains high regardless
* of items written per thread.
*/
WARP_STORE_STRIPED,
/**
* @par Overview
*
* A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is written
* directly to memory using CUDA's built-in vectorized stores as a coalescing
* optimization. For example, <tt>st.global.v4.s32</tt> instructions will be
* generated when @p T = @p int and @p ITEMS_PER_THREAD % 4 == 0.
*
* @par Performance Considerations
* - The utilization of memory transactions (coalescing) remains high until
* the access stride between threads (i.e., the number of items per thread)
* exceeds the maximum vector store width (typically 4 items or 64B,
* whichever is lower).
* - The following conditions will prevent vectorization and writing will fall
* back to cub::WARP_STORE_DIRECT:
* - @p ITEMS_PER_THREAD is odd
* - The @p OutputIteratorT is not a simple pointer type
* - The block output offset is not quadword-aligned
* - The data type @p T is not a built-in primitive or CUDA vector type
* (e.g., @p short, @p int2, @p double, @p float2, etc.)
*/
WARP_STORE_VECTORIZE,
/**
* @par Overview
* A [<em>blocked arrangement</em>](index.html#sec5sec3) is locally
* transposed and then efficiently written to memory as a
* [<em>striped arrangement</em>](index.html#sec5sec3).
*
* @par Performance Considerations
* - The utilization of memory transactions (coalescing) remains high
* regardless of items written per thread.
* - The local reordering incurs slightly longer latencies and throughput than the
* direct cub::WARP_STORE_DIRECT and cub::WARP_STORE_VECTORIZE alternatives.
*/
WARP_STORE_TRANSPOSE
};
/**
* @brief The WarpStore class provides [<em>collective</em>](index.html#sec0)
* data movement methods for writing a [<em>blocked arrangement</em>](index.html#sec5sec3)
* of items partitioned across a CUDA warp to a linear segment of memory.
* @ingroup WarpModule
* @ingroup UtilIo
*
* @tparam T
* The type of data to be written.
*
* @tparam ITEMS_PER_THREAD
* The number of consecutive items partitioned onto each thread.
*
* @tparam ALGORITHM
* <b>[optional]</b> cub::WarpStoreAlgorithm tuning policy enumeration.
* default: cub::WARP_STORE_DIRECT.
*
* @tparam LOGICAL_WARP_THREADS
* <b>[optional]</b> The number of threads per "logical" warp (may be less
* than the number of hardware warp threads). Default is the warp size of the
* targeted CUDA compute-capability (e.g., 32 threads for SM86). Must be a
* power of two.
*
* @tparam LEGACY_PTX_ARCH
* Unused.
*
* @par Overview
* - The WarpStore class provides a single data movement abstraction that can be
* specialized to implement different cub::WarpStoreAlgorithm strategies. This
* facilitates different performance policies for different architectures,
* data types, granularity sizes, etc.
* - WarpStore can be optionally specialized by different data movement strategies:
* -# <b>cub::WARP_STORE_DIRECT</b>. A [<em>blocked arrangement</em>](index.html#sec5sec3)
* of data is written directly to memory. [More...](@ref cub::WarpStoreAlgorithm)
* -# <b>cub::WARP_STORE_STRIPED</b>. A [<em>striped arrangement</em>](index.html#sec5sec3)
* of data is written directly to memory. [More...](@ref cub::WarpStoreAlgorithm)
* -# <b>cub::WARP_STORE_VECTORIZE</b>. A [<em>blocked arrangement</em>](index.html#sec5sec3)
* of data is written directly to memory using CUDA's built-in vectorized
* stores as a coalescing optimization. [More...](@ref cub::WarpStoreAlgorithm)
* -# <b>cub::WARP_STORE_TRANSPOSE</b>. A [<em>blocked arrangement</em>](index.html#sec5sec3)
* is locally transposed into a [<em>striped arrangement</em>](index.html#sec5sec3)
* which is then written to memory. [More...](@ref cub::WarpStoreAlgorithm)
* - \rowmajor
*
* @par A Simple Example
* @par
* The code snippet below illustrates the storing of a "blocked" arrangement
* of 64 integers across 16 threads (where each thread owns 4 consecutive items)
* into a linear segment of memory. The store is specialized for
* @p WARP_STORE_TRANSPOSE, meaning items are locally reordered among threads so
* that memory references will be efficiently coalesced using a warp-striped
* access pattern.
* @par
* @code
* #include <cub/cub.cuh> // or equivalently <cub/warp/warp_store.cuh>
*
* __global__ void ExampleKernel(int *d_data, ...)
* {
* constexpr int warp_threads = 16;
* constexpr int block_threads = 256;
* constexpr int items_per_thread = 4;
*
* // Specialize WarpStore for a virtual warp of 16 threads owning 4 integer items each
* using WarpStoreT = WarpStore<int,
* items_per_thread,
* cub::WARP_STORE_TRANSPOSE,
* warp_threads>;
*
* constexpr int warps_in_block = block_threads / warp_threads;
* constexpr int tile_size = items_per_thread * warp_threads;
* const int warp_id = static_cast<int>(threadIdx.x) / warp_threads;
*
* // Allocate shared memory for WarpStore
* __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block];
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
*
* // Store items to linear memory
* WarpStoreT(temp_storage[warp_id]).Store(d_data + warp_id * tile_size, thread_data);
* @endcode
* @par
* Suppose the set of @p thread_data across the warp threads is
* <tt>{ [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }</tt>.
* The output @p d_data will be <tt>0, 1, 2, 3, 4, 5, ...</tt>.
*/
template <typename T,
int ITEMS_PER_THREAD,
WarpStoreAlgorithm ALGORITHM = WARP_STORE_DIRECT,
int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS,
int LEGACY_PTX_ARCH = 0>
class WarpStore
{
static_assert(PowerOfTwo<LOGICAL_WARP_THREADS>::VALUE,
"LOGICAL_WARP_THREADS must be a power of two");
// True when the logical warp spans a full hardware warp; in that case
// LaneId() is already the logical lane index and needs no modulo remap.
constexpr static bool IS_ARCH_WARP = LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0);
private:
/// Store helper, specialized per WarpStoreAlgorithm policy.
template <WarpStoreAlgorithm _POLICY, int DUMMY>
struct StoreInternal;
/// WARP_STORE_DIRECT specialization: the blocked arrangement is written
/// straight to memory; no shared-memory temporary storage is required.
template <int DUMMY>
struct StoreInternal<WARP_STORE_DIRECT, DUMMY>
{
typedef NullType TempStorage;
int linear_tid;
__device__ __forceinline__ StoreInternal(TempStorage &/*temp_storage*/,
int linear_tid)
: linear_tid(linear_tid)
{}
/// Store a full tile (blocked arrangement) to memory.
template <typename OutputIteratorT>
__device__ __forceinline__ void Store(OutputIteratorT block_itr,
T (&items)[ITEMS_PER_THREAD])
{
StoreDirectBlocked(linear_tid, block_itr, items);
}
/// Store a partial tile, writing at most @p valid_items items.
template <typename OutputIteratorT>
__device__ __forceinline__ void Store(OutputIteratorT block_itr,
T (&items)[ITEMS_PER_THREAD],
int valid_items)
{
StoreDirectBlocked(linear_tid, block_itr, items, valid_items);
}
};
/// WARP_STORE_STRIPED specialization: items are written in a striped
/// arrangement (thread i writes element i, i+WARP_THREADS, ...), which
/// coalesces well; no shared-memory temporary storage is required.
template <int DUMMY>
struct StoreInternal<WARP_STORE_STRIPED, DUMMY>
{
typedef NullType TempStorage;
int linear_tid;
__device__ __forceinline__ StoreInternal(TempStorage & /*temp_storage*/,
int linear_tid)
: linear_tid(linear_tid)
{}
/// Store a full tile (striped arrangement) to memory.
template <typename OutputIteratorT>
__device__ __forceinline__ void Store(OutputIteratorT block_itr,
T (&items)[ITEMS_PER_THREAD])
{
StoreDirectStriped<LOGICAL_WARP_THREADS>(linear_tid, block_itr, items);
}
/// Store a partial tile, writing at most @p valid_items items.
template <typename OutputIteratorT>
__device__ __forceinline__ void Store(OutputIteratorT block_itr,
T (&items)[ITEMS_PER_THREAD],
int valid_items)
{
StoreDirectStriped<LOGICAL_WARP_THREADS>(linear_tid,
block_itr,
items,
valid_items);
}
};
/// WARP_STORE_VECTORIZE specialization: blocked arrangement written with
/// vectorized stores when the destination is a raw pointer; otherwise the
/// overload resolution falls back to plain direct-blocked stores.
template <int DUMMY>
struct StoreInternal<WARP_STORE_VECTORIZE, DUMMY>
{
typedef NullType TempStorage;
int linear_tid;
__device__ __forceinline__ StoreInternal(TempStorage & /*temp_storage*/,
int linear_tid)
: linear_tid(linear_tid)
{}
/// Vectorized path: only selected when the output is a raw T* pointer.
__device__ __forceinline__ void Store(T *block_ptr,
T (&items)[ITEMS_PER_THREAD])
{
StoreDirectBlockedVectorized(linear_tid, block_ptr, items);
}
/// Fallback for non-pointer iterators, which cannot be vectorized.
template <typename OutputIteratorT>
__device__ __forceinline__ void Store(OutputIteratorT block_itr,
T (&items)[ITEMS_PER_THREAD])
{
StoreDirectBlocked(linear_tid, block_itr, items);
}
/// Guarded stores are never vectorized; fall back to direct-blocked.
template <typename OutputIteratorT>
__device__ __forceinline__ void Store(OutputIteratorT block_itr,
T (&items)[ITEMS_PER_THREAD],
int valid_items)
{
StoreDirectBlocked(linear_tid, block_itr, items, valid_items);
}
};
/// WARP_STORE_TRANSPOSE specialization: a blocked arrangement is first
/// transposed to striped via WarpExchange (in shared memory), then written
/// with coalesced striped stores.
template <int DUMMY>
struct StoreInternal<WARP_STORE_TRANSPOSE, DUMMY>
{
using WarpExchangeT =
WarpExchange<T, ITEMS_PER_THREAD, LOGICAL_WARP_THREADS>;
struct _TempStorage : WarpExchangeT::TempStorage
{};
struct TempStorage : Uninitialized<_TempStorage> {};
_TempStorage &temp_storage;
int linear_tid;
__device__ __forceinline__ StoreInternal(TempStorage &temp_storage,
int linear_tid)
: temp_storage(temp_storage.Alias())
, linear_tid(linear_tid)
{}
/// Transpose blocked->striped in shared memory, then store striped.
template <typename OutputIteratorT>
__device__ __forceinline__ void Store(OutputIteratorT block_itr,
T (&items)[ITEMS_PER_THREAD])
{
WarpExchangeT(temp_storage).BlockedToStriped(items, items);
StoreDirectStriped<LOGICAL_WARP_THREADS>(linear_tid, block_itr, items);
}
/// Transpose, then store at most @p valid_items items (striped).
template <typename OutputIteratorT>
__device__ __forceinline__ void Store(OutputIteratorT block_itr,
T (&items)[ITEMS_PER_THREAD],
int valid_items)
{
WarpExchangeT(temp_storage).BlockedToStriped(items, items);
StoreDirectStriped<LOGICAL_WARP_THREADS>(linear_tid,
block_itr,
items,
valid_items);
}
};
/// Internal store implementation to use
using InternalStore = StoreInternal<ALGORITHM, 0>;
/// Shared memory storage layout type
using _TempStorage = typename InternalStore::TempStorage;
// Function-scope __shared__ allocation used when the caller does not
// supply its own TempStorage (default constructor).
__device__ __forceinline__ _TempStorage& PrivateStorage()
{
__shared__ _TempStorage private_storage;
return private_storage;
}
_TempStorage &temp_storage;
int linear_tid;
public:
struct TempStorage : Uninitialized<_TempStorage> {};
/*************************************************************************//**
* @name Collective constructors
****************************************************************************/
//@{
/**
* @brief Collective constructor using a private static allocation of shared
* memory as temporary storage.
*/
__device__ __forceinline__ WarpStore()
: temp_storage(PrivateStorage())
, linear_tid(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS))
{}
/**
* @brief Collective constructor using the specified memory allocation as
* temporary storage.
*/
__device__ __forceinline__ WarpStore(TempStorage &temp_storage)
: temp_storage(temp_storage.Alias())
, linear_tid(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS))
{}
//@} end member group
/*************************************************************************//**
* @name Data movement
****************************************************************************/
//@{
/**
* @brief Store items into a linear segment of memory.
*
* @par
* \smemreuse
*
* @par Snippet
* @par
* The code snippet below illustrates the storing of a "blocked" arrangement
* of 64 integers across 16 threads (where each thread owns 4 consecutive items)
* into a linear segment of memory. The store is specialized for
* @p WARP_STORE_TRANSPOSE, meaning items are locally reordered among threads so
* that memory references will be efficiently coalesced using a warp-striped
* access pattern.
* @code
* #include <cub/cub.cuh> // or equivalently <cub/warp/warp_store.cuh>
*
* __global__ void ExampleKernel(int *d_data, ...)
* {
* constexpr int warp_threads = 16;
* constexpr int block_threads = 256;
* constexpr int items_per_thread = 4;
*
* // Specialize WarpStore for a virtual warp of 16 threads owning 4 integer items each
* using WarpStoreT = WarpStore<int,
* items_per_thread,
* cub::WARP_STORE_TRANSPOSE,
* warp_threads>;
*
* constexpr int warps_in_block = block_threads / warp_threads;
* constexpr int tile_size = items_per_thread * warp_threads;
* const int warp_id = static_cast<int>(threadIdx.x) / warp_threads;
*
* // Allocate shared memory for WarpStore
* __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block];
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
*
* // Store items to linear memory
* WarpStoreT(temp_storage[warp_id]).Store(d_data + warp_id * tile_size, thread_data);
* @endcode
* @par
* Suppose the set of @p thread_data across the warp threads is
* <tt>{ [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }</tt>.
* The output @p d_data will be <tt>0, 1, 2, 3, 4, 5, ...</tt>.
*
* @param[out] block_itr The thread block's base output iterator for storing to
* @param[in] items Data to store
*/
template <typename OutputIteratorT>
__device__ __forceinline__ void Store(OutputIteratorT block_itr,
T (&items)[ITEMS_PER_THREAD])
{
InternalStore(temp_storage, linear_tid).Store(block_itr, items);
}
/**
* @brief Store items into a linear segment of memory, guarded by range.
*
* @par
* \smemreuse
*
* @par Snippet
* @par
* The code snippet below illustrates the storing of a "blocked" arrangement
* of 64 integers across 16 threads (where each thread owns 4 consecutive items)
* into a linear segment of memory. The store is specialized for
* @p WARP_STORE_TRANSPOSE, meaning items are locally reordered among threads so
* that memory references will be efficiently coalesced using a warp-striped
* access pattern.
* @code
* #include <cub/cub.cuh> // or equivalently <cub/warp/warp_store.cuh>
*
* __global__ void ExampleKernel(int *d_data, int valid_items ...)
* {
* constexpr int warp_threads = 16;
* constexpr int block_threads = 256;
* constexpr int items_per_thread = 4;
*
* // Specialize WarpStore for a virtual warp of 16 threads owning 4 integer items each
* using WarpStoreT = WarpStore<int,
* items_per_thread,
* cub::WARP_STORE_TRANSPOSE,
* warp_threads>;
*
* constexpr int warps_in_block = block_threads / warp_threads;
* constexpr int tile_size = items_per_thread * warp_threads;
* const int warp_id = static_cast<int>(threadIdx.x) / warp_threads;
*
* // Allocate shared memory for WarpStore
* __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block];
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
*
* // Store items to linear memory
* WarpStoreT(temp_storage[warp_id]).Store(
* d_data + warp_id * tile_size, thread_data, valid_items);
* @endcode
* @par
* Suppose the set of @p thread_data across the warp threads is
* <tt>{ [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }</tt> and @p valid_items
* is @p 5. The output @p d_data will be <tt>0, 1, 2, 3, 4, ?, ?, ...</tt>,
* with only the first two threads being unmasked to store portions of valid
* data.
*
* @param[out] block_itr The thread block's base output iterator for storing to
* @param[in] items Data to store
* @param[in] valid_items Number of valid items to write
*/
template <typename OutputIteratorT>
__device__ __forceinline__ void Store(OutputIteratorT block_itr,
T (&items)[ITEMS_PER_THREAD],
int valid_items)
{
InternalStore(temp_storage, linear_tid).Store(block_itr, items, valid_items);
}
//@} end member group
};
CUB_NAMESPACE_END