// Source: thrust/install/include/cub/warp/warp_load.cuh
// (from a Thrust/CUB install tree; upstream NVIDIA CUB, commit 0dc1b04)
/******************************************************************************
* Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* @file
* Operations for reading linear tiles of data into the CUDA warp.
*/
#pragma once
#include <iterator>
#include <type_traits>
#include <cub/block/block_load.cuh>
#include <cub/config.cuh>
#include <cub/iterator/cache_modified_input_iterator.cuh>
#include <cub/util_ptx.cuh>
#include <cub/util_type.cuh>
#include <cub/warp/warp_exchange.cuh>
CUB_NAMESPACE_BEGIN
/**
 * @brief cub::WarpLoadAlgorithm enumerates alternative algorithms for
 *        cub::WarpLoad to read a linear segment of data from memory into
 *        a CUDA warp.
 */
enum WarpLoadAlgorithm
{
  /**
   * @par Overview
   *
   * A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is read
   * directly from memory.
   *
   * @par Performance Considerations
   * The utilization of memory transactions (coalescing) decreases as the
   * access stride between threads increases (i.e., the number of items per
   * thread).
   */
  WARP_LOAD_DIRECT,

  /**
   * @par Overview
   *
   * A [<em>striped arrangement</em>](index.html#sec5sec3) of data is read
   * directly from memory.
   *
   * @par Performance Considerations
   * The utilization of memory transactions (coalescing) doesn't depend on
   * the number of items per thread.
   */
  WARP_LOAD_STRIPED,

  /**
   * @par Overview
   *
   * A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is read
   * from memory using CUDA's built-in vectorized loads as a coalescing optimization.
   * For example, <tt>ld.global.v4.s32</tt> instructions will be generated
   * when @p T = @p int and @p ITEMS_PER_THREAD % 4 == 0.
   *
   * @par Performance Considerations
   * - The utilization of memory transactions (coalescing) remains high until the
   *   access stride between threads (i.e., the number of items per thread) exceeds the
   *   maximum vector load width (typically 4 items or 64B, whichever is lower).
   * - The following conditions will prevent vectorization and loading will fall
   *   back to cub::WARP_LOAD_DIRECT:
   *   - @p ITEMS_PER_THREAD is odd
   *   - The @p InputIteratorT is not a simple pointer type
   *   - The block input offset is not quadword-aligned
   *   - The data type @p T is not a built-in primitive or CUDA vector type
   *     (e.g., @p short, @p int2, @p double, @p float2, etc.)
   */
  WARP_LOAD_VECTORIZE,

  /**
   * @par Overview
   *
   * A [<em>striped arrangement</em>](index.html#sec5sec3) of data is read
   * efficiently from memory and then locally transposed into a
   * [<em>blocked arrangement</em>](index.html#sec5sec3).
   *
   * @par Performance Considerations
   * - The utilization of memory transactions (coalescing) remains high
   *   regardless of items loaded per thread.
   * - The local reordering incurs slightly longer latencies and throughput than
   *   the direct cub::WARP_LOAD_DIRECT and cub::WARP_LOAD_VECTORIZE
   *   alternatives.
   */
  WARP_LOAD_TRANSPOSE
};
/**
 * @brief The WarpLoad class provides [<em>collective</em>](index.html#sec0)
 *        data movement methods for loading a linear segment of items from
 *        memory into a [<em>blocked arrangement</em>](index.html#sec5sec3)
 *        across a CUDA warp.
 * @ingroup WarpModule
 * @ingroup UtilIo
 *
 * @tparam InputT
 *   The data type to read into (which must be convertible from the input
 *   iterator's value type).
 *
 * @tparam ITEMS_PER_THREAD
 *   The number of consecutive items partitioned onto each thread.
 *
 * @tparam ALGORITHM
 *   <b>[optional]</b> cub::WarpLoadAlgorithm tuning policy.
 *   default: cub::WARP_LOAD_DIRECT.
 *
 * @tparam LOGICAL_WARP_THREADS
 *   <b>[optional]</b> The number of threads per "logical" warp (may be less
 *   than the number of hardware warp threads). Default is the warp size of the
 *   targeted CUDA compute-capability (e.g., 32 threads for SM86). Must be a
 *   power of two.
 *
 * @tparam LEGACY_PTX_ARCH
 *   Unused.
 *
 * @par Overview
 * - The WarpLoad class provides a single data movement abstraction that can be
 *   specialized to implement different cub::WarpLoadAlgorithm strategies. This
 *   facilitates different performance policies for different architectures, data
 *   types, granularity sizes, etc.
 * - WarpLoad can be optionally specialized by different data movement strategies:
 *   -# <b>cub::WARP_LOAD_DIRECT</b>. A [<em>blocked arrangement</em>](index.html#sec5sec3)
 *      of data is read directly from memory. [More...](@ref cub::WarpLoadAlgorithm)
 *   -# <b>cub::WARP_LOAD_STRIPED</b>. A [<em>striped arrangement</em>](index.html#sec5sec3)
 *      of data is read directly from memory. [More...](@ref cub::WarpLoadAlgorithm)
 *   -# <b>cub::WARP_LOAD_VECTORIZE</b>. A [<em>blocked arrangement</em>](index.html#sec5sec3)
 *      of data is read directly from memory using CUDA's built-in vectorized
 *      loads as a coalescing optimization. [More...](@ref cub::WarpLoadAlgorithm)
 *   -# <b>cub::WARP_LOAD_TRANSPOSE</b>. A [<em>striped arrangement</em>](index.html#sec5sec3)
 *      of data is read directly from memory and is then locally transposed into a
 *      [<em>blocked arrangement</em>](index.html#sec5sec3). [More...](@ref cub::WarpLoadAlgorithm)
 *
 * @par A Simple Example
 * @par
 * The code snippet below illustrates the loading of a linear segment of 64
 * integers into a "blocked" arrangement across 16 threads where each thread
 * owns 4 consecutive items. The load is specialized for @p WARP_LOAD_TRANSPOSE,
 * meaning memory references are efficiently coalesced using a warp-striped access
 * pattern (after which items are locally reordered among threads).
 * @par
 * @code
 * #include <cub/cub.cuh>   // or equivalently <cub/warp/warp_load.cuh>
 *
 * __global__ void ExampleKernel(int *d_data, ...)
 * {
 *     constexpr int warp_threads = 16;
 *     constexpr int block_threads = 256;
 *     constexpr int items_per_thread = 4;
 *
 *     // Specialize WarpLoad for a warp of 16 threads owning 4 integer items each
 *     using WarpLoadT = WarpLoad<int,
 *                                items_per_thread,
 *                                cub::WARP_LOAD_TRANSPOSE,
 *                                warp_threads>;
 *
 *     constexpr int warps_in_block = block_threads / warp_threads;
 *     constexpr int tile_size = items_per_thread * warp_threads;
 *     const int warp_id = static_cast<int>(threadIdx.x) / warp_threads;
 *
 *     // Allocate shared memory for WarpLoad
 *     __shared__ typename WarpLoadT::TempStorage temp_storage[warps_in_block];
 *
 *     // Load a segment of consecutive items that are blocked across threads
 *     int thread_data[items_per_thread];
 *     WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size,
 *                                           thread_data);
 * @endcode
 * @par
 * Suppose the input @p d_data is <tt>0, 1, 2, 3, 4, 5, ...</tt>.
 * The set of @p thread_data across the threads of the first logical warp
 * will be:
 * <tt>{ [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }</tt>.
 */
template <typename InputT,
          int ITEMS_PER_THREAD,
          WarpLoadAlgorithm ALGORITHM = WARP_LOAD_DIRECT,
          int LOGICAL_WARP_THREADS    = CUB_PTX_WARP_THREADS,
          int LEGACY_PTX_ARCH         = 0>
class WarpLoad
{
  // True when the logical warp spans a full hardware warp; in that case
  // LaneId() can be used directly as the linear thread id (see constructors).
  constexpr static bool IS_ARCH_WARP = LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0);

  static_assert(PowerOfTwo<LOGICAL_WARP_THREADS>::VALUE,
                "LOGICAL_WARP_THREADS must be a power of two");

private:

  /*****************************************************************************
   * Algorithmic variants
   ****************************************************************************/

  /// Load helper: specialized below for each WarpLoadAlgorithm.
  template <WarpLoadAlgorithm _POLICY, int DUMMY>
  struct LoadInternal;

  /// WARP_LOAD_DIRECT specialization: blocked reads straight from memory;
  /// needs no shared-memory temporary storage.
  template <int DUMMY>
  struct LoadInternal<WARP_LOAD_DIRECT, DUMMY>
  {
    using TempStorage = NullType;

    int linear_tid;

    __device__ __forceinline__
    LoadInternal(TempStorage & /*temp_storage*/,
                 int linear_tid)
      : linear_tid(linear_tid)
    {}

    /// Load a full tile in blocked arrangement.
    template <typename InputIteratorT>
    __device__ __forceinline__ void Load(
      InputIteratorT block_itr,
      InputT (&items)[ITEMS_PER_THREAD])
    {
      LoadDirectBlocked(linear_tid, block_itr, items);
    }

    /// Load a partial tile, guarded by @p valid_items.
    template <typename InputIteratorT>
    __device__ __forceinline__ void Load(
      InputIteratorT block_itr,
      InputT (&items)[ITEMS_PER_THREAD],
      int valid_items)
    {
      LoadDirectBlocked(linear_tid, block_itr, items, valid_items);
    }

    /// Load a partial tile, assigning @p oob_default to out-of-bounds items.
    template <typename InputIteratorT, typename DefaultT>
    __device__ __forceinline__ void Load(
      InputIteratorT block_itr,
      InputT (&items)[ITEMS_PER_THREAD],
      int valid_items,
      DefaultT oob_default)
    {
      LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default);
    }
  };

  /// WARP_LOAD_STRIPED specialization: striped reads straight from memory;
  /// needs no shared-memory temporary storage.
  template <int DUMMY>
  struct LoadInternal<WARP_LOAD_STRIPED, DUMMY>
  {
    using TempStorage = NullType;

    int linear_tid;

    __device__ __forceinline__
    LoadInternal(TempStorage & /*temp_storage*/,
                 int linear_tid)
      : linear_tid(linear_tid)
    {}

    /// Load a full tile in striped arrangement.
    template <typename InputIteratorT>
    __device__ __forceinline__ void Load(
      InputIteratorT block_itr,
      InputT (&items)[ITEMS_PER_THREAD])
    {
      LoadDirectStriped<LOGICAL_WARP_THREADS>(linear_tid, block_itr, items);
    }

    /// Load a partial tile, guarded by @p valid_items.
    template <typename InputIteratorT>
    __device__ __forceinline__ void Load(
      InputIteratorT block_itr,
      InputT (&items)[ITEMS_PER_THREAD],
      int valid_items)
    {
      LoadDirectStriped<LOGICAL_WARP_THREADS>(linear_tid,
                                              block_itr,
                                              items,
                                              valid_items);
    }

    /// Load a partial tile, assigning @p oob_default to out-of-bounds items.
    template <typename InputIteratorT,
              typename DefaultT>
    __device__ __forceinline__ void Load(
      InputIteratorT block_itr,
      InputT (&items)[ITEMS_PER_THREAD],
      int valid_items,
      DefaultT oob_default)
    {
      LoadDirectStriped<LOGICAL_WARP_THREADS>(linear_tid,
                                              block_itr,
                                              items,
                                              valid_items,
                                              oob_default);
    }
  };

  /// WARP_LOAD_VECTORIZE specialization: vectorized blocked reads for raw
  /// pointers, with non-vectorized fallbacks for general iterators and
  /// guarded (partial-tile) loads; needs no shared-memory temporary storage.
  template <int DUMMY>
  struct LoadInternal<WARP_LOAD_VECTORIZE, DUMMY>
  {
    using TempStorage = NullType;

    int linear_tid;

    __device__ __forceinline__
    LoadInternal(TempStorage &/*temp_storage*/,
                 int linear_tid)
      : linear_tid(linear_tid)
    {}

    /// Vectorized load from a raw (mutable) pointer.
    template <typename InputIteratorT>
    __device__ __forceinline__ void Load(
      InputT *block_ptr,
      InputT (&items)[ITEMS_PER_THREAD])
    {
      InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid,
                                                        block_ptr,
                                                        items);
    }

    /// Vectorized load from a raw const pointer.
    template <typename InputIteratorT>
    __device__ __forceinline__ void Load(
      const InputT *block_ptr,
      InputT (&items)[ITEMS_PER_THREAD])
    {
      InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid,
                                                        block_ptr,
                                                        items);
    }

    /// Vectorized load through a CacheModifiedInputIterator, preserving its
    /// cache modifier and loading through its underlying raw pointer.
    template <
      CacheLoadModifier MODIFIER,
      typename ValueType,
      typename OffsetT>
    __device__ __forceinline__ void Load(
      CacheModifiedInputIterator<MODIFIER, ValueType, OffsetT> block_itr,
      InputT (&items)[ITEMS_PER_THREAD])
    {
      InternalLoadDirectBlockedVectorized<MODIFIER>(linear_tid,
                                                    block_itr.ptr,
                                                    items);
    }

    /// Fallback for arbitrary (non-pointer) iterators: vectorization is not
    /// possible, so load directly in blocked arrangement.
    template <typename _InputIteratorT>
    __device__ __forceinline__ void Load(
      _InputIteratorT block_itr,
      InputT (&items)[ITEMS_PER_THREAD])
    {
      LoadDirectBlocked(linear_tid, block_itr, items);
    }

    /// Guarded (partial-tile) load: falls back to direct blocked loading.
    template <typename InputIteratorT>
    __device__ __forceinline__ void Load(
      InputIteratorT block_itr,
      InputT (&items)[ITEMS_PER_THREAD],
      int valid_items)
    {
      LoadDirectBlocked(linear_tid, block_itr, items, valid_items);
    }

    /// Guarded load with out-of-bounds default: falls back to direct blocked loading.
    template <typename InputIteratorT, typename DefaultT>
    __device__ __forceinline__ void Load(
      InputIteratorT block_itr,
      InputT (&items)[ITEMS_PER_THREAD],
      int valid_items,
      DefaultT oob_default)
    {
      LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default);
    }
  };

  /// WARP_LOAD_TRANSPOSE specialization: coalesced striped reads followed by
  /// a WarpExchange striped-to-blocked transpose; requires shared memory.
  template <int DUMMY>
  struct LoadInternal<WARP_LOAD_TRANSPOSE, DUMMY>
  {
    using WarpExchangeT =
      WarpExchange<InputT, ITEMS_PER_THREAD, LOGICAL_WARP_THREADS>;

    struct _TempStorage : WarpExchangeT::TempStorage
    {};

    struct TempStorage : Uninitialized<_TempStorage> {};

    _TempStorage &temp_storage;

    int linear_tid;

    __device__ __forceinline__ LoadInternal(
      TempStorage &temp_storage,
      int linear_tid)
      :
      temp_storage(temp_storage.Alias()),
      linear_tid(linear_tid)
    {}

    /// Load a full tile: striped read, then transpose to blocked arrangement.
    template <typename InputIteratorT>
    __device__ __forceinline__ void Load(
      InputIteratorT block_itr,
      InputT (&items)[ITEMS_PER_THREAD])
    {
      LoadDirectStriped<LOGICAL_WARP_THREADS>(linear_tid, block_itr, items);
      WarpExchangeT(temp_storage).StripedToBlocked(items, items);
    }

    /// Load a partial tile, guarded by @p valid_items, then transpose.
    template <typename InputIteratorT>
    __device__ __forceinline__ void Load(
      InputIteratorT block_itr,
      InputT (&items)[ITEMS_PER_THREAD],
      int valid_items)
    {
      LoadDirectStriped<LOGICAL_WARP_THREADS>(linear_tid, block_itr, items, valid_items);
      WarpExchangeT(temp_storage).StripedToBlocked(items, items);
    }

    /// Load a partial tile with @p oob_default fill, then transpose.
    template <typename InputIteratorT, typename DefaultT>
    __device__ __forceinline__ void Load(
      InputIteratorT block_itr,
      InputT (&items)[ITEMS_PER_THREAD],
      int valid_items,
      DefaultT oob_default)
    {
      LoadDirectStriped<LOGICAL_WARP_THREADS>(linear_tid, block_itr, items, valid_items, oob_default);
      WarpExchangeT(temp_storage).StripedToBlocked(items, items);
    }
  };

  /*****************************************************************************
   * Type definitions
   ****************************************************************************/

  /// Internal load implementation to use
  using InternalLoad = LoadInternal<ALGORITHM, 0>;

  /// Shared memory storage layout type
  using _TempStorage = typename InternalLoad::TempStorage;

  /*****************************************************************************
   * Utility methods
   ****************************************************************************/

  /// Internal storage allocator: function-scoped __shared__ gives one
  /// statically-allocated instance per thread block.
  __device__ __forceinline__ _TempStorage& PrivateStorage()
  {
    __shared__ _TempStorage private_storage;
    return private_storage;
  }

  /*****************************************************************************
   * Thread fields
   ****************************************************************************/

  /// Thread reference to shared storage
  _TempStorage &temp_storage;

  /// Linear thread-id
  int linear_tid;

public:

  /// @smemstorage{WarpLoad}
  struct TempStorage : Uninitialized<_TempStorage> {};

  /*************************************************************************//**
   * @name Collective constructors
   ****************************************************************************/
  //@{

  /**
   * @brief Collective constructor using a private static allocation of
   *        shared memory as temporary storage.
   */
  __device__ __forceinline__
  WarpLoad()
    : temp_storage(PrivateStorage())
    , linear_tid(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS))
  {}

  /**
   * @brief Collective constructor using the specified memory allocation as
   *        temporary storage.
   */
  __device__ __forceinline__
  WarpLoad(TempStorage &temp_storage)
    : temp_storage(temp_storage.Alias())
    , linear_tid(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS))
  {}

  //@}  end member group
  /*************************************************************************//**
   * @name Data movement
   ****************************************************************************/
  //@{

  /**
   * @brief Load a linear segment of items from memory.
   *
   * @par
   * \smemreuse
   *
   * @par Snippet
   * @code
   * #include <cub/cub.cuh>   // or equivalently <cub/warp/warp_load.cuh>
   *
   * __global__ void ExampleKernel(int *d_data, ...)
   * {
   *     constexpr int warp_threads = 16;
   *     constexpr int block_threads = 256;
   *     constexpr int items_per_thread = 4;
   *
   *     // Specialize WarpLoad for a warp of 16 threads owning 4 integer items each
   *     using WarpLoadT = WarpLoad<int,
   *                                items_per_thread,
   *                                cub::WARP_LOAD_TRANSPOSE,
   *                                warp_threads>;
   *
   *     constexpr int warps_in_block = block_threads / warp_threads;
   *     constexpr int tile_size = items_per_thread * warp_threads;
   *     const int warp_id = static_cast<int>(threadIdx.x) / warp_threads;
   *
   *     // Allocate shared memory for WarpLoad
   *     __shared__ typename WarpLoadT::TempStorage temp_storage[warps_in_block];
   *
   *     // Load a segment of consecutive items that are blocked across threads
   *     int thread_data[items_per_thread];
   *     WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size,
   *                                           thread_data);
   * @endcode
   * @par
   * Suppose the input @p d_data is <tt>0, 1, 2, 3, 4, 5, ...</tt>.
   * The set of @p thread_data across the threads of the first logical warp
   * will be:
   * <tt>{ [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }</tt>.
   *
   * @param[in] block_itr The thread block's base input iterator for loading from
   * @param[out] items Data to load
   */
  template <typename InputIteratorT>
  __device__ __forceinline__ void Load(InputIteratorT block_itr,
                                       InputT (&items)[ITEMS_PER_THREAD])
  {
    InternalLoad(temp_storage, linear_tid).Load(block_itr, items);
  }

  /**
   * @brief Load a linear segment of items from memory, guarded by range.
   *
   * @par
   * \smemreuse
   *
   * @par Snippet
   * @code
   * #include <cub/cub.cuh>   // or equivalently <cub/warp/warp_load.cuh>
   *
   * __global__ void ExampleKernel(int *d_data, int valid_items, ...)
   * {
   *     constexpr int warp_threads = 16;
   *     constexpr int block_threads = 256;
   *     constexpr int items_per_thread = 4;
   *
   *     // Specialize WarpLoad for a warp of 16 threads owning 4 integer items each
   *     using WarpLoadT = WarpLoad<int,
   *                                items_per_thread,
   *                                cub::WARP_LOAD_TRANSPOSE,
   *                                warp_threads>;
   *
   *     constexpr int warps_in_block = block_threads / warp_threads;
   *     constexpr int tile_size = items_per_thread * warp_threads;
   *     const int warp_id = static_cast<int>(threadIdx.x) / warp_threads;
   *
   *     // Allocate shared memory for WarpLoad
   *     __shared__ typename WarpLoadT::TempStorage temp_storage[warps_in_block];
   *
   *     // Load a segment of consecutive items that are blocked across threads
   *     int thread_data[items_per_thread];
   *     WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size,
   *                                           thread_data,
   *                                           valid_items);
   * @endcode
   * @par
   * Suppose the input @p d_data is <tt>0, 1, 2, 3, 4, 5, ...</tt> and @p valid_items
   * is @p 5.
   * The set of @p thread_data across the threads of the first logical warp
   * will be:
   * <tt>{ [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] }</tt> with only the first
   * two threads being unmasked to load portions of valid data (and other items
   * remaining unassigned).
   *
   * @param[in] block_itr The thread block's base input iterator for loading from
   * @param[out] items Data to load
   * @param[in] valid_items Number of valid items to load
   */
  template <typename InputIteratorT>
  __device__ __forceinline__ void Load(InputIteratorT block_itr,
                                       InputT (&items)[ITEMS_PER_THREAD],
                                       int valid_items)
  {
    InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items);
  }

  /**
   * @brief Load a linear segment of items from memory, guarded by range.
   *
   * @par
   * \smemreuse
   *
   * @par Snippet
   * @code
   * #include <cub/cub.cuh>   // or equivalently <cub/warp/warp_load.cuh>
   *
   * __global__ void ExampleKernel(int *d_data, int valid_items, ...)
   * {
   *     constexpr int warp_threads = 16;
   *     constexpr int block_threads = 256;
   *     constexpr int items_per_thread = 4;
   *
   *     // Specialize WarpLoad for a warp of 16 threads owning 4 integer items each
   *     using WarpLoadT = WarpLoad<int,
   *                                items_per_thread,
   *                                cub::WARP_LOAD_TRANSPOSE,
   *                                warp_threads>;
   *
   *     constexpr int warps_in_block = block_threads / warp_threads;
   *     constexpr int tile_size = items_per_thread * warp_threads;
   *     const int warp_id = static_cast<int>(threadIdx.x) / warp_threads;
   *
   *     // Allocate shared memory for WarpLoad
   *     __shared__ typename WarpLoadT::TempStorage temp_storage[warps_in_block];
   *
   *     // Load a segment of consecutive items that are blocked across threads
   *     int thread_data[items_per_thread];
   *     WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size,
   *                                           thread_data,
   *                                           valid_items,
   *                                           -1);
   * @endcode
   * @par
   * Suppose the input @p d_data is <tt>0, 1, 2, 3, 4, 5, ...</tt>, @p valid_items
   * is @p 5, and the out-of-bounds default is @p -1.
   * The set of @p thread_data across the threads of the first logical warp
   * will be:
   * <tt>{ [0,1,2,3], [4,-1,-1,-1], ..., [-1,-1,-1,-1] }</tt> with only the first
   * two threads being unmasked to load portions of valid data (and other items
   * are assigned @p -1).
   *
   * @param[in] block_itr The thread block's base input iterator for loading from
   * @param[out] items Data to load
   * @param[in] valid_items Number of valid items to load
   * @param[in] oob_default Default value to assign out-of-bound items
   */
  template <typename InputIteratorT,
            typename DefaultT>
  __device__ __forceinline__ void Load(InputIteratorT block_itr,
                                       InputT (&items)[ITEMS_PER_THREAD],
                                       int valid_items,
                                       DefaultT oob_default)
  {
    InternalLoad(temp_storage, linear_tid)
      .Load(block_itr, items, valid_items, oob_default);
  }

  //@}  end member group
};
CUB_NAMESPACE_END