thrust / install /include /cub /device /dispatch /dispatch_reduce.cuh

thanks to nvidia ❤

0dc1b04 almost 3 years ago

45.8 kB

	/******************************************************************************
	* Copyright (c) 2011, Duane Merrill. All rights reserved.
	* Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions are met:
	* * Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* * Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	* * Neither the name of the NVIDIA CORPORATION nor the
	* names of its contributors may be used to endorse or promote products
	* derived from this software without specific prior written permission.
	*
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
	* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
	* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
	* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
	* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
	* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*
	******************************************************************************/

	/**
	* @file cub::DeviceReduce provides device-wide, parallel operations for
	* computing a reduction across a sequence of data items residing within
	* device-accessible memory.
	*/

	#pragma once

	#include <stdio.h>
	#include <iterator>

	#include <cub/agent/agent_reduce.cuh>
	#include <cub/config.cuh>
	#include <cub/grid/grid_even_share.cuh>
	#include <cub/iterator/arg_index_input_iterator.cuh>
	#include <cub/thread/thread_operators.cuh>
	#include <cub/thread/thread_store.cuh>
	#include <cub/util_debug.cuh>
	#include <cub/util_deprecated.cuh>
	#include <cub/util_device.cuh>

	#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

	CUB_NAMESPACE_BEGIN

	namespace detail
	{
	namespace reduce
	{

	/**
	* All cub::DeviceReduce::* algorithms are using the same implementation. Some of them, however,
	* should use initial value only for empty problems. If this struct is used as initial value with
	* one of the `DeviceReduce` algorithms, the `init` value wrapped by this struct will only be used
	* for empty problems; it will not be incorporated into the aggregate of non-empty problems.
	*/
	template <class T>
	struct empty_problem_init_t
	{
	T init;

	__host__ __device__ operator T() const { return init; }
	};

	/**
	* @brief Applies initial value to the block aggregate and stores the result to the output iterator.
	*
	* @param d_out Iterator to the output aggregate
	* @param reduction_op Binary reduction functor
	* @param init Initial value
	* @param block_aggregate Aggregate value computed by the block
	*/
	template <class OutputIteratorT, class ReductionOpT, class InitT, class AccumT>
	__host__ __device__ void finalize_and_store_aggregate(OutputIteratorT d_out,
	ReductionOpT reduction_op,
	InitT init,
	AccumT block_aggregate)
	{
	*d_out = reduction_op(init, block_aggregate);
	}

	/**
	* @brief Ignores initial value and stores the block aggregate to the output iterator.
	*
	* @param d_out Iterator to the output aggregate
	* @param block_aggregate Aggregate value computed by the block
	*/
	template <class OutputIteratorT, class ReductionOpT, class InitT, class AccumT>
	__host__ __device__ void finalize_and_store_aggregate(OutputIteratorT d_out,
	ReductionOpT,
	empty_problem_init_t<InitT>,
	AccumT block_aggregate)
	{
	*d_out = block_aggregate;
	}
	} // namespace reduce
	} // namespace detail

	/******************************************************************************
	* Kernel entry points
	*****************************************************************************/

	/**
	* @brief Reduce region kernel entry point (multi-block). Computes privatized
	* reductions, one per thread block.
	*
	* @tparam ChainedPolicyT
	* Chained tuning policy
	*
	* @tparam InputIteratorT
	* Random-access input iterator type for reading input items \iterator
	*
	* @tparam OffsetT
	* Signed integer type for global offsets
	*
	* @tparam ReductionOpT
	* Binary reduction functor type having member
	* `auto operator()(const T &a, const U &b)`
	*
	* @tparam InitT
	* Initial value type
	*
	* @tparam AccumT
	* Accumulator type
	*
	* @param[in] d_in
	* Pointer to the input sequence of data items
	*
	* @param[out] d_out
	* Pointer to the output aggregate
	*
	* @param[in] num_items
	* Total number of input data items
	*
	* @param[in] even_share
	* Even-share descriptor for mapping an equal number of tiles onto each
	* thread block
	*
	* @param[in] reduction_op
	* Binary reduction functor
	*/
	template <typename ChainedPolicyT,
	typename InputIteratorT,
	typename OffsetT,
	typename ReductionOpT,
	typename AccumT>
	__launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS))
	__global__ void DeviceReduceKernel(InputIteratorT d_in,
	AccumT* d_out,
	OffsetT num_items,
	GridEvenShare<OffsetT> even_share,
	ReductionOpT reduction_op)
	{
	// Thread block type for reducing input tiles
	using AgentReduceT =
	AgentReduce<typename ChainedPolicyT::ActivePolicy::ReducePolicy,
	InputIteratorT,
	AccumT*,
	OffsetT,
	ReductionOpT,
	AccumT>;

	// Shared memory storage
	__shared__ typename AgentReduceT::TempStorage temp_storage;

	// Consume input tiles
	AccumT block_aggregate =
	AgentReduceT(temp_storage, d_in, reduction_op).ConsumeTiles(even_share);

	// Output result
	if (threadIdx.x == 0)
	{
	detail::uninitialized_copy(d_out + blockIdx.x, block_aggregate);
	}
	}

	/**
	* @brief Reduce a single tile kernel entry point (single-block). Can be used
	* to aggregate privatized thread block reductions from a previous
	* multi-block reduction pass.
	*
	* @tparam ChainedPolicyT
	* Chained tuning policy
	*
	* @tparam InputIteratorT
	* Random-access input iterator type for reading input items \iterator
	*
	* @tparam OutputIteratorT
	* Output iterator type for recording the reduced aggregate \iterator
	*
	* @tparam OffsetT
	* Signed integer type for global offsets
	*
	* @tparam ReductionOpT
	* Binary reduction functor type having member
	* `T operator()(const T &a, const U &b)`
	*
	* @tparam InitT
	* Initial value type
	*
	* @tparam AccumT
	* Accumulator type
	*
	* @param[in] d_in
	* Pointer to the input sequence of data items
	*
	* @param[out] d_out
	* Pointer to the output aggregate
	*
	* @param[in] num_items
	* Total number of input data items
	*
	* @param[in] reduction_op
	* Binary reduction functor
	*
	* @param[in] init
	* The initial value of the reduction
	*/
	template <typename ChainedPolicyT,
	typename InputIteratorT,
	typename OutputIteratorT,
	typename OffsetT,
	typename ReductionOpT,
	typename InitT,
	typename AccumT>
	__launch_bounds__(int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1)
	__global__ void DeviceReduceSingleTileKernel(InputIteratorT d_in,
	OutputIteratorT d_out,
	OffsetT num_items,
	ReductionOpT reduction_op,
	InitT init)
	{
	// Thread block type for reducing input tiles
	using AgentReduceT =
	AgentReduce<typename ChainedPolicyT::ActivePolicy::SingleTilePolicy,
	InputIteratorT,
	OutputIteratorT,
	OffsetT,
	ReductionOpT,
	AccumT>;

	// Shared memory storage
	__shared__ typename AgentReduceT::TempStorage temp_storage;

	// Check if empty problem
	if (num_items == 0)
	{
	if (threadIdx.x == 0)
	{
	*d_out = init;
	}

	return;
	}

	// Consume input tiles
	AccumT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op)
	.ConsumeRange(OffsetT(0), num_items);

	// Output result
	if (threadIdx.x == 0)
	{
	detail::reduce::finalize_and_store_aggregate(d_out, reduction_op, init, block_aggregate);
	}
	}

	/// Normalize input iterator to segment offset
	template <typename T, typename OffsetT, typename IteratorT>
	__device__ __forceinline__ void NormalizeReductionOutput(T & /val/,
	OffsetT /base_offset/,
	IteratorT /itr/)
	{}

	/// Normalize input iterator to segment offset (specialized for arg-index)
	template <typename KeyValuePairT,
	typename OffsetT,
	typename WrappedIteratorT,
	typename OutputValueT>
	__device__ __forceinline__ void NormalizeReductionOutput(
	KeyValuePairT &val,
	OffsetT base_offset,
	ArgIndexInputIterator<WrappedIteratorT, OffsetT, OutputValueT> /itr/)
	{
	val.key -= base_offset;
	}

	/**
	* Segmented reduction (one block per segment)
	* @tparam ChainedPolicyT
	* Chained tuning policy
	*
	* @tparam InputIteratorT
	* Random-access input iterator type for reading input items \iterator
	*
	* @tparam OutputIteratorT
	* Output iterator type for recording the reduced aggregate \iterator
	*
	* @tparam BeginOffsetIteratorT
	* Random-access input iterator type for reading segment beginning offsets
	* \iterator
	*
	* @tparam EndOffsetIteratorT
	* Random-access input iterator type for reading segment ending offsets
	* \iterator
	*
	* @tparam OffsetT
	* Signed integer type for global offsets
	*
	* @tparam ReductionOpT
	* Binary reduction functor type having member
	* `T operator()(const T &a, const U &b)`
	*
	* @tparam InitT
	* Initial value type
	*
	* @param[in] d_in
	* Pointer to the input sequence of data items
	*
	* @param[out] d_out
	* Pointer to the output aggregate
	*
	* @param[in] d_begin_offsets
	* Random-access input iterator to the sequence of beginning offsets of
	* length `num_segments`, such that `d_begin_offsets[i]` is the first element
	* of the i<sup>th</sup> data segment in `d_keys_` and `d_values_`
	*
	* @param[in] d_end_offsets
	* Random-access input iterator to the sequence of ending offsets of length
	* `num_segments`, such that `d_end_offsets[i] - 1` is the last element of
	* the i<sup>th</sup> data segment in `d_keys_` and `d_values_`.
	* If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the i<sup>th</sup> is
	* considered empty.
	*
	* @param[in] num_segments
	* The number of segments that comprise the sorting data
	*
	* @param[in] reduction_op
	* Binary reduction functor
	*
	* @param[in] init
	* The initial value of the reduction
	*/
	template <typename ChainedPolicyT,
	typename InputIteratorT,
	typename OutputIteratorT,
	typename BeginOffsetIteratorT,
	typename EndOffsetIteratorT,
	typename OffsetT,
	typename ReductionOpT,
	typename InitT,
	typename AccumT>
	__launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS))
	__global__ void DeviceSegmentedReduceKernel(
	InputIteratorT d_in,
	OutputIteratorT d_out,
	BeginOffsetIteratorT d_begin_offsets,
	EndOffsetIteratorT d_end_offsets,
	int /num_segments/,
	ReductionOpT reduction_op,
	InitT init)
	{
	// Thread block type for reducing input tiles
	using AgentReduceT =
	AgentReduce<typename ChainedPolicyT::ActivePolicy::ReducePolicy,
	InputIteratorT,
	OutputIteratorT,
	OffsetT,
	ReductionOpT,
	AccumT>;

	// Shared memory storage
	__shared__ typename AgentReduceT::TempStorage temp_storage;

	OffsetT segment_begin = d_begin_offsets[blockIdx.x];
	OffsetT segment_end = d_end_offsets[blockIdx.x];

	// Check if empty problem
	if (segment_begin == segment_end)
	{
	if (threadIdx.x == 0)
	{
	*(d_out + blockIdx.x) = init;
	}
	return;
	}

	// Consume input tiles
	AccumT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op)
	.ConsumeRange(segment_begin, segment_end);

	// Normalize as needed
	NormalizeReductionOutput(block_aggregate, segment_begin, d_in);

	if (threadIdx.x == 0)
	{
	detail::reduce::finalize_and_store_aggregate(d_out + blockIdx.x,
	reduction_op,
	init,
	block_aggregate);
	}
	}

	/******************************************************************************
	* Policy
	******************************************************************************/

	/**
	* @tparam AccumT
	* Accumulator data type
	*
	* OffsetT
	* Signed integer type for global offsets
	*
	* ReductionOpT
	* Binary reduction functor type having member
	* `auto operator()(const T &a, const U &b)`
	*/
	template <
	typename AccumT,
	typename OffsetT,
	typename ReductionOpT>
	struct DeviceReducePolicy
	{
	//---------------------------------------------------------------------------
	// Architecture-specific tuning policies
	//---------------------------------------------------------------------------

	/// SM30
	struct Policy300 : ChainedPolicy<300, Policy300, Policy300>
	{
	static constexpr int threads_per_block = 256;
	static constexpr int items_per_thread = 20;
	static constexpr int items_per_vec_load = 2;

	// ReducePolicy (GTX670: 154.0 @ 48M 4B items)
	using ReducePolicy = AgentReducePolicy<threads_per_block,
	items_per_thread,
	AccumT,
	items_per_vec_load,
	BLOCK_REDUCE_WARP_REDUCTIONS,
	LOAD_DEFAULT>;

	// SingleTilePolicy
	using SingleTilePolicy = ReducePolicy;

	// SegmentedReducePolicy
	using SegmentedReducePolicy = ReducePolicy;
	};

	/// SM35
	struct Policy350 : ChainedPolicy<350, Policy350, Policy300>
	{
	static constexpr int threads_per_block = 256;
	static constexpr int items_per_thread = 20;
	static constexpr int items_per_vec_load = 4;

	// ReducePolicy (GTX Titan: 255.1 GB/s @ 48M 4B items; 228.7 GB/s @ 192M 1B
	// items)
	using ReducePolicy = AgentReducePolicy<threads_per_block,
	items_per_thread,
	AccumT,
	items_per_vec_load,
	BLOCK_REDUCE_WARP_REDUCTIONS,
	LOAD_LDG>;

	// SingleTilePolicy
	using SingleTilePolicy = ReducePolicy;

	// SegmentedReducePolicy
	using SegmentedReducePolicy = ReducePolicy;
	};

	/// SM60
	struct Policy600 : ChainedPolicy<600, Policy600, Policy350>
	{
	static constexpr int threads_per_block = 256;
	static constexpr int items_per_thread = 16;
	static constexpr int items_per_vec_load = 4;

	// ReducePolicy (P100: 591 GB/s @ 64M 4B items; 583 GB/s @ 256M 1B items)
	using ReducePolicy = AgentReducePolicy<threads_per_block,
	items_per_thread,
	AccumT,
	items_per_vec_load,
	BLOCK_REDUCE_WARP_REDUCTIONS,
	LOAD_LDG>;

	// SingleTilePolicy
	using SingleTilePolicy = ReducePolicy;

	// SegmentedReducePolicy
	using SegmentedReducePolicy = ReducePolicy;
	};

	using MaxPolicy = Policy600;
	};



	/******************************************************************************
	* Single-problem dispatch
	*****************************************************************************/

	/**
	* @brief Utility class for dispatching the appropriately-tuned kernels for
	* device-wide reduction
	*
	* @tparam InputIteratorT
	* Random-access input iterator type for reading input items \iterator
	*
	* @tparam OutputIteratorT
	* Output iterator type for recording the reduced aggregate \iterator
	*
	* @tparam OffsetT
	* Signed integer type for global offsets
	*
	* @tparam ReductionOpT
	* Binary reduction functor type having member
	* `auto operator()(const T &a, const U &b)`
	*
	* @tparam InitT
	* Initial value type
	*/
	template <
	typename InputIteratorT,
	typename OutputIteratorT,
	typename OffsetT,
	typename ReductionOpT,
	typename InitT =
	cub::detail::non_void_value_t<
	OutputIteratorT,
	cub::detail::value_t<InputIteratorT>>,
	typename AccumT =
	detail::accumulator_t<
	ReductionOpT,
	InitT,
	cub::detail::value_t<InputIteratorT>>,
	typename SelectedPolicy = DeviceReducePolicy<AccumT, OffsetT, ReductionOpT>>
	struct DispatchReduce : SelectedPolicy
	{
	//---------------------------------------------------------------------------
	// Problem state
	//---------------------------------------------------------------------------

	/// Device-accessible allocation of temporary storage. When `nullptr`, the
	/// required allocation size is written to `temp_storage_bytes` and no work
	/// is done.
	void *d_temp_storage;

	/// Reference to size in bytes of `d_temp_storage` allocation
	size_t &temp_storage_bytes;

	/// Pointer to the input sequence of data items
	InputIteratorT d_in;

	/// Pointer to the output aggregate
	OutputIteratorT d_out;

	/// Total number of input items (i.e., length of `d_in`)
	OffsetT num_items;

	/// Binary reduction functor
	ReductionOpT reduction_op;

	/// The initial value of the reduction
	InitT init;

	/// CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
	cudaStream_t stream;

	int ptx_version;

	//---------------------------------------------------------------------------
	// Constructor
	//---------------------------------------------------------------------------

	/// Constructor
	CUB_RUNTIME_FUNCTION __forceinline__ DispatchReduce(void *d_temp_storage,
	size_t &temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	OffsetT num_items,
	ReductionOpT reduction_op,
	InitT init,
	cudaStream_t stream,
	int ptx_version)
	: d_temp_storage(d_temp_storage)
	, temp_storage_bytes(temp_storage_bytes)
	, d_in(d_in)
	, d_out(d_out)
	, num_items(num_items)
	, reduction_op(reduction_op)
	, init(init)
	, stream(stream)
	, ptx_version(ptx_version)
	{}

	CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
	CUB_RUNTIME_FUNCTION __forceinline__
	DispatchReduce(void* d_temp_storage,
	size_t &temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	OffsetT num_items,
	ReductionOpT reduction_op,
	InitT init,
	cudaStream_t stream,
	bool debug_synchronous,
	int ptx_version)
	: d_temp_storage(d_temp_storage)
	, temp_storage_bytes(temp_storage_bytes)
	, d_in(d_in)
	, d_out(d_out)
	, num_items(num_items)
	, reduction_op(reduction_op)
	, init(init)
	, stream(stream)
	, ptx_version(ptx_version)
	{
	CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG
	}

	//---------------------------------------------------------------------------
	// Small-problem (single tile) invocation
	//---------------------------------------------------------------------------

	/**
	* @brief Invoke a single block block to reduce in-core
	*
	* @tparam ActivePolicyT
	* Umbrella policy active for the target device
	*
	* @tparam SingleTileKernelT
	* Function type of cub::DeviceReduceSingleTileKernel
	*
	* @param[in] single_tile_kernel
	* Kernel function pointer to parameterization of
	* cub::DeviceReduceSingleTileKernel
	*/
	template <typename ActivePolicyT, typename SingleTileKernelT>
	CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t
	InvokeSingleTile(SingleTileKernelT single_tile_kernel)
	{
	cudaError error = cudaSuccess;
	do
	{
	// Return if the caller is simply requesting the size of the storage
	// allocation
	if (d_temp_storage == NULL)
	{
	temp_storage_bytes = 1;
	break;
	}

	// Log single_reduce_sweep_kernel configuration
	#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
	_CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), "
	"%d items per thread\n",
	ActivePolicyT::SingleTilePolicy::BLOCK_THREADS,
	(long long)stream,
	ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD);
	#endif

	// Invoke single_reduce_sweep_kernel
	THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
	1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream)
	.doit(single_tile_kernel, d_in, d_out, num_items, reduction_op, init);

	// Check for failure to launch
	if (CubDebug(error = cudaPeekAtLastError()))
	{
	break;
	}

	// Sync the stream if specified to flush runtime errors
	error = detail::DebugSyncStream(stream);
	if (CubDebug(error))
	{
	break;
	}
	} while (0);

	return error;
	}

	//---------------------------------------------------------------------------
	// Normal problem size invocation (two-pass)
	//---------------------------------------------------------------------------

	/**
	* @brief Invoke two-passes to reduce
	* @tparam ActivePolicyT
	* Umbrella policy active for the target device
	*
	* @tparam ReduceKernelT
	* Function type of cub::DeviceReduceKernel
	*
	* @tparam SingleTileKernelT
	* Function type of cub::DeviceReduceSingleTileKernel
	*
	* @param[in] reduce_kernel
	* Kernel function pointer to parameterization of cub::DeviceReduceKernel
	*
	* @param[in] single_tile_kernel
	* Kernel function pointer to parameterization of
	* cub::DeviceReduceSingleTileKernel
	*/
	template <typename ActivePolicyT,
	typename ReduceKernelT,
	typename SingleTileKernelT>
	CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t
	InvokePasses(ReduceKernelT reduce_kernel,
	SingleTileKernelT single_tile_kernel)
	{
	cudaError error = cudaSuccess;
	do
	{
	// Get device ordinal
	int device_ordinal;
	if (CubDebug(error = cudaGetDevice(&device_ordinal)))
	break;

	// Get SM count
	int sm_count;
	if (CubDebug(
	error = cudaDeviceGetAttribute(&sm_count,
	cudaDevAttrMultiProcessorCount,
	device_ordinal)))
	{
	break;
	}

	// Init regular kernel configuration
	KernelConfig reduce_config;
	if (CubDebug(
	error = reduce_config.Init<typename ActivePolicyT::ReducePolicy>(
	reduce_kernel)))
	{
	break;
	}

	int reduce_device_occupancy = reduce_config.sm_occupancy * sm_count;

	// Even-share work distribution
	int max_blocks = reduce_device_occupancy * CUB_SUBSCRIPTION_FACTOR(0);
	GridEvenShare<OffsetT> even_share;
	even_share.DispatchInit(num_items, max_blocks, reduce_config.tile_size);

	// Temporary storage allocation requirements
	void *allocations[1] = {};
	size_t allocation_sizes[1] = {
	max_blocks * sizeof(AccumT) // bytes needed for privatized block
	// reductions
	};

	// Alias the temporary allocations from the single storage blob (or
	// compute the necessary size of the blob)
	if (CubDebug(error = AliasTemporaries(d_temp_storage,
	temp_storage_bytes,
	allocations,
	allocation_sizes)))
	{
	break;
	}

	if (d_temp_storage == NULL)
	{
	// Return if the caller is simply requesting the size of the storage
	// allocation
	return cudaSuccess;
	}

	// Alias the allocation for the privatized per-block reductions
	AccumT d_block_reductions = (AccumT )allocations[0];

	// Get grid size for device_reduce_sweep_kernel
	int reduce_grid_size = even_share.grid_size;

	// Log device_reduce_sweep_kernel configuration
	#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
	_CubLog("Invoking DeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items "
	"per thread, %d SM occupancy\n",
	reduce_grid_size,
	ActivePolicyT::ReducePolicy::BLOCK_THREADS,
	(long long)stream,
	ActivePolicyT::ReducePolicy::ITEMS_PER_THREAD,
	reduce_config.sm_occupancy);
	#endif

	// Invoke DeviceReduceKernel
	THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
	reduce_grid_size,
	ActivePolicyT::ReducePolicy::BLOCK_THREADS,
	0,
	stream)
	.doit(reduce_kernel,
	d_in,
	d_block_reductions,
	num_items,
	even_share,
	reduction_op);

	// Check for failure to launch
	if (CubDebug(error = cudaPeekAtLastError()))
	{
	break;
	}

	// Sync the stream if specified to flush runtime errors
	error = detail::DebugSyncStream(stream);
	if (CubDebug(error))
	{
	break;
	}

	// Log single_reduce_sweep_kernel configuration
	#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
	_CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), "
	"%d items per thread\n",
	ActivePolicyT::SingleTilePolicy::BLOCK_THREADS,
	(long long)stream,
	ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD);
	#endif

	// Invoke DeviceReduceSingleTileKernel
	THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
	1,
	ActivePolicyT::SingleTilePolicy::BLOCK_THREADS,
	0,
	stream)
	.doit(single_tile_kernel,
	d_block_reductions,
	d_out,
	reduce_grid_size, // triple_chevron is not type safe, make sure to use int
	reduction_op,
	init);

	// Check for failure to launch
	if (CubDebug(error = cudaPeekAtLastError()))
	{
	break;
	}

	// Sync the stream if specified to flush runtime errors
	error = detail::DebugSyncStream(stream);
	if (CubDebug(error))
	{
	break;
	}
	} while (0);

	return error;
	}

	//---------------------------------------------------------------------------
	// Chained policy invocation
	//---------------------------------------------------------------------------

	/// Invocation
	template <typename ActivePolicyT>
	CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Invoke()
	{
	typedef typename ActivePolicyT::SingleTilePolicy SingleTilePolicyT;
	typedef typename DispatchReduce::MaxPolicy MaxPolicyT;

	// Force kernel code-generation in all compiler passes
	if (num_items <= (SingleTilePolicyT::BLOCK_THREADS *
	SingleTilePolicyT::ITEMS_PER_THREAD))
	{
	// Small, single tile size
	return InvokeSingleTile<ActivePolicyT>(
	DeviceReduceSingleTileKernel<MaxPolicyT,
	InputIteratorT,
	OutputIteratorT,
	OffsetT,
	ReductionOpT,
	InitT,
	AccumT>);
	}
	else
	{
	// Regular size
	return InvokePasses<ActivePolicyT>(
	DeviceReduceKernel<typename DispatchReduce::MaxPolicy,
	InputIteratorT,
	OffsetT,
	ReductionOpT,
	AccumT>,
	DeviceReduceSingleTileKernel<MaxPolicyT,
	AccumT *,
	OutputIteratorT,
	int, // Always used with int offsets
	ReductionOpT,
	InitT,
	AccumT>);
	}
	}

	//---------------------------------------------------------------------------
	// Dispatch entrypoints
	//---------------------------------------------------------------------------

	/**
	* @brief Internal dispatch routine for computing a device-wide reduction
	*
	* @param[in] d_temp_storage
	* Device-accessible allocation of temporary storage. When `nullptr`, the
	* required allocation size is written to `temp_storage_bytes` and no work
	* is done.
	*
	* @param[in,out] temp_storage_bytes
	* Reference to size in bytes of `d_temp_storage` allocation
	*
	* @param[in] d_in
	* Pointer to the input sequence of data items
	*
	* @param[out] d_out
	* Pointer to the output aggregate
	*
	* @param[in] num_items
	* Total number of input items (i.e., length of `d_in`)
	*
	* @param[in] reduction_op
	* Binary reduction functor
	*
	* @param[in] init
	* The initial value of the reduction
	*
	* @param[in] stream
	* [optional] CUDA stream to launch kernels within.
	* Default is stream<sub>0</sub>.
	*/
	CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t
	Dispatch(void *d_temp_storage,
	size_t &temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	OffsetT num_items,
	ReductionOpT reduction_op,
	InitT init,
	cudaStream_t stream)
	{
	typedef typename DispatchReduce::MaxPolicy MaxPolicyT;

	cudaError error = cudaSuccess;
	do
	{
	// Get PTX version
	int ptx_version = 0;
	if (CubDebug(error = PtxVersion(ptx_version)))
	{
	break;
	}

	// Create dispatch functor
	DispatchReduce dispatch(d_temp_storage,
	temp_storage_bytes,
	d_in,
	d_out,
	num_items,
	reduction_op,
	init,
	stream,
	ptx_version);

	// Dispatch to chained policy
	if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch)))
	{
	break;
	}
	} while (0);

	return error;
	}

	CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
	CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t
	Dispatch(void *d_temp_storage,
	size_t &temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	OffsetT num_items,
	ReductionOpT reduction_op,
	InitT init,
	cudaStream_t stream,
	bool debug_synchronous)
	{
	CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

	return Dispatch(d_temp_storage,
	temp_storage_bytes,
	d_in,
	d_out,
	num_items,
	reduction_op,
	init,
	stream);
	}
	};



	/******************************************************************************
	* Segmented dispatch
	*****************************************************************************/

	/**
	* @brief Utility class for dispatching the appropriately-tuned kernels for
	* device-wide reduction
	*
	* @tparam InputIteratorT
	* Random-access input iterator type for reading input items \iterator
	*
	* @tparam OutputIteratorT
	* Output iterator type for recording the reduced aggregate \iterator
	*
	* @tparam BeginOffsetIteratorT
	* Random-access input iterator type for reading segment beginning offsets
	* \iterator
	*
	* @tparam EndOffsetIteratorT
	* Random-access input iterator type for reading segment ending offsets
	* \iterator
	*
	* @tparam OffsetT
	* Signed integer type for global offsets
	*
	* @tparam ReductionOpT
	* Binary reduction functor type having member
	* `auto operator()(const T &a, const U &b)`
	*
	* @tparam InitT
	* value type
	*/
	template <
	typename InputIteratorT,
	typename OutputIteratorT,
	typename BeginOffsetIteratorT,
	typename EndOffsetIteratorT,
	typename OffsetT,
	typename ReductionOpT,
	typename InitT =
	cub::detail::non_void_value_t<
	OutputIteratorT,
	cub::detail::value_t<InputIteratorT>>,
	typename AccumT =
	detail::accumulator_t<
	ReductionOpT,
	InitT,
	cub::detail::value_t<InputIteratorT>>,
	typename SelectedPolicy = DeviceReducePolicy<AccumT, OffsetT, ReductionOpT>>
	struct DispatchSegmentedReduce : SelectedPolicy
	{
	//---------------------------------------------------------------------------
	// Problem state
	//---------------------------------------------------------------------------

	/// Device-accessible allocation of temporary storage. When `nullptr`, the
	/// required allocation size is written to `temp_storage_bytes` and no work
	/// is done.
	void *d_temp_storage;

	/// Reference to size in bytes of `d_temp_storage` allocation
	size_t &temp_storage_bytes;

	/// Pointer to the input sequence of data items
	InputIteratorT d_in;

	/// Pointer to the output aggregate
	OutputIteratorT d_out;

	/// The number of segments that comprise the sorting data
	OffsetT num_segments;

	/// Random-access input iterator to the sequence of beginning offsets of
	/// length `num_segments`, such that `d_begin_offsets[i]` is the first
	/// element of the i<sup>th</sup> data segment in `d_keys_*` and
	/// `d_values_*`
	BeginOffsetIteratorT d_begin_offsets;

	/// Random-access input iterator to the sequence of ending offsets of length
	/// `num_segments`, such that `d_end_offsets[i] - 1` is the last element of
	/// the i<sup>th</sup> data segment in `d_keys_` and `d_values_`.
	/// If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the i<sup>th</sup> is
	/// considered empty.
	EndOffsetIteratorT d_end_offsets;

	/// Binary reduction functor
	ReductionOpT reduction_op;

	/// The initial value of the reduction
	InitT init;

	/// CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
	cudaStream_t stream;

	int ptx_version;

	//---------------------------------------------------------------------------
	// Constructor
	//---------------------------------------------------------------------------

	/// Constructor
	CUB_RUNTIME_FUNCTION __forceinline__
	DispatchSegmentedReduce(void *d_temp_storage,
	size_t &temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	OffsetT num_segments,
	BeginOffsetIteratorT d_begin_offsets,
	EndOffsetIteratorT d_end_offsets,
	ReductionOpT reduction_op,
	InitT init,
	cudaStream_t stream,
	int ptx_version)
	: d_temp_storage(d_temp_storage)
	, temp_storage_bytes(temp_storage_bytes)
	, d_in(d_in)
	, d_out(d_out)
	, num_segments(num_segments)
	, d_begin_offsets(d_begin_offsets)
	, d_end_offsets(d_end_offsets)
	, reduction_op(reduction_op)
	, init(init)
	, stream(stream)
	, ptx_version(ptx_version)
	{}

	CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
	CUB_RUNTIME_FUNCTION __forceinline__
	DispatchSegmentedReduce(void *d_temp_storage,
	size_t &temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	OffsetT num_segments,
	BeginOffsetIteratorT d_begin_offsets,
	EndOffsetIteratorT d_end_offsets,
	ReductionOpT reduction_op,
	InitT init,
	cudaStream_t stream,
	bool debug_synchronous,
	int ptx_version)
	: d_temp_storage(d_temp_storage)
	, temp_storage_bytes(temp_storage_bytes)
	, d_in(d_in)
	, d_out(d_out)
	, num_segments(num_segments)
	, d_begin_offsets(d_begin_offsets)
	, d_end_offsets(d_end_offsets)
	, reduction_op(reduction_op)
	, init(init)
	, stream(stream)
	, ptx_version(ptx_version)
	{
	CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG
	}

	//---------------------------------------------------------------------------
	// Chained policy invocation
	//---------------------------------------------------------------------------

	/**
	* @brief Invocation
	*
	* @tparam ActivePolicyT
	* Umbrella policy active for the target device
	*
	* @tparam DeviceSegmentedReduceKernelT
	* Function type of cub::DeviceSegmentedReduceKernel
	*
	* @param[in] segmented_reduce_kernel
	* Kernel function pointer to parameterization of
	* cub::DeviceSegmentedReduceKernel
	*/
	template <typename ActivePolicyT, typename DeviceSegmentedReduceKernelT>
	CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t
	InvokePasses(DeviceSegmentedReduceKernelT segmented_reduce_kernel)
	{
	cudaError error = cudaSuccess;

	do
	{
	// Return if the caller is simply requesting the size of the storage
	// allocation
	if (d_temp_storage == NULL)
	{
	temp_storage_bytes = 1;
	return cudaSuccess;
	}

	// Init kernel configuration
	KernelConfig segmented_reduce_config;
	if (CubDebug(
	error = segmented_reduce_config
	.Init<typename ActivePolicyT::SegmentedReducePolicy>(
	segmented_reduce_kernel)))
	{
	break;
	}

	// Log device_reduce_sweep_kernel configuration
	#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
	_CubLog("Invoking SegmentedDeviceReduceKernel<<<%d, %d, 0, %lld>>>(), "
	"%d items per thread, %d SM occupancy\n",
	num_segments,
	ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS,
	(long long)stream,
	ActivePolicyT::SegmentedReducePolicy::ITEMS_PER_THREAD,
	segmented_reduce_config.sm_occupancy);
	#endif

	// Invoke DeviceReduceKernel
	THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
	num_segments,
	ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS,
	0,
	stream)
	.doit(segmented_reduce_kernel,
	d_in,
	d_out,
	d_begin_offsets,
	d_end_offsets,
	num_segments,
	reduction_op,
	init);

	// Check for failure to launch
	if (CubDebug(error = cudaPeekAtLastError()))
	{
	break;
	}

	// Sync the stream if specified to flush runtime errors
	error = detail::DebugSyncStream(stream);
	if (CubDebug(error))
	{
	break;
	}
	} while (0);

	return error;
	}

	/// Invocation
	template <typename ActivePolicyT>
	CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Invoke()
	{
	typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT;

	// Force kernel code-generation in all compiler passes
	return InvokePasses<ActivePolicyT>(
	DeviceSegmentedReduceKernel<MaxPolicyT,
	InputIteratorT,
	OutputIteratorT,
	BeginOffsetIteratorT,
	EndOffsetIteratorT,
	OffsetT,
	ReductionOpT,
	InitT,
	AccumT>);
	}

	//---------------------------------------------------------------------------
	// Dispatch entrypoints
	//---------------------------------------------------------------------------

	/**
	* @brief Internal dispatch routine for computing a device-wide reduction
	*
	* @param[in] d_temp_storage
	* Device-accessible allocation of temporary storage. When `nullptr`, the
	* required allocation size is written to `temp_storage_bytes` and no work
	* is done.
	*
	* @param[in,out] temp_storage_bytes
	* Reference to size in bytes of `d_temp_storage` allocation
	*
	* @param[in] d_in
	* Pointer to the input sequence of data items
	*
	* @param[out] d_out
	* Pointer to the output aggregate
	*
	* @param[in] num_segments
	* The number of segments that comprise the sorting data
	*
	* @param[in] d_begin_offsets
	* Random-access input iterator to the sequence of beginning offsets of
	* length `num_segments`, such that `d_begin_offsets[i]` is the first
	* element of the i<sup>th</sup> data segment in `d_keys_*` and
	* `d_values_*`
	*
	* @param[in] d_end_offsets
	* Random-access input iterator to the sequence of ending offsets of length
	* `num_segments`, such that `d_end_offsets[i] - 1` is the last element of
	* the i<sup>th</sup> data segment in `d_keys_` and `d_values_`.
	* If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the i<sup>th</sup> is
	* considered empty.
	*
	* @param[in] reduction_op
	* Binary reduction functor
	*
	* @param[in] init
	* The initial value of the reduction
	*
	* @param[in] stream
	* [optional] CUDA stream to launch kernels within.
	* Default is stream<sub>0</sub>.
	*/
	CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t
	Dispatch(void *d_temp_storage,
	size_t &temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	int num_segments,
	BeginOffsetIteratorT d_begin_offsets,
	EndOffsetIteratorT d_end_offsets,
	ReductionOpT reduction_op,
	InitT init,
	cudaStream_t stream)
	{
	typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT;

	if (num_segments <= 0)
	{
	return cudaSuccess;
	}

	cudaError error = cudaSuccess;

	do
	{
	// Get PTX version
	int ptx_version = 0;
	if (CubDebug(error = PtxVersion(ptx_version)))
	{
	break;
	}

	// Create dispatch functor
	DispatchSegmentedReduce dispatch(d_temp_storage,
	temp_storage_bytes,
	d_in,
	d_out,
	num_segments,
	d_begin_offsets,
	d_end_offsets,
	reduction_op,
	init,
	stream,
	ptx_version);

	// Dispatch to chained policy
	if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch)))
	{
	break;
	}
	} while (0);

	return error;
	}

	CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
	CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t
	Dispatch(void *d_temp_storage,
	size_t &temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	int num_segments,
	BeginOffsetIteratorT d_begin_offsets,
	EndOffsetIteratorT d_end_offsets,
	ReductionOpT reduction_op,
	InitT init,
	cudaStream_t stream,
	bool debug_synchronous)
	{
	CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

	return Dispatch(d_temp_storage,
	temp_storage_bytes,
	d_in,
	d_out,
	num_segments,
	d_begin_offsets,
	d_end_offsets,
	reduction_op,
	init,
	stream);
	}
	};


	CUB_NAMESPACE_END