thrust / install /include /cub /device /device_reduce.cuh

thanks to nvidia ❤

0dc1b04 over 2 years ago

48.5 kB

	/******************************************************************************
	* Copyright (c) 2011, Duane Merrill. All rights reserved.
	* Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions are met:
	* * Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* * Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	* * Neither the name of the NVIDIA CORPORATION nor the
	* names of its contributors may be used to endorse or promote products
	* derived from this software without specific prior written permission.
	*
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
	* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
	* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
	* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
	* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
	* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*
	******************************************************************************/

	/**
	* @file cub::DeviceReduce provides device-wide, parallel operations for
	* computing a reduction across a sequence of data items residing within
	* device-accessible memory.
	*/

	#pragma once

	#include <iterator>
	#include <limits>

	#include <cub/config.cuh>
	#include <cub/detail/choose_offset.cuh>
	#include <cub/device/dispatch/dispatch_reduce.cuh>
	#include <cub/device/dispatch/dispatch_reduce_by_key.cuh>
	#include <cub/iterator/arg_index_input_iterator.cuh>
	#include <cub/util_deprecated.cuh>

	CUB_NAMESPACE_BEGIN


	//! @ingroup SingleModule
	//!
	//! @rst
	//! DeviceReduce provides device-wide, parallel operations for computing
	//! a reduction across a sequence of data items residing within
	//! device-accessible memory.
	//!
	//! .. image:: ../img/reduce_logo.png
	//! :align: center
	//!
	//! Overview
	//! ====================================
	//! A `reduction <http://en.wikipedia.org/wiki/Reduce_(higher-order_function)>`_
	//! (or fold) uses a binary combining operator to compute a single aggregate
	//! from a sequence of input elements.
	//!
	//! Usage Considerations
	//! ====================================
	//! @cdp_class{DeviceReduce}
	//!
	//! Performance
	//! ====================================
	//! @linear_performance{reduction, reduce-by-key, and run-length encode}
	//!
	//! The following chart illustrates DeviceReduce::Sum
	//! performance across different CUDA architectures for \p int32 keys.
	//!
	//! .. image:: ../img/reduce_int32.png
	//! :align: center
	//!
	//! @par
	//! The following chart illustrates DeviceReduce::ReduceByKey (summation)
	//! performance across different CUDA architectures for `fp32` values. Segments
	//! are identified by `int32` keys, and have lengths uniformly sampled
	//! from `[1, 1000]`.
	//!
	//! .. image:: ../img/reduce_by_key_fp32_len_500.png
	//! :align: center
	//!
	//! @endrst
	struct DeviceReduce
	{
	/**
	* @brief Computes a device-wide reduction using the specified binary
	* `reduction_op` functor and initial value `init`.
	*
	* @par
	* - Does not support binary reduction operators that are non-commutative.
	* - Provides "run-to-run" determinism for pseudo-associative reduction
	* (e.g., addition of floating point types) on the same GPU device.
	* However, results for pseudo-associative reduction may be inconsistent
	* from one device to a another device of a different compute-capability
	* because CUB can employ different tile-sizing for different architectures.
	* - The range `[d_in, d_in + num_items)` shall not overlap `d_out`.
	* - @devicestorage
	*
	* @par Snippet
	* The code snippet below illustrates a user-defined min-reduction of a
	* device vector of `int` data elements.
	* @par
	* @code
	* #include <cub/cub.cuh>
	* // or equivalently <cub/device/device_radix_sort.cuh>
	*
	* // CustomMin functor
	* struct CustomMin
	* {
	* template <typename T>
	* __device__ __forceinline__
	* T operator()(const T &a, const T &b) const {
	* return (b < a) ? b : a;
	* }
	* };
	*
	* // Declare, allocate, and initialize device-accessible pointers for
	* // input and output
	* int num_items; // e.g., 7
	* int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
	* int *d_out; // e.g., [-]
	* CustomMin min_op;
	* int init; // e.g., INT_MAX
	* ...
	*
	* // Determine temporary device storage requirements
	* void *d_temp_storage = NULL;
	* size_t temp_storage_bytes = 0;
	* cub::DeviceReduce::Reduce(
	* d_temp_storage, temp_storage_bytes,
	* d_in, d_out, num_items, min_op, init);
	*
	* // Allocate temporary storage
	* cudaMalloc(&d_temp_storage, temp_storage_bytes);
	*
	* // Run reduction
	* cub::DeviceReduce::Reduce(
	* d_temp_storage, temp_storage_bytes,
	* d_in, d_out, num_items, min_op, init);
	*
	* // d_out <-- [0]
	* @endcode
	*
	* @tparam InputIteratorT
	* [inferred] Random-access input iterator type for reading input
	* items \iterator
	*
	* @tparam OutputIteratorT
	* [inferred] Output iterator type for recording the reduced
	* aggregate \iterator
	*
	* @tparam ReductionOpT
	* [inferred] Binary reduction functor type having member
	* `T operator()(const T &a, const T &b)`
	*
	* @tparam T
	* [inferred] Data element type that is convertible to the `value` type
	* of `InputIteratorT`
	*
	* @tparam NumItemsT [inferred] Type of num_items
	*
	* @param[in] d_temp_storage
	* Device-accessible allocation of temporary storage. When `nullptr`, the
	* required allocation size is written to `temp_storage_bytes` and no work
	* is done.
	*
	* @param[in,out] temp_storage_bytes
	* Reference to size in bytes of `d_temp_storage` allocation
	*
	* @param d_in[in]
	* Pointer to the input sequence of data items
	*
	* @param d_out[out]
	* Pointer to the output aggregate
	*
	* @param num_items[in]
	* Total number of input items (i.e., length of `d_in`)
	*
	* @param reduction_op[in]
	* Binary reduction functor
	*
	* @param[in] init
	* Initial value of the reduction
	*
	* @param[in] stream
	* [optional] CUDA stream to launch kernels within.
	* Default is stream<sub>0</sub>.
	*/
	template <typename InputIteratorT,
	typename OutputIteratorT,
	typename ReductionOpT,
	typename T,
	typename NumItemsT>
	CUB_RUNTIME_FUNCTION static cudaError_t Reduce(void *d_temp_storage,
	size_t &temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	NumItemsT num_items,
	ReductionOpT reduction_op,
	T init,
	cudaStream_t stream = 0)
	{
	// Signed integer type for global offsets
	using OffsetT = typename detail::ChooseOffsetT<NumItemsT>::Type;

	return DispatchReduce<InputIteratorT,
	OutputIteratorT,
	OffsetT,
	ReductionOpT,
	T>::Dispatch(d_temp_storage,
	temp_storage_bytes,
	d_in,
	d_out,
	static_cast<OffsetT>(num_items),
	reduction_op,
	init,
	stream);
	}

	template <typename InputIteratorT,
	typename OutputIteratorT,
	typename ReductionOpT,
	typename T>
	CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
	CUB_RUNTIME_FUNCTION static cudaError_t Reduce(void *d_temp_storage,
	size_t &temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	int num_items,
	ReductionOpT reduction_op,
	T init,
	cudaStream_t stream,
	bool debug_synchronous)
	{
	CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

	return Reduce<InputIteratorT, OutputIteratorT, ReductionOpT, T>(
	d_temp_storage,
	temp_storage_bytes,
	d_in,
	d_out,
	num_items,
	reduction_op,
	init,
	stream);
	}

	/**
	* @brief Computes a device-wide sum using the addition (`+`) operator.
	*
	* @par
	* - Uses `0` as the initial value of the reduction.
	* - Does not support \p + operators that are non-commutative..
	* - Provides "run-to-run" determinism for pseudo-associative reduction
	* (e.g., addition of floating point types) on the same GPU device.
	* However, results for pseudo-associative reduction may be inconsistent
	* from one device to a another device of a different compute-capability
	* because CUB can employ different tile-sizing for different architectures.
	* - The range `[d_in, d_in + num_items)` shall not overlap `d_out`.
	* - @devicestorage
	*
	* @par Performance
	* The following charts illustrate saturated sum-reduction performance across
	* different CUDA architectures for `int32` and `int64` items, respectively.
	*
	* @image html reduce_int32.png
	* @image html reduce_int64.png
	*
	* @par Snippet
	* The code snippet below illustrates the sum-reduction of a device vector
	* of `int` data elements.
	* @par
	* @code
	* #include <cub/cub.cuh>
	* // or equivalently <cub/device/device_radix_sort.cuh>
	*
	* // Declare, allocate, and initialize device-accessible pointers
	* // for input and output
	* int num_items; // e.g., 7
	* int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
	* int *d_out; // e.g., [-]
	* ...
	*
	* // Determine temporary device storage requirements
	* void *d_temp_storage = NULL;
	* size_t temp_storage_bytes = 0;
	* cub::DeviceReduce::Sum(
	* d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
	*
	* // Allocate temporary storage
	* cudaMalloc(&d_temp_storage, temp_storage_bytes);
	*
	* // Run sum-reduction
	* cub::DeviceReduce::Sum(
	* d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
	*
	* // d_out <-- [38]
	* @endcode
	*
	* @tparam InputIteratorT
	* [inferred] Random-access input iterator type for reading input
	* items \iterator
	*
	* @tparam OutputIteratorT
	* [inferred] Output iterator type for recording the reduced
	* aggregate \iterator
	*
	* @tparam NumItemsT [inferred] Type of num_items
	*
	* @param[in] d_temp_storage
	* Device-accessible allocation of temporary storage. When `nullptr`, the
	* required allocation size is written to `temp_storage_bytes` and no work
	* is done.
	*
	* @param[in,out] temp_storage_bytes
	* Reference to size in bytes of `d_temp_storage` allocation
	*
	* @param[in] d_in
	* Pointer to the input sequence of data items
	*
	* @param[out] d_out
	* Pointer to the output aggregate
	*
	* @param[in] num_items
	* Total number of input items (i.e., length of `d_in`)
	*
	* @param[in] stream
	* [optional] CUDA stream to launch kernels within.
	* Default is stream<sub>0</sub>.
	*/
	template <typename InputIteratorT,
	typename OutputIteratorT,
	typename NumItemsT>
	CUB_RUNTIME_FUNCTION static cudaError_t Sum(void *d_temp_storage,
	size_t &temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	NumItemsT num_items,
	cudaStream_t stream = 0)
	{
	// Signed integer type for global offsets
	using OffsetT = typename detail::ChooseOffsetT<NumItemsT>::Type;

	// The output value type
	using OutputT =
	cub::detail::non_void_value_t<OutputIteratorT,
	cub::detail::value_t<InputIteratorT>>;

	using InitT = OutputT;

	return DispatchReduce<InputIteratorT,
	OutputIteratorT,
	OffsetT,
	cub::Sum,
	InitT>::Dispatch(d_temp_storage,
	temp_storage_bytes,
	d_in,
	d_out,
	static_cast<OffsetT>(num_items),
	cub::Sum(),
	InitT{}, // zero-initialize
	stream);
	}

	template <typename InputIteratorT, typename OutputIteratorT>
	CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
	CUB_RUNTIME_FUNCTION static cudaError_t Sum(void *d_temp_storage,
	size_t &temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	int num_items,
	cudaStream_t stream,
	bool debug_synchronous)
	{
	CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

	return Sum<InputIteratorT, OutputIteratorT>(d_temp_storage,
	temp_storage_bytes,
	d_in,
	d_out,
	num_items,
	stream);
	}

	/**
	* @brief Computes a device-wide minimum using the less-than ('<') operator.
	*
	* @par
	* - Uses `std::numeric_limits<T>::max()` as the initial value of the reduction.
	* - Does not support `<` operators that are non-commutative.
	* - Provides "run-to-run" determinism for pseudo-associative reduction
	* (e.g., addition of floating point types) on the same GPU device.
	* However, results for pseudo-associative reduction may be inconsistent
	* from one device to a another device of a different compute-capability
	* because CUB can employ different tile-sizing for different architectures.
	* - The range `[d_in, d_in + num_items)` shall not overlap `d_out`.
	* - @devicestorage
	*
	* @par Snippet
	* The code snippet below illustrates the min-reduction of a device vector of
	* `int` data elements.
	* @par
	* @code
	* #include <cub/cub.cuh>
	* // or equivalently <cub/device/device_radix_sort.cuh>
	*
	* // Declare, allocate, and initialize device-accessible pointers
	* // for input and output
	* int num_items; // e.g., 7
	* int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
	* int *d_out; // e.g., [-]
	* ...
	*
	* // Determine temporary device storage requirements
	* void *d_temp_storage = NULL;
	* size_t temp_storage_bytes = 0;
	* cub::DeviceReduce::Min(
	* d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
	*
	* // Allocate temporary storage
	* cudaMalloc(&d_temp_storage, temp_storage_bytes);
	*
	* // Run min-reduction
	* cub::DeviceReduce::Min(
	* d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
	*
	* // d_out <-- [0]
	* @endcode
	*
	* @tparam InputIteratorT
	* [inferred] Random-access input iterator type for reading input
	* items \iterator
	*
	* @tparam OutputIteratorT
	* [inferred] Output iterator type for recording the reduced
	* aggregate \iterator
	*
	* @tparam NumItemsT [inferred] Type of num_items
	*
	* @param[in] d_temp_storage
	* Device-accessible allocation of temporary storage. When `nullptr`, the
	* required allocation size is written to `temp_storage_bytes` and no work
	* is done.
	*
	* @param[in,out] temp_storage_bytes
	* Reference to size in bytes of `d_temp_storage` allocation
	*
	* @param[in] d_in
	* Pointer to the input sequence of data items
	*
	* @param[out] d_out
	* Pointer to the output aggregate
	*
	* @param[in] num_items
	* Total number of input items (i.e., length of `d_in`)
	*
	* @param[in] stream
	* [optional] CUDA stream to launch kernels within.
	* Default is stream<sub>0</sub>.
	*/
	template <typename InputIteratorT,
	typename OutputIteratorT,
	typename NumItemsT>
	CUB_RUNTIME_FUNCTION static cudaError_t Min(void *d_temp_storage,
	size_t &temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	NumItemsT num_items,
	cudaStream_t stream = 0)
	{
	// Signed integer type for global offsets
	using OffsetT = typename detail::ChooseOffsetT<NumItemsT>::Type;

	// The input value type
	using InputT = cub::detail::value_t<InputIteratorT>;

	using InitT = InputT;

	return DispatchReduce<InputIteratorT,
	OutputIteratorT,
	OffsetT,
	cub::Min,
	InitT>::Dispatch(d_temp_storage,
	temp_storage_bytes,
	d_in,
	d_out,
	static_cast<OffsetT>(num_items),
	cub::Min(),
	// replace with
	// std::numeric_limits<T>::max() when
	// C++11 support is more prevalent
	Traits<InitT>::Max(),
	stream);
	}

	template <typename InputIteratorT, typename OutputIteratorT>
	CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
	CUB_RUNTIME_FUNCTION static cudaError_t Min(void *d_temp_storage,
	size_t &temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	int num_items,
	cudaStream_t stream,
	bool debug_synchronous)
	{
	CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

	return Min<InputIteratorT, OutputIteratorT>(d_temp_storage,
	temp_storage_bytes,
	d_in,
	d_out,
	num_items,
	stream);
	}

	/**
	* @brief Finds the first device-wide minimum using the less-than ('<')
	* operator, also returning the index of that item.
	*
	* @par
	* - The output value type of `d_out` is cub::KeyValuePair `<int, T>`
	* (assuming the value type of `d_in` is `T`)
	* - The minimum is written to `d_out.value` and its offset in the input
	* array is written to `d_out.key`.
	* - The `{1, std::numeric_limits<T>::max()}` tuple is produced for
	* zero-length inputs
	* - Does not support `<` operators that are non-commutative.
	* - Provides "run-to-run" determinism for pseudo-associative reduction
	* (e.g., addition of floating point types) on the same GPU device.
	* However, results for pseudo-associative reduction may be inconsistent
	* from one device to a another device of a different compute-capability
	* because CUB can employ different tile-sizing for different architectures.
	* - The range `[d_in, d_in + num_items)` shall not overlap `d_out`.
	* - @devicestorage
	*
	* @par Snippet
	* The code snippet below illustrates the argmin-reduction of a device vector
	* of `int` data elements.
	* @par
	* @code
	* #include <cub/cub.cuh>
	* // or equivalently <cub/device/device_radix_sort.cuh>
	*
	* // Declare, allocate, and initialize device-accessible pointers
	* // for input and output
	* int num_items; // e.g., 7
	* int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
	* KeyValuePair<int, int> *d_out; // e.g., [{-,-}]
	* ...
	*
	* // Determine temporary device storage requirements
	* void *d_temp_storage = NULL;
	* size_t temp_storage_bytes = 0;
	* cub::DeviceReduce::ArgMin(
	* d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items);
	*
	* // Allocate temporary storage
	* cudaMalloc(&d_temp_storage, temp_storage_bytes);
	*
	* // Run argmin-reduction
	* cub::DeviceReduce::ArgMin(
	* d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items);
	*
	* // d_out <-- [{5, 0}]
	*
	* @endcode
	*
	* @tparam InputIteratorT
	* [inferred] Random-access input iterator type for reading input items
	* (of some type `T`) \iterator
	*
	* @tparam OutputIteratorT
	* [inferred] Output iterator type for recording the reduced aggregate
	* (having value type `cub::KeyValuePair<int, T>`) \iterator
	*
	* @param[in] d_temp_storage
	* Device-accessible allocation of temporary storage. When `nullptr`, the
	* required allocation size is written to \p temp_storage_bytes and no work is done.
	*
	* @param[in,out] temp_storage_bytes
	* Reference to size in bytes of `d_temp_storage` allocation
	*
	* @param[in] d_in
	* Pointer to the input sequence of data items
	*
	* @param[out] d_out
	* Pointer to the output aggregate
	*
	* @param[in] num_items
	* Total number of input items (i.e., length of `d_in`)
	*
	* @param[in] stream
	* [optional] CUDA stream to launch kernels within.
	* Default is stream<sub>0</sub>.
	*/
	template <typename InputIteratorT,
	typename OutputIteratorT>
	CUB_RUNTIME_FUNCTION static cudaError_t ArgMin(void *d_temp_storage,
	size_t &temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	int num_items,
	cudaStream_t stream = 0)
	{
	// Signed integer type for global offsets
	using OffsetT = int;

	// The input type
	using InputValueT = cub::detail::value_t<InputIteratorT>;

	// The output tuple type
	using OutputTupleT =
	cub::detail::non_void_value_t<OutputIteratorT, KeyValuePair<OffsetT, InputValueT>>;

	using AccumT = OutputTupleT;

	using InitT = detail::reduce::empty_problem_init_t<AccumT>;

	// The output value type
	using OutputValueT = typename OutputTupleT::Value;

	// Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
	using ArgIndexInputIteratorT =
	ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;

	ArgIndexInputIteratorT d_indexed_in(d_in);

	// Initial value
	// TODO Address https://github.com/NVIDIA/cub/issues/651
	InitT initial_value{AccumT(1, Traits<InputValueT>::Max())};

	return DispatchReduce<ArgIndexInputIteratorT,
	OutputIteratorT,
	OffsetT,
	cub::ArgMin,
	InitT,
	AccumT>::Dispatch(d_temp_storage,
	temp_storage_bytes,
	d_indexed_in,
	d_out,
	num_items,
	cub::ArgMin(),
	initial_value,
	stream);
	}

	template <typename InputIteratorT, typename OutputIteratorT>
	CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
	CUB_RUNTIME_FUNCTION static cudaError_t ArgMin(void *d_temp_storage,
	size_t &temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	int num_items,
	cudaStream_t stream,
	bool debug_synchronous)
	{
	CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

	return ArgMin<InputIteratorT, OutputIteratorT>(d_temp_storage,
	temp_storage_bytes,
	d_in,
	d_out,
	num_items,
	stream);
	}

	/**
	* @brief Computes a device-wide maximum using the greater-than ('>') operator.
	*
	* @par
	* - Uses `std::numeric_limits<T>::lowest()` as the initial value of the
	* reduction.
	* - Does not support `>` operators that are non-commutative.
	* - Provides "run-to-run" determinism for pseudo-associative reduction
	* (e.g., addition of floating point types) on the same GPU device.
	* However, results for pseudo-associative reduction may be inconsistent
	* from one device to a another device of a different compute-capability
	* because CUB can employ different tile-sizing for different architectures.
	* - The range `[d_in, d_in + num_items)` shall not overlap `d_out`.
	* - @devicestorage
	*
	* @par Snippet
	* The code snippet below illustrates the max-reduction of a device vector of
	* `int` data elements.
	* @par
	* @code
	* #include <cub/cub.cuh>
	* // or equivalently <cub/device/device_radix_sort.cuh>
	*
	* // Declare, allocate, and initialize device-accessible pointers
	* // for input and output
	* int num_items; // e.g., 7
	* int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
	* int *d_out; // e.g., [-]
	* ...
	*
	* // Determine temporary device storage requirements
	* void *d_temp_storage = NULL;
	* size_t temp_storage_bytes = 0;
	* cub::DeviceReduce::Max(
	* d_temp_storage, temp_storage_bytes, d_in, d_max, num_items);
	*
	* // Allocate temporary storage
	* cudaMalloc(&d_temp_storage, temp_storage_bytes);
	*
	* // Run max-reduction
	* cub::DeviceReduce::Max(
	* d_temp_storage, temp_storage_bytes, d_in, d_max, num_items);
	*
	* // d_out <-- [9]
	* @endcode
	*
	* @tparam InputIteratorT
	* [inferred] Random-access input iterator type for reading input
	* items \iterator
	*
	* @tparam OutputIteratorT
	* [inferred] Output iterator type for recording the reduced
	* aggregate \iterator
	*
	* @tparam NumItemsT [inferred] Type of num_items
	*
	* @param[in] d_temp_storage
	* Device-accessible allocation of temporary storage. When `nullptr`, the
	* required allocation size is written to `temp_storage_bytes` and no work
	* is done.
	*
	* @param[in,out] temp_storage_bytes
	* Reference to size in bytes of `d_temp_storage` allocation
	*
	* @param[in] d_in
	* Pointer to the input sequence of data items
	*
	* @param[out] d_out
	* Pointer to the output aggregate
	*
	* @param[in] num_items
	* Total number of input items (i.e., length of `d_in`)
	*
	* @param[in] stream
	* [optional] CUDA stream to launch kernels within.
	* Default is stream<sub>0</sub>.
	*/
	template <typename InputIteratorT,
	typename OutputIteratorT,
	typename NumItemsT>
	CUB_RUNTIME_FUNCTION static cudaError_t Max(void *d_temp_storage,
	size_t &temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	NumItemsT num_items,
	cudaStream_t stream = 0)
	{
	// Signed integer type for global offsets
	using OffsetT = typename detail::ChooseOffsetT<NumItemsT>::Type;

	// The input value type
	using InputT = cub::detail::value_t<InputIteratorT>;

	using InitT = InputT;

	return DispatchReduce<InputIteratorT,
	OutputIteratorT,
	OffsetT,
	cub::Max,
	InitT>::Dispatch(d_temp_storage,
	temp_storage_bytes,
	d_in,
	d_out,
	static_cast<OffsetT>(num_items),
	cub::Max(),
	// replace with
	// std::numeric_limits<T>::lowest()
	// when C++11 support is more
	// prevalent
	Traits<InitT>::Lowest(),
	stream);
	}

	template <typename InputIteratorT, typename OutputIteratorT>
	CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
	CUB_RUNTIME_FUNCTION static cudaError_t Max(void *d_temp_storage,
	size_t &temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	int num_items,
	cudaStream_t stream,
	bool debug_synchronous)
	{
	CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

	return Max<InputIteratorT, OutputIteratorT>(d_temp_storage,
	temp_storage_bytes,
	d_in,
	d_out,
	num_items,
	stream);
	}

	/**
	* @brief Finds the first device-wide maximum using the greater-than ('>')
	* operator, also returning the index of that item
	*
	* @par
	* - The output value type of `d_out` is cub::KeyValuePair `<int, T>`
	* (assuming the value type of `d_in` is `T`)
	* - The maximum is written to `d_out.value` and its offset in the input
	* array is written to `d_out.key`.
	* - The `{1, std::numeric_limits<T>::lowest()}` tuple is produced for
	* zero-length inputs
	* - Does not support `>` operators that are non-commutative.
	* - Provides "run-to-run" determinism for pseudo-associative reduction
	* (e.g., addition of floating point types) on the same GPU device.
	* However, results for pseudo-associative reduction may be inconsistent
	* from one device to a another device of a different compute-capability
	* because CUB can employ different tile-sizing for different architectures.
	* - The range `[d_in, d_in + num_items)` shall not overlap `d_out`.
	* - @devicestorage
	*
	* @par Snippet
	* The code snippet below illustrates the argmax-reduction of a device vector
	* of `int` data elements.
	* @par
	* @code
	* #include <cub/cub.cuh>
	* // or equivalently <cub/device/device_reduce.cuh>
	*
	* // Declare, allocate, and initialize device-accessible pointers
	* // for input and output
	* int num_items; // e.g., 7
	* int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
	* KeyValuePair<int, int> *d_out; // e.g., [{-,-}]
	* ...
	*
	* // Determine temporary device storage requirements
	* void *d_temp_storage = NULL;
	* size_t temp_storage_bytes = 0;
	* cub::DeviceReduce::ArgMax(
	* d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items);
	*
	* // Allocate temporary storage
	* cudaMalloc(&d_temp_storage, temp_storage_bytes);
	*
	* // Run argmax-reduction
	* cub::DeviceReduce::ArgMax(
	* d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items);
	*
	* // d_out <-- [{6, 9}]
	*
	* @endcode
	*
	* @tparam InputIteratorT
	* [inferred] Random-access input iterator type for reading input items
	* (of some type \p T) \iterator
	*
	* @tparam OutputIteratorT
	* [inferred] Output iterator type for recording the reduced aggregate
	* (having value type `cub::KeyValuePair<int, T>`) \iterator
	*
	* @param[in] d_temp_storage
	* Device-accessible allocation of temporary storage. When `nullptr`, the
	* required allocation size is written to `temp_storage_bytes` and no work
	* is done.
	*
	* @param[in,out] temp_storage_bytes
	* Reference to size in bytes of `d_temp_storage` allocation
	*
	* @param[in] d_in
	* Pointer to the input sequence of data items
	*
	* @param[out] d_out
	* Pointer to the output aggregate
	*
	* @param[in] num_items
	* Total number of input items (i.e., length of `d_in`)
	*
	* @param[in] stream
	* [optional] CUDA stream to launch kernels within.
	* Default is stream<sub>0</sub>.
	*/
	template <typename InputIteratorT,
	typename OutputIteratorT>
	CUB_RUNTIME_FUNCTION static cudaError_t ArgMax(void *d_temp_storage,
	size_t &temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	int num_items,
	cudaStream_t stream = 0)
	{
	// Signed integer type for global offsets
	using OffsetT = int;

	// The input type
	using InputValueT = cub::detail::value_t<InputIteratorT>;

	// The output tuple type
	using OutputTupleT =
	cub::detail::non_void_value_t<OutputIteratorT,
	KeyValuePair<OffsetT, InputValueT>>;

	using AccumT = OutputTupleT;

	// The output value type
	using OutputValueT = typename OutputTupleT::Value;

	using InitT = detail::reduce::empty_problem_init_t<AccumT>;

	// Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
	using ArgIndexInputIteratorT =
	ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;

	ArgIndexInputIteratorT d_indexed_in(d_in);

	// Initial value
	// TODO Address https://github.com/NVIDIA/cub/issues/651
	InitT initial_value{AccumT(1, Traits<InputValueT>::Lowest())};

	return DispatchReduce<ArgIndexInputIteratorT,
	OutputIteratorT,
	OffsetT,
	cub::ArgMax,
	InitT,
	AccumT>::Dispatch(d_temp_storage,
	temp_storage_bytes,
	d_indexed_in,
	d_out,
	num_items,
	cub::ArgMax(),
	initial_value,
	stream);
	}

	template <typename InputIteratorT, typename OutputIteratorT>
	CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
	CUB_RUNTIME_FUNCTION static cudaError_t ArgMax(void *d_temp_storage,
	size_t &temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	int num_items,
	cudaStream_t stream,
	bool debug_synchronous)
	{
	CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

	return ArgMax<InputIteratorT, OutputIteratorT>(d_temp_storage,
	temp_storage_bytes,
	d_in,
	d_out,
	num_items,
	stream);
	}

	/**
	* @brief Reduces segments of values, where segments are demarcated by
	* corresponding runs of identical keys.
	*
	* @par
	* This operation computes segmented reductions within `d_values_in` using
	* the specified binary `reduction_op` functor. The segments are identified
	* by "runs" of corresponding keys in `d_keys_in`, where runs are maximal
	* ranges of consecutive, identical keys. For the i<sup>th</sup> run
	* encountered, the first key of the run and the corresponding value
	* aggregate of that run are written to `d_unique_out[i] and
	* `d_aggregates_out[i]`, respectively. The total number of runs encountered
	* is written to `d_num_runs_out`.
	*
	* @par
	* - The `==` equality operator is used to determine whether keys are
	* equivalent
	* - Provides "run-to-run" determinism for pseudo-associative reduction
	* (e.g., addition of floating point types) on the same GPU device.
	* However, results for pseudo-associative reduction may be inconsistent
	* from one device to a another device of a different compute-capability
	* because CUB can employ different tile-sizing for different architectures.
	* - Let `out` be any of
	* `[d_unique_out, d_unique_out + *d_num_runs_out)`
	* `[d_aggregates_out, d_aggregates_out + *d_num_runs_out)`
	* `d_num_runs_out`. The ranges represented by `out` shall not overlap
	* `[d_keys_in, d_keys_in + num_items)`,
	* `[d_values_in, d_values_in + num_items)` nor `out` in any way.
	* - @devicestorage
	*
	* @par Performance
	* The following chart illustrates reduction-by-key (sum) performance across
	* different CUDA architectures for `fp32` and `fp64` values, respectively.
	* Segments are identified by `int32` keys, and have lengths uniformly
	* sampled from `[1, 1000]`.
	*
	* @image html reduce_by_key_fp32_len_500.png
	* @image html reduce_by_key_fp64_len_500.png
	*
	* @par
	* The following charts are similar, but with segment lengths uniformly
	* sampled from [1,10]:
	*
	* @image html reduce_by_key_fp32_len_5.png
	* @image html reduce_by_key_fp64_len_5.png
	*
	* @par Snippet
	* The code snippet below illustrates the segmented reduction of `int` values
	* grouped by runs of associated `int` keys.
	* @par
	* @code
	* #include <cub/cub.cuh>
	* // or equivalently <cub/device/device_reduce.cuh>
	*
	* // CustomMin functor
	* struct CustomMin
	* {
	* template <typename T>
	* CUB_RUNTIME_FUNCTION __forceinline__
	* T operator()(const T &a, const T &b) const {
	* return (b < a) ? b : a;
	* }
	* };
	*
	* // Declare, allocate, and initialize device-accessible pointers
	* // for input and output
	* int num_items; // e.g., 8
	* int *d_keys_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
	* int *d_values_in; // e.g., [0, 7, 1, 6, 2, 5, 3, 4]
	* int *d_unique_out; // e.g., [-, -, -, -, -, -, -, -]
	* int *d_aggregates_out; // e.g., [-, -, -, -, -, -, -, -]
	* int *d_num_runs_out; // e.g., [-]
	* CustomMin reduction_op;
	* ...
	*
	* // Determine temporary device storage requirements
	* void *d_temp_storage = NULL;
	* size_t temp_storage_bytes = 0;
	* cub::DeviceReduce::ReduceByKey(
	* d_temp_storage, temp_storage_bytes,
	* d_keys_in, d_unique_out, d_values_in,
	* d_aggregates_out, d_num_runs_out, reduction_op, num_items);
	*
	* // Allocate temporary storage
	* cudaMalloc(&d_temp_storage, temp_storage_bytes);
	*
	* // Run reduce-by-key
	* cub::DeviceReduce::ReduceByKey(
	* d_temp_storage, temp_storage_bytes,
	* d_keys_in, d_unique_out, d_values_in,
	* d_aggregates_out, d_num_runs_out, reduction_op, num_items);
	*
	* // d_unique_out <-- [0, 2, 9, 5, 8]
	* // d_aggregates_out <-- [0, 1, 6, 2, 4]
	* // d_num_runs_out <-- [5]
	* @endcode
	*
	* @tparam KeysInputIteratorT
	* [inferred] Random-access input iterator type for reading input
	* keys \iterator
	*
	* @tparam UniqueOutputIteratorT
	* [inferred] Random-access output iterator type for writing unique
	* output keys \iterator
	*
	* @tparam ValuesInputIteratorT
	* [inferred] Random-access input iterator type for reading input
	* values \iterator
	*
	* @tparam AggregatesOutputIterator
	* [inferred] Random-access output iterator type for writing output
	* value aggregates \iterator
	*
	* @tparam NumRunsOutputIteratorT
	* [inferred] Output iterator type for recording the number of runs
	* encountered \iterator
	*
	* @tparam ReductionOpT
	* *[inferred]8 Binary reduction functor type having member
	* `T operator()(const T &a, const T &b)`
	*
	* @tparam NumItemsT [inferred] Type of num_items
	*
	* @param[in] d_temp_storage
	* Device-accessible allocation of temporary storage. When `nullptr`, the
	* required allocation size is written to `temp_storage_bytes` and no work
	* is done.
	*
	* @param[in,out] temp_storage_bytes
	* Reference to size in bytes of `d_temp_storage` allocation
	*
	* @param[in] d_keys_in
	* Pointer to the input sequence of keys
	*
	* @param[out] d_unique_out
	* Pointer to the output sequence of unique keys (one key per run)
	*
	* @param[in] d_values_in
	* Pointer to the input sequence of corresponding values
	*
	* @param[out] d_aggregates_out
	* Pointer to the output sequence of value aggregates
	* (one aggregate per run)
	*
	* @param[out] d_num_runs_out
	* Pointer to total number of runs encountered
	* (i.e., the length of `d_unique_out`)
	*
	* @param[in] reduction_op
	* Binary reduction functor
	*
	* @param[in] num_items
	* Total number of associated key+value pairs
	* (i.e., the length of `d_in_keys` and `d_in_values`)
	*
	* @param[in] stream
	* [optional] CUDA stream to launch kernels within.
	* Default is stream<sub>0</sub>.
	*/
	template <typename KeysInputIteratorT,
	typename UniqueOutputIteratorT,
	typename ValuesInputIteratorT,
	typename AggregatesOutputIteratorT,
	typename NumRunsOutputIteratorT,
	typename ReductionOpT,
	typename NumItemsT>
	CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t
	ReduceByKey(void *d_temp_storage,
	size_t &temp_storage_bytes,
	KeysInputIteratorT d_keys_in,
	UniqueOutputIteratorT d_unique_out,
	ValuesInputIteratorT d_values_in,
	AggregatesOutputIteratorT d_aggregates_out,
	NumRunsOutputIteratorT d_num_runs_out,
	ReductionOpT reduction_op,
	NumItemsT num_items,
	cudaStream_t stream = 0)
	{
	// Signed integer type for global offsets
	using OffsetT = typename detail::ChooseOffsetT<NumItemsT>::Type;

	// FlagT iterator type (not used)

	// Selection op (not used)

	// Default == operator
	typedef Equality EqualityOp;

	return DispatchReduceByKey<KeysInputIteratorT,
	UniqueOutputIteratorT,
	ValuesInputIteratorT,
	AggregatesOutputIteratorT,
	NumRunsOutputIteratorT,
	EqualityOp,
	ReductionOpT,
	OffsetT>::Dispatch(d_temp_storage,
	temp_storage_bytes,
	d_keys_in,
	d_unique_out,
	d_values_in,
	d_aggregates_out,
	d_num_runs_out,
	EqualityOp(),
	reduction_op,
	static_cast<OffsetT>(num_items),
	stream);
	}

	template <typename KeysInputIteratorT,
	typename UniqueOutputIteratorT,
	typename ValuesInputIteratorT,
	typename AggregatesOutputIteratorT,
	typename NumRunsOutputIteratorT,
	typename ReductionOpT>
	CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
	CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t
	ReduceByKey(void *d_temp_storage,
	size_t &temp_storage_bytes,
	KeysInputIteratorT d_keys_in,
	UniqueOutputIteratorT d_unique_out,
	ValuesInputIteratorT d_values_in,
	AggregatesOutputIteratorT d_aggregates_out,
	NumRunsOutputIteratorT d_num_runs_out,
	ReductionOpT reduction_op,
	int num_items,
	cudaStream_t stream,
	bool debug_synchronous)
	{
	CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

	return ReduceByKey<KeysInputIteratorT,
	UniqueOutputIteratorT,
	ValuesInputIteratorT,
	AggregatesOutputIteratorT,
	NumRunsOutputIteratorT,
	ReductionOpT>(d_temp_storage,
	temp_storage_bytes,
	d_keys_in,
	d_unique_out,
	d_values_in,
	d_aggregates_out,
	d_num_runs_out,
	reduction_op,
	num_items,
	stream);
	}
	};

	/**
	* @example example_device_reduce.cu
	*/

	CUB_NAMESPACE_END