thrust / install /include /cub /device /device_partition.cuh

thanks to nvidia ❤

0dc1b04 over 2 years ago

29 kB

	/******************************************************************************
	* Copyright (c) 2011, Duane Merrill. All rights reserved.
	* Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions are met:
	* * Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* * Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	* * Neither the name of the NVIDIA CORPORATION nor the
	* names of its contributors may be used to endorse or promote products
	* derived from this software without specific prior written permission.
	*
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
	* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
	* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
	* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
	* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
	* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*
	******************************************************************************/

	/**
	* @file
	* cub::DevicePartition provides device-wide, parallel operations for
	* partitioning sequences of data items residing within device-accessible memory.
	*/

	#pragma once

	#include <stdio.h>
	#include <iterator>

	#include <cub/config.cuh>
	#include <cub/device/dispatch/dispatch_select_if.cuh>
	#include <cub/device/dispatch/dispatch_three_way_partition.cuh>
	#include <cub/util_deprecated.cuh>

	CUB_NAMESPACE_BEGIN


	/**
	* @brief DevicePartition provides device-wide, parallel operations for
	* partitioning sequences of data items residing within device-accessible
	* memory. ![](partition_logo.png)
	* @ingroup SingleModule
	*
	* @par Overview
	* These operations apply a selection criterion to construct a partitioned
	* output sequence from items selected/unselected from a specified input
	* sequence.
	*
	* @par Usage Considerations
	* \cdp_class{DevicePartition}
	*
	* @par Performance
	* \linear_performance{partition}
	*
	* @par
	* The following chart illustrates DevicePartition::If
	* performance across different CUDA architectures for @p int32 items,
	* where 50% of the items are randomly selected for the first partition.
	* \plots_below
	*
	* @image html partition_if_int32_50_percent.png
	*
	*/
	struct DevicePartition
	{
	/**
	* @brief Uses the @p d_flags sequence to split the corresponding items from
	* @p d_in into a partitioned sequence @p d_out. The total number of
	* items copied into the first partition is written to
	* @p d_num_selected_out. ![](partition_flags_logo.png)
	*
	* @par
	* - The value type of @p d_flags must be castable to @p bool (e.g.,
	* @p bool, @p char, @p int, etc.).
	* - Copies of the selected items are compacted into @p d_out and maintain
	* their original relative ordering, however copies of the unselected
	* items are compacted into the rear of @p d_out in reverse order.
	* - The range `[d_out, d_out + num_items)` shall not overlap
	* `[d_in, d_in + num_items)` nor `[d_flags, d_flags + num_items)` in any
	* way. The range `[d_in, d_in + num_items)` may overlap
	* `[d_flags, d_flags + num_items)`.
	* - \devicestorage
	*
	* @par Snippet
	* The code snippet below illustrates the compaction of items selected from
	* an @p int device vector.
	* @par
	* @code
	* #include <cub/cub.cuh>
	* // or equivalently <cub/device/device_partition.cuh>
	*
	* // Declare, allocate, and initialize device-accessible pointers for
	* // input, flags, and output
	* int num_items; // e.g., 8
	* int *d_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
	* char *d_flags; // e.g., [1, 0, 0, 1, 0, 1, 1, 0]
	* int *d_out; // e.g., [ , , , , , , , ]
	* int *d_num_selected_out; // e.g., [ ]
	* ...
	*
	* // Determine temporary device storage requirements
	* void *d_temp_storage = nullptr;
	* std::size_t temp_storage_bytes = 0;
	* cub::DevicePartition::Flagged(
	* d_temp_storage, temp_storage_bytes,
	* d_in, d_flags, d_out, d_num_selected_out, num_items);
	*
	* // Allocate temporary storage
	* cudaMalloc(&d_temp_storage, temp_storage_bytes);
	*
	* // Run selection
	* cub::DevicePartition::Flagged(
	* d_temp_storage, temp_storage_bytes,
	* d_in, d_flags, d_out, d_num_selected_out, num_items);
	*
	* // d_out <-- [1, 4, 6, 7, 8, 5, 3, 2]
	* // d_num_selected_out <-- [4]
	* @endcode
	*
	* @tparam InputIteratorT
	* [inferred] Random-access input iterator type for reading
	* input items \iterator
	*
	* @tparam FlagIterator
	* [inferred] Random-access input iterator type for reading
	* selection flags \iterator
	*
	* @tparam OutputIteratorT
	* [inferred] Random-access output iterator type for writing
	* output items \iterator
	*
	* @tparam NumSelectedIteratorT
	* [inferred] Output iterator type for recording the number
	* of items selected \iterator
	*
	* @param[in] d_temp_storage
	* Device-accessible allocation of temporary storage. When `nullptr`, the
	* required allocation size is written to @p temp_storage_bytes and no
	* work is done.
	*
	* @param[in,out] temp_storage_bytes
	* Reference to size in bytes of @p d_temp_storage allocation
	*
	* @param[in] d_in
	* Pointer to the input sequence of data items
	*
	* @param[in] d_flags
	* Pointer to the input sequence of selection flags
	*
	* @param[out] d_out
	* Pointer to the output sequence of partitioned data items
	*
	* @param[out] d_num_selected_out
	* Pointer to the output total number of items selected (i.e., the
	* offset of the unselected partition)
	*
	* @param[in] num_items
	* Total number of items to select from
	*
	* @param[in] stream
	* [optional] CUDA stream to launch kernels within.
	* Default is stream<sub>0</sub>.
	*/
	template <typename InputIteratorT,
	typename FlagIterator,
	typename OutputIteratorT,
	typename NumSelectedIteratorT>
	CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t
	Flagged(void *d_temp_storage,
	size_t &temp_storage_bytes,
	InputIteratorT d_in,
	FlagIterator d_flags,
	OutputIteratorT d_out,
	NumSelectedIteratorT d_num_selected_out,
	int num_items,
	cudaStream_t stream = 0)
	{
	using OffsetT = int; // Signed integer type for global offsets
	using SelectOp = NullType; // Selection op (not used)
	using EqualityOp = NullType; // Equality operator (not used)
	using DispatchSelectIfT = DispatchSelectIf<InputIteratorT,
	FlagIterator,
	OutputIteratorT,
	NumSelectedIteratorT,
	SelectOp,
	EqualityOp,
	OffsetT,
	true>;

	return DispatchSelectIfT::Dispatch(d_temp_storage,
	temp_storage_bytes,
	d_in,
	d_flags,
	d_out,
	d_num_selected_out,
	SelectOp{},
	EqualityOp{},
	num_items,
	stream);
	}

	template <typename InputIteratorT,
	typename FlagIterator,
	typename OutputIteratorT,
	typename NumSelectedIteratorT>
	CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
	CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t
	Flagged(void *d_temp_storage,
	size_t &temp_storage_bytes,
	InputIteratorT d_in,
	FlagIterator d_flags,
	OutputIteratorT d_out,
	NumSelectedIteratorT d_num_selected_out,
	int num_items,
	cudaStream_t stream,
	bool debug_synchronous)
	{
	CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

	return Flagged<InputIteratorT,
	FlagIterator,
	OutputIteratorT,
	NumSelectedIteratorT>(d_temp_storage,
	temp_storage_bytes,
	d_in,
	d_flags,
	d_out,
	d_num_selected_out,
	num_items,
	stream);
	}


	/**
	* @brief Uses the @p select_op functor to split the corresponding items
	* from @p d_in into a partitioned sequence @p d_out. The total
	* number of items copied into the first partition is written to
	* @p d_num_selected_out. ![](partition_logo.png)
	*
	* @par
	* - Copies of the selected items are compacted into @p d_out and maintain
	* their original relative ordering, however copies of the unselected
	* items are compacted into the rear of @p d_out in reverse order.
	* - The range `[d_out, d_out + num_items)` shall not overlap
	* `[d_in, d_in + num_items)` in any way.
	* - \devicestorage
	*
	* @par Performance
	* The following charts illustrate saturated partition-if performance across
	* different CUDA architectures for @p int32 and @p int64 items,
	* respectively. Items are selected for the first partition with 50%
	* probability.
	*
	* @image html partition_if_int32_50_percent.png
	* @image html partition_if_int64_50_percent.png
	*
	* @par
	* The following charts are similar, but 5% selection probability for the
	* first partition:
	*
	* @image html partition_if_int32_5_percent.png
	* @image html partition_if_int64_5_percent.png
	*
	* @par Snippet
	* The code snippet below illustrates the compaction of items selected from
	* an @p int device vector.
	* @par
	* @code
	* #include <cub/cub.cuh>
	* // or equivalently <cub/device/device_partition.cuh>
	*
	* // Functor type for selecting values less than some criteria
	* struct LessThan
	* {
	* int compare;
	*
	* CUB_RUNTIME_FUNCTION __forceinline__
	* explicit LessThan(int compare) : compare(compare) {}
	*
	* CUB_RUNTIME_FUNCTION __forceinline__
	* bool operator()(const int &a) const
	* {
	* return (a < compare);
	* }
	* };
	*
	* // Declare, allocate, and initialize device-accessible pointers for
	* // input and output
	* int num_items; // e.g., 8
	* int *d_in; // e.g., [0, 2, 3, 9, 5, 2, 81, 8]
	* int *d_out; // e.g., [ , , , , , , , ]
	* int *d_num_selected_out; // e.g., [ ]
	* LessThan select_op(7);
	* ...
	*
	* // Determine temporary device storage requirements
	* void *d_temp_storage = nullptr;
	* std::size_t temp_storage_bytes = 0;
	* cub::DevicePartition::If(
	* d_temp_storage, temp_storage_bytes,
	* d_in, d_out, d_num_selected_out, num_items, select_op);
	*
	* // Allocate temporary storage
	* cudaMalloc(&d_temp_storage, temp_storage_bytes);
	*
	* // Run selection
	* cub::DevicePartition::If(
	* d_temp_storage, temp_storage_bytes,
	* d_in, d_out, d_num_selected_out, num_items, select_op);
	*
	* // d_out <-- [0, 2, 3, 5, 2, 8, 81, 9]
	* // d_num_selected_out <-- [5]
	*
	* @endcode
	*
	* @tparam InputIteratorT
	* [inferred] Random-access input iterator type for reading input
	* items \iterator
	*
	* @tparam OutputIteratorT
	* [inferred] Random-access output iterator type for writing output
	* items \iterator
	*
	* @tparam NumSelectedIteratorT
	* [inferred] Output iterator type for recording the number of items
	* selected \iterator
	*
	* @tparam SelectOp
	* [inferred] Selection functor type having member
	* `bool operator()(const T &a)`
	*
	* @param[in] d_temp_storage
	* Device-accessible allocation of temporary storage. When `nullptr`, the
	* required allocation size is written to `temp_storage_bytes` and no
	* work is done.
	*
	* @param[in,out] temp_storage_bytes
	* Reference to size in bytes of @p d_temp_storage allocation
	*
	* @param[in] d_in
	* Pointer to the input sequence of data items
	*
	* @param[out] d_out
	* Pointer to the output sequence of partitioned data items
	*
	* @param[out] d_num_selected_out
	* Pointer to the output total number of items selected (i.e., the
	* offset of the unselected partition)
	*
	* @param[in] num_items
	* Total number of items to select from
	*
	* @param[in] select_op
	* Unary selection operator
	*
	* @param[in] stream
	* [optional] CUDA stream to launch kernels within.
	* Default is stream<sub>0</sub>.
	*/
	template <typename InputIteratorT,
	typename OutputIteratorT,
	typename NumSelectedIteratorT,
	typename SelectOp>
	CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t
	If(void *d_temp_storage,
	size_t &temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	NumSelectedIteratorT d_num_selected_out,
	int num_items,
	SelectOp select_op,
	cudaStream_t stream = 0)
	{
	using OffsetT = int; // Signed integer type for global offsets
	using FlagIterator = NullType *; // FlagT iterator type (not used)
	using EqualityOp = NullType; // Equality operator (not used)

	using DispatchSelectIfT = DispatchSelectIf<InputIteratorT,
	FlagIterator,
	OutputIteratorT,
	NumSelectedIteratorT,
	SelectOp,
	EqualityOp,
	OffsetT,
	true>;

	return DispatchSelectIfT::Dispatch(d_temp_storage,
	temp_storage_bytes,
	d_in,
	nullptr,
	d_out,
	d_num_selected_out,
	select_op,
	EqualityOp{},
	num_items,
	stream);
	}

	template <typename InputIteratorT,
	typename OutputIteratorT,
	typename NumSelectedIteratorT,
	typename SelectOp>
	CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
	CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t
	If(void *d_temp_storage,
	size_t &temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	NumSelectedIteratorT d_num_selected_out,
	int num_items,
	SelectOp select_op,
	cudaStream_t stream,
	bool debug_synchronous)
	{
	CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

	return If<InputIteratorT, OutputIteratorT, NumSelectedIteratorT, SelectOp>(
	d_temp_storage,
	temp_storage_bytes,
	d_in,
	d_out,
	d_num_selected_out,
	num_items,
	select_op,
	stream);
	}

	/**
	* @brief Uses two functors to split the corresponding items from @p d_in
	* into a three partitioned sequences @p d_first_part_out
	* @p d_second_part_out and @p d_unselected_out.
	* The total number of items copied into the first partition is written
	* to `d_num_selected_out[0]`, while the total number of items copied
	* into the second partition is written to `d_num_selected_out[1]`.
	*
	* @par
	* - Copies of the items selected by @p select_first_part_op are compacted
	* into @p d_first_part_out and maintain their original relative ordering.
	* - Copies of the items selected by @p select_second_part_op are compacted
	* into @p d_second_part_out and maintain their original relative ordering.
	* - Copies of the unselected items are compacted into the
	* @p d_unselected_out in reverse order.
	* - The ranges `[d_out, d_out + num_items)`,
	* `[d_first_part_out, d_first_part_out + d_num_selected_out[0])`,
	* `[d_second_part_out, d_second_part_out + d_num_selected_out[1])`,
	* `[d_unselected_out, d_unselected_out + num_items - d_num_selected_out[0] - d_num_selected_out[1])`,
	* shall not overlap in any way.
	*
	* @par Snippet
	* The code snippet below illustrates how this algorithm can partition an
	* input vector into small, medium, and large items so that the relative
	* order of items remain deterministic.
	*
	* Let's consider any value that doesn't exceed six a small one. On the
	* other hand, any value that exceeds 50 will be considered a large one.
	* Since the value used to define a small part doesn't match one that
	* defines the large part, the intermediate segment is implied.
	*
	* These definitions partition a value space into three categories. We want
	* to preserve the order of items in which they appear in the input vector.
	* Since the algorithm provides stable partitioning, this is possible.
	*
	* Since the number of items in each category is unknown beforehand, we need
	* three output arrays of num_items elements each. To reduce the memory
	* requirements, we can combine the output storage for two categories.
	*
	* Since each value falls precisely in one category, it's safe to add
	* "large" values into the head of the shared output vector and the "middle"
	* values into its tail. To add items into the tail of the output array, we
	* can use `thrust::reverse_iterator`.
	* @par
	* @code
	* #include <cub/cub.cuh>
	* // or equivalently <cub/device/device_partition.cuh>
	*
	* // Functor type for selecting values less than some criteria
	* struct LessThan
	* {
	* int compare;
	*
	* CUB_RUNTIME_FUNCTION __forceinline__
	* explicit LessThan(int compare) : compare(compare) {}
	*
	* CUB_RUNTIME_FUNCTION __forceinline__
	* bool operator()(const int &a) const
	* {
	* return a < compare;
	* }
	* };
	*
	* // Functor type for selecting values greater than some criteria
	* struct GreaterThan
	* {
	* int compare;
	*
	* CUB_RUNTIME_FUNCTION __forceinline__
	* explicit GreaterThan(int compare) : compare(compare) {}
	*
	* CUB_RUNTIME_FUNCTION __forceinline__
	* bool operator()(const int &a) const
	* {
	* return a > compare;
	* }
	* };
	*
	* // Declare, allocate, and initialize device-accessible pointers for
	* // input and output
	* int num_items; // e.g., 8
	* int *d_in; // e.g., [0, 2, 3, 9, 5, 2, 81, 8]
	* int *d_large_and_unselected_out; // e.g., [ , , , , , , , ]
	* int *d_small_out; // e.g., [ , , , , , , , ]
	* int *d_num_selected_out; // e.g., [ , ]
	* thrust::reverse_iterator<T> unselected_out(d_large_and_unselected_out + num_items);
	* LessThan small_items_selector(7);
	* GreaterThan large_items_selector(50);
	* ...
	*
	* // Determine temporary device storage requirements
	* void *d_temp_storage = nullptr;
	* std::size_t temp_storage_bytes = 0;
	* cub::DevicePartition::If(
	* d_temp_storage, temp_storage_bytes,
	* d_in, d_large_and_medium_out, d_small_out, unselected_out,
	* d_num_selected_out, num_items,
	* large_items_selector, small_items_selector);
	*
	* // Allocate temporary storage
	* cudaMalloc(&d_temp_storage, temp_storage_bytes);
	*
	* // Run selection
	* cub::DevicePartition::If(
	* d_temp_storage, temp_storage_bytes,
	* d_in, d_large_and_medium_out, d_small_out, unselected_out,
	* d_num_selected_out, num_items,
	* large_items_selector, small_items_selector);
	*
	* // d_large_and_unselected_out <-- [ 81, , , , , , 8, 9 ]
	* // d_small_out <-- [ 0, 2, 3, 5, 2, , , ]
	* // d_num_selected_out <-- [ 1, 5 ]
	* @endcode
	*
	* @tparam InputIteratorT
	* [inferred] Random-access input iterator type for reading
	* input items \iterator
	*
	* @tparam FirstOutputIteratorT
	* [inferred] Random-access output iterator type for writing output
	* items selected by first operator \iterator
	*
	* @tparam SecondOutputIteratorT
	* [inferred] Random-access output iterator type for writing output
	* items selected by second operator \iterator
	*
	* @tparam UnselectedOutputIteratorT
	* [inferred] Random-access output iterator type for writing
	* unselected items \iterator
	*
	* @tparam NumSelectedIteratorT
	* [inferred] Output iterator type for recording the number of items
	* selected \iterator
	*
	* @tparam SelectFirstPartOp
	* [inferred] Selection functor type having member
	* `bool operator()(const T &a)`
	*
	* @tparam SelectSecondPartOp
	* [inferred] Selection functor type having member
	* `bool operator()(const T &a)`
	*
	* @param[in] d_temp_storage
	* Device-accessible allocation of temporary storage. When `nullptr`, the
	* required allocation size is written to @p temp_storage_bytes and
	* no work is done.
	*
	* @param[in,out] temp_storage_bytes
	* Reference to size in bytes of @p d_temp_storage allocation
	*
	* @param[in] d_in
	* Pointer to the input sequence of data items
	*
	* @param[out] d_first_part_out
	* Pointer to the output sequence of data items selected by
	* @p select_first_part_op
	*
	* @param[out] d_second_part_out
	* Pointer to the output sequence of data items selected by
	* @p select_second_part_op
	*
	* @param[out] d_unselected_out
	* Pointer to the output sequence of unselected data items
	*
	* @param[out] d_num_selected_out
	* Pointer to the output array with two elements, where total number of
	* items selected by @p select_first_part_op is stored as
	* `d_num_selected_out[0]` and total number of items selected by
	* @p select_second_part_op is stored as `d_num_selected_out[1]`,
	* respectively
	*
	* @param[in] num_items
	* Total number of items to select from
	*
	* @param[in] select_first_part_op
	* Unary selection operator to select @p d_first_part_out
	*
	* @param[in] select_second_part_op
	* Unary selection operator to select @p d_second_part_out
	*
	* @param[in] stream
	* [optional] CUDA stream to launch kernels within.
	* Default is stream<sub>0</sub>.
	*/
	template <typename InputIteratorT,
	typename FirstOutputIteratorT,
	typename SecondOutputIteratorT,
	typename UnselectedOutputIteratorT,
	typename NumSelectedIteratorT,
	typename SelectFirstPartOp,
	typename SelectSecondPartOp>
	CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t
	If(void *d_temp_storage,
	std::size_t &temp_storage_bytes,
	InputIteratorT d_in,
	FirstOutputIteratorT d_first_part_out,
	SecondOutputIteratorT d_second_part_out,
	UnselectedOutputIteratorT d_unselected_out,
	NumSelectedIteratorT d_num_selected_out,
	int num_items,
	SelectFirstPartOp select_first_part_op,
	SelectSecondPartOp select_second_part_op,
	cudaStream_t stream = 0)
	{
	using OffsetT = int;
	using DispatchThreeWayPartitionIfT =
	DispatchThreeWayPartitionIf<InputIteratorT,
	FirstOutputIteratorT,
	SecondOutputIteratorT,
	UnselectedOutputIteratorT,
	NumSelectedIteratorT,
	SelectFirstPartOp,
	SelectSecondPartOp,
	OffsetT>;

	return DispatchThreeWayPartitionIfT::Dispatch(d_temp_storage,
	temp_storage_bytes,
	d_in,
	d_first_part_out,
	d_second_part_out,
	d_unselected_out,
	d_num_selected_out,
	select_first_part_op,
	select_second_part_op,
	num_items,
	stream);
	}

	template <typename InputIteratorT,
	typename FirstOutputIteratorT,
	typename SecondOutputIteratorT,
	typename UnselectedOutputIteratorT,
	typename NumSelectedIteratorT,
	typename SelectFirstPartOp,
	typename SelectSecondPartOp>
	CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
	CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t
	If(void *d_temp_storage,
	std::size_t &temp_storage_bytes,
	InputIteratorT d_in,
	FirstOutputIteratorT d_first_part_out,
	SecondOutputIteratorT d_second_part_out,
	UnselectedOutputIteratorT d_unselected_out,
	NumSelectedIteratorT d_num_selected_out,
	int num_items,
	SelectFirstPartOp select_first_part_op,
	SelectSecondPartOp select_second_part_op,
	cudaStream_t stream,
	bool debug_synchronous)
	{
	CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

	return If<InputIteratorT,
	FirstOutputIteratorT,
	SecondOutputIteratorT,
	UnselectedOutputIteratorT,
	NumSelectedIteratorT,
	SelectFirstPartOp,
	SelectSecondPartOp>(d_temp_storage,
	temp_storage_bytes,
	d_in,
	d_first_part_out,
	d_second_part_out,
	d_unselected_out,
	d_num_selected_out,
	num_items,
	select_first_part_op,
	select_second_part_op,
	stream);
	}
	};

	/**
	* @example example_device_partition_flagged.cu
	* @example example_device_partition_if.cu
	*/

	CUB_NAMESPACE_END