| /****************************************************************************** | |
| * Copyright (c) 2011, Duane Merrill. All rights reserved. | |
| * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. | |
| * | |
| * Redistribution and use in source and binary forms, with or without | |
| * modification, are permitted provided that the following conditions are met: | |
| * * Redistributions of source code must retain the above copyright | |
| * notice, this list of conditions and the following disclaimer. | |
| * * Redistributions in binary form must reproduce the above copyright | |
| * notice, this list of conditions and the following disclaimer in the | |
| * documentation and/or other materials provided with the distribution. | |
| * * Neither the name of the NVIDIA CORPORATION nor the | |
| * names of its contributors may be used to endorse or promote products | |
| * derived from this software without specific prior written permission. | |
| * | |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND | |
| * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
| * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
| * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY | |
| * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES | |
| * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |
| * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND | |
| * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | |
| * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
| * | |
| ******************************************************************************/ | |
| /** | |
| * @file | |
| * cub::DevicePartition provides device-wide, parallel operations for | |
| * partitioning sequences of data items residing within device-accessible memory. | |
| */ | |
| #pragma once | |
| #include <stdio.h> | |
| #include <iterator> | |
| #include <cub/config.cuh> | |
| #include <cub/device/dispatch/dispatch_select_if.cuh> | |
| #include <cub/device/dispatch/dispatch_three_way_partition.cuh> | |
| #include <cub/util_deprecated.cuh> | |
| CUB_NAMESPACE_BEGIN | |
| /** | |
| * @brief DevicePartition provides device-wide, parallel operations for | |
| * partitioning sequences of data items residing within device-accessible | |
| * memory.  | |
| * @ingroup SingleModule | |
| * | |
| * @par Overview | |
| * These operations apply a selection criterion to construct a partitioned | |
| * output sequence from items selected/unselected from a specified input | |
| * sequence. | |
| * | |
| * @par Usage Considerations | |
| * \cdp_class{DevicePartition} | |
| * | |
| * @par Performance | |
| * \linear_performance{partition} | |
| * | |
| * @par | |
| * The following chart illustrates DevicePartition::If | |
| * performance across different CUDA architectures for @p int32 items, | |
| * where 50% of the items are randomly selected for the first partition. | |
| * \plots_below | |
| * | |
| * @image html partition_if_int32_50_percent.png | |
| * | |
| */ | |
| struct DevicePartition | |
| { | |
| /** | |
| * @brief Uses the @p d_flags sequence to split the corresponding items from | |
| * @p d_in into a partitioned sequence @p d_out. The total number of | |
| * items copied into the first partition is written to | |
| * @p d_num_selected_out.  | |
| * | |
| * @par | |
| * - The value type of @p d_flags must be castable to @p bool (e.g., | |
| * @p bool, @p char, @p int, etc.). | |
| * - Copies of the selected items are compacted into @p d_out and maintain | |
| * their original relative ordering, however copies of the unselected | |
| * items are compacted into the rear of @p d_out in reverse order. | |
| * - The range `[d_out, d_out + num_items)` shall not overlap | |
| * `[d_in, d_in + num_items)` nor `[d_flags, d_flags + num_items)` in any | |
| * way. The range `[d_in, d_in + num_items)` may overlap | |
| * `[d_flags, d_flags + num_items)`. | |
| * - \devicestorage | |
| * | |
| * @par Snippet | |
| * The code snippet below illustrates the compaction of items selected from | |
| * an @p int device vector. | |
| * @par | |
| * @code | |
| * #include <cub/cub.cuh> | |
| * // or equivalently <cub/device/device_partition.cuh> | |
| * | |
| * // Declare, allocate, and initialize device-accessible pointers for | |
| * // input, flags, and output | |
| * int num_items; // e.g., 8 | |
| * int *d_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] | |
| * char *d_flags; // e.g., [1, 0, 0, 1, 0, 1, 1, 0] | |
| * int *d_out; // e.g., [ , , , , , , , ] | |
| * int *d_num_selected_out; // e.g., [ ] | |
| * ... | |
| * | |
| * // Determine temporary device storage requirements | |
| * void *d_temp_storage = nullptr; | |
| * std::size_t temp_storage_bytes = 0; | |
| * cub::DevicePartition::Flagged( | |
| * d_temp_storage, temp_storage_bytes, | |
| * d_in, d_flags, d_out, d_num_selected_out, num_items); | |
| * | |
| * // Allocate temporary storage | |
| * cudaMalloc(&d_temp_storage, temp_storage_bytes); | |
| * | |
| * // Run selection | |
| * cub::DevicePartition::Flagged( | |
| * d_temp_storage, temp_storage_bytes, | |
| * d_in, d_flags, d_out, d_num_selected_out, num_items); | |
| * | |
| * // d_out <-- [1, 4, 6, 7, 8, 5, 3, 2] | |
| * // d_num_selected_out <-- [4] | |
| * @endcode | |
| * | |
| * @tparam InputIteratorT | |
| * **[inferred]** Random-access input iterator type for reading | |
| * input items \iterator | |
| * | |
| * @tparam FlagIterator | |
| * **[inferred]** Random-access input iterator type for reading | |
| * selection flags \iterator | |
| * | |
| * @tparam OutputIteratorT | |
| * **[inferred]** Random-access output iterator type for writing | |
| * output items \iterator | |
| * | |
| * @tparam NumSelectedIteratorT | |
| * **[inferred]** Output iterator type for recording the number | |
| * of items selected \iterator | |
| * | |
| * @param[in] d_temp_storage | |
| * Device-accessible allocation of temporary storage. When `nullptr`, the | |
| * required allocation size is written to @p temp_storage_bytes and no | |
| * work is done. | |
| * | |
| * @param[in,out] temp_storage_bytes | |
| * Reference to size in bytes of @p d_temp_storage allocation | |
| * | |
| * @param[in] d_in | |
| * Pointer to the input sequence of data items | |
| * | |
| * @param[in] d_flags | |
| * Pointer to the input sequence of selection flags | |
| * | |
| * @param[out] d_out | |
| * Pointer to the output sequence of partitioned data items | |
| * | |
| * @param[out] d_num_selected_out | |
| * Pointer to the output total number of items selected (i.e., the | |
| * offset of the unselected partition) | |
| * | |
| * @param[in] num_items | |
| * Total number of items to select from | |
| * | |
| * @param[in] stream | |
| * **[optional]** CUDA stream to launch kernels within. | |
| * Default is stream<sub>0</sub>. | |
| */ | |
| template <typename InputIteratorT, | |
| typename FlagIterator, | |
| typename OutputIteratorT, | |
| typename NumSelectedIteratorT> | |
| CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t | |
| Flagged(void *d_temp_storage, | |
| size_t &temp_storage_bytes, | |
| InputIteratorT d_in, | |
| FlagIterator d_flags, | |
| OutputIteratorT d_out, | |
| NumSelectedIteratorT d_num_selected_out, | |
| int num_items, | |
| cudaStream_t stream = 0) | |
| { | |
| using OffsetT = int; // Signed integer type for global offsets | |
| using SelectOp = NullType; // Selection op (not used) | |
| using EqualityOp = NullType; // Equality operator (not used) | |
| using DispatchSelectIfT = DispatchSelectIf<InputIteratorT, | |
| FlagIterator, | |
| OutputIteratorT, | |
| NumSelectedIteratorT, | |
| SelectOp, | |
| EqualityOp, | |
| OffsetT, | |
| true>; | |
| return DispatchSelectIfT::Dispatch(d_temp_storage, | |
| temp_storage_bytes, | |
| d_in, | |
| d_flags, | |
| d_out, | |
| d_num_selected_out, | |
| SelectOp{}, | |
| EqualityOp{}, | |
| num_items, | |
| stream); | |
| } | |
| template <typename InputIteratorT, | |
| typename FlagIterator, | |
| typename OutputIteratorT, | |
| typename NumSelectedIteratorT> | |
| CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED | |
| CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t | |
| Flagged(void *d_temp_storage, | |
| size_t &temp_storage_bytes, | |
| InputIteratorT d_in, | |
| FlagIterator d_flags, | |
| OutputIteratorT d_out, | |
| NumSelectedIteratorT d_num_selected_out, | |
| int num_items, | |
| cudaStream_t stream, | |
| bool debug_synchronous) | |
| { | |
| CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG | |
| return Flagged<InputIteratorT, | |
| FlagIterator, | |
| OutputIteratorT, | |
| NumSelectedIteratorT>(d_temp_storage, | |
| temp_storage_bytes, | |
| d_in, | |
| d_flags, | |
| d_out, | |
| d_num_selected_out, | |
| num_items, | |
| stream); | |
| } | |
| /** | |
| * @brief Uses the @p select_op functor to split the corresponding items | |
| * from @p d_in into a partitioned sequence @p d_out. The total | |
| * number of items copied into the first partition is written to | |
| * @p d_num_selected_out.  | |
| * | |
| * @par | |
| * - Copies of the selected items are compacted into @p d_out and maintain | |
| * their original relative ordering, however copies of the unselected | |
| * items are compacted into the rear of @p d_out in reverse order. | |
| * - The range `[d_out, d_out + num_items)` shall not overlap | |
| * `[d_in, d_in + num_items)` in any way. | |
| * - \devicestorage | |
| * | |
| * @par Performance | |
| * The following charts illustrate saturated partition-if performance across | |
| * different CUDA architectures for @p int32 and @p int64 items, | |
| * respectively. Items are selected for the first partition with 50% | |
| * probability. | |
| * | |
| * @image html partition_if_int32_50_percent.png | |
| * @image html partition_if_int64_50_percent.png | |
| * | |
| * @par | |
| * The following charts are similar, but 5% selection probability for the | |
| * first partition: | |
| * | |
| * @image html partition_if_int32_5_percent.png | |
| * @image html partition_if_int64_5_percent.png | |
| * | |
| * @par Snippet | |
| * The code snippet below illustrates the compaction of items selected from | |
| * an @p int device vector. | |
| * @par | |
| * @code | |
| * #include <cub/cub.cuh> | |
| * // or equivalently <cub/device/device_partition.cuh> | |
| * | |
| * // Functor type for selecting values less than some criteria | |
| * struct LessThan | |
| * { | |
| * int compare; | |
| * | |
| * CUB_RUNTIME_FUNCTION __forceinline__ | |
| * explicit LessThan(int compare) : compare(compare) {} | |
| * | |
| * CUB_RUNTIME_FUNCTION __forceinline__ | |
| * bool operator()(const int &a) const | |
| * { | |
| * return (a < compare); | |
| * } | |
| * }; | |
| * | |
| * // Declare, allocate, and initialize device-accessible pointers for | |
| * // input and output | |
| * int num_items; // e.g., 8 | |
| * int *d_in; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] | |
| * int *d_out; // e.g., [ , , , , , , , ] | |
| * int *d_num_selected_out; // e.g., [ ] | |
| * LessThan select_op(7); | |
| * ... | |
| * | |
| * // Determine temporary device storage requirements | |
| * void *d_temp_storage = nullptr; | |
| * std::size_t temp_storage_bytes = 0; | |
| * cub::DevicePartition::If( | |
| * d_temp_storage, temp_storage_bytes, | |
| * d_in, d_out, d_num_selected_out, num_items, select_op); | |
| * | |
| * // Allocate temporary storage | |
| * cudaMalloc(&d_temp_storage, temp_storage_bytes); | |
| * | |
| * // Run selection | |
| * cub::DevicePartition::If( | |
| * d_temp_storage, temp_storage_bytes, | |
| * d_in, d_out, d_num_selected_out, num_items, select_op); | |
| * | |
| * // d_out <-- [0, 2, 3, 5, 2, 8, 81, 9] | |
| * // d_num_selected_out <-- [5] | |
| * | |
| * @endcode | |
| * | |
| * @tparam InputIteratorT | |
| * **[inferred]** Random-access input iterator type for reading input | |
| * items \iterator | |
| * | |
| * @tparam OutputIteratorT | |
| * **[inferred]** Random-access output iterator type for writing output | |
| * items \iterator | |
| * | |
| * @tparam NumSelectedIteratorT | |
| * **[inferred]** Output iterator type for recording the number of items | |
| * selected \iterator | |
| * | |
| * @tparam SelectOp | |
| * **[inferred]** Selection functor type having member | |
| * `bool operator()(const T &a)` | |
| * | |
| * @param[in] d_temp_storage | |
| * Device-accessible allocation of temporary storage. When `nullptr`, the | |
| * required allocation size is written to `temp_storage_bytes` and no | |
| * work is done. | |
| * | |
| * @param[in,out] temp_storage_bytes | |
| * Reference to size in bytes of @p d_temp_storage allocation | |
| * | |
| * @param[in] d_in | |
| * Pointer to the input sequence of data items | |
| * | |
| * @param[out] d_out | |
| * Pointer to the output sequence of partitioned data items | |
| * | |
| * @param[out] d_num_selected_out | |
| * Pointer to the output total number of items selected (i.e., the | |
| * offset of the unselected partition) | |
| * | |
| * @param[in] num_items | |
| * Total number of items to select from | |
| * | |
| * @param[in] select_op | |
| * Unary selection operator | |
| * | |
| * @param[in] stream | |
| * **[optional]** CUDA stream to launch kernels within. | |
| * Default is stream<sub>0</sub>. | |
| */ | |
| template <typename InputIteratorT, | |
| typename OutputIteratorT, | |
| typename NumSelectedIteratorT, | |
| typename SelectOp> | |
| CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t | |
| If(void *d_temp_storage, | |
| size_t &temp_storage_bytes, | |
| InputIteratorT d_in, | |
| OutputIteratorT d_out, | |
| NumSelectedIteratorT d_num_selected_out, | |
| int num_items, | |
| SelectOp select_op, | |
| cudaStream_t stream = 0) | |
| { | |
| using OffsetT = int; // Signed integer type for global offsets | |
| using FlagIterator = NullType *; // FlagT iterator type (not used) | |
| using EqualityOp = NullType; // Equality operator (not used) | |
| using DispatchSelectIfT = DispatchSelectIf<InputIteratorT, | |
| FlagIterator, | |
| OutputIteratorT, | |
| NumSelectedIteratorT, | |
| SelectOp, | |
| EqualityOp, | |
| OffsetT, | |
| true>; | |
| return DispatchSelectIfT::Dispatch(d_temp_storage, | |
| temp_storage_bytes, | |
| d_in, | |
| nullptr, | |
| d_out, | |
| d_num_selected_out, | |
| select_op, | |
| EqualityOp{}, | |
| num_items, | |
| stream); | |
| } | |
| template <typename InputIteratorT, | |
| typename OutputIteratorT, | |
| typename NumSelectedIteratorT, | |
| typename SelectOp> | |
| CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED | |
| CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t | |
| If(void *d_temp_storage, | |
| size_t &temp_storage_bytes, | |
| InputIteratorT d_in, | |
| OutputIteratorT d_out, | |
| NumSelectedIteratorT d_num_selected_out, | |
| int num_items, | |
| SelectOp select_op, | |
| cudaStream_t stream, | |
| bool debug_synchronous) | |
| { | |
| CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG | |
| return If<InputIteratorT, OutputIteratorT, NumSelectedIteratorT, SelectOp>( | |
| d_temp_storage, | |
| temp_storage_bytes, | |
| d_in, | |
| d_out, | |
| d_num_selected_out, | |
| num_items, | |
| select_op, | |
| stream); | |
| } | |
| /** | |
| * @brief Uses two functors to split the corresponding items from @p d_in | |
| * into a three partitioned sequences @p d_first_part_out | |
| * @p d_second_part_out and @p d_unselected_out. | |
| * The total number of items copied into the first partition is written | |
| * to `d_num_selected_out[0]`, while the total number of items copied | |
| * into the second partition is written to `d_num_selected_out[1]`. | |
| * | |
| * @par | |
| * - Copies of the items selected by @p select_first_part_op are compacted | |
| * into @p d_first_part_out and maintain their original relative ordering. | |
| * - Copies of the items selected by @p select_second_part_op are compacted | |
| * into @p d_second_part_out and maintain their original relative ordering. | |
| * - Copies of the unselected items are compacted into the | |
| * @p d_unselected_out in reverse order. | |
| * - The ranges `[d_out, d_out + num_items)`, | |
| * `[d_first_part_out, d_first_part_out + d_num_selected_out[0])`, | |
| * `[d_second_part_out, d_second_part_out + d_num_selected_out[1])`, | |
| * `[d_unselected_out, d_unselected_out + num_items - d_num_selected_out[0] - d_num_selected_out[1])`, | |
| * shall not overlap in any way. | |
| * | |
| * @par Snippet | |
| * The code snippet below illustrates how this algorithm can partition an | |
| * input vector into small, medium, and large items so that the relative | |
| * order of items remain deterministic. | |
| * | |
| * Let's consider any value that doesn't exceed six a small one. On the | |
| * other hand, any value that exceeds 50 will be considered a large one. | |
| * Since the value used to define a small part doesn't match one that | |
| * defines the large part, the intermediate segment is implied. | |
| * | |
| * These definitions partition a value space into three categories. We want | |
| * to preserve the order of items in which they appear in the input vector. | |
| * Since the algorithm provides stable partitioning, this is possible. | |
| * | |
| * Since the number of items in each category is unknown beforehand, we need | |
| * three output arrays of num_items elements each. To reduce the memory | |
| * requirements, we can combine the output storage for two categories. | |
| * | |
| * Since each value falls precisely in one category, it's safe to add | |
| * "large" values into the head of the shared output vector and the "middle" | |
| * values into its tail. To add items into the tail of the output array, we | |
| * can use `thrust::reverse_iterator`. | |
| * @par | |
| * @code | |
| * #include <cub/cub.cuh> | |
| * // or equivalently <cub/device/device_partition.cuh> | |
| * | |
| * // Functor type for selecting values less than some criteria | |
| * struct LessThan | |
| * { | |
| * int compare; | |
| * | |
| * CUB_RUNTIME_FUNCTION __forceinline__ | |
| * explicit LessThan(int compare) : compare(compare) {} | |
| * | |
| * CUB_RUNTIME_FUNCTION __forceinline__ | |
| * bool operator()(const int &a) const | |
| * { | |
| * return a < compare; | |
| * } | |
| * }; | |
| * | |
| * // Functor type for selecting values greater than some criteria | |
| * struct GreaterThan | |
| * { | |
| * int compare; | |
| * | |
| * CUB_RUNTIME_FUNCTION __forceinline__ | |
| * explicit GreaterThan(int compare) : compare(compare) {} | |
| * | |
| * CUB_RUNTIME_FUNCTION __forceinline__ | |
| * bool operator()(const int &a) const | |
| * { | |
| * return a > compare; | |
| * } | |
| * }; | |
| * | |
| * // Declare, allocate, and initialize device-accessible pointers for | |
| * // input and output | |
| * int num_items; // e.g., 8 | |
| * int *d_in; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] | |
| * int *d_large_and_unselected_out; // e.g., [ , , , , , , , ] | |
| * int *d_small_out; // e.g., [ , , , , , , , ] | |
| * int *d_num_selected_out; // e.g., [ , ] | |
| * thrust::reverse_iterator<T> unselected_out(d_large_and_unselected_out + num_items); | |
| * LessThan small_items_selector(7); | |
| * GreaterThan large_items_selector(50); | |
| * ... | |
| * | |
| * // Determine temporary device storage requirements | |
| * void *d_temp_storage = nullptr; | |
| * std::size_t temp_storage_bytes = 0; | |
| * cub::DevicePartition::If( | |
| * d_temp_storage, temp_storage_bytes, | |
| * d_in, d_large_and_medium_out, d_small_out, unselected_out, | |
| * d_num_selected_out, num_items, | |
| * large_items_selector, small_items_selector); | |
| * | |
| * // Allocate temporary storage | |
| * cudaMalloc(&d_temp_storage, temp_storage_bytes); | |
| * | |
| * // Run selection | |
| * cub::DevicePartition::If( | |
| * d_temp_storage, temp_storage_bytes, | |
| * d_in, d_large_and_medium_out, d_small_out, unselected_out, | |
| * d_num_selected_out, num_items, | |
| * large_items_selector, small_items_selector); | |
| * | |
| * // d_large_and_unselected_out <-- [ 81, , , , , , 8, 9 ] | |
| * // d_small_out <-- [ 0, 2, 3, 5, 2, , , ] | |
| * // d_num_selected_out <-- [ 1, 5 ] | |
| * @endcode | |
| * | |
| * @tparam InputIteratorT | |
| * **[inferred]** Random-access input iterator type for reading | |
| * input items \iterator | |
| * | |
| * @tparam FirstOutputIteratorT | |
| * **[inferred]** Random-access output iterator type for writing output | |
| * items selected by first operator \iterator | |
| * | |
| * @tparam SecondOutputIteratorT | |
| * **[inferred]** Random-access output iterator type for writing output | |
| * items selected by second operator \iterator | |
| * | |
| * @tparam UnselectedOutputIteratorT | |
| * **[inferred]** Random-access output iterator type for writing | |
| * unselected items \iterator | |
| * | |
| * @tparam NumSelectedIteratorT | |
| * **[inferred]** Output iterator type for recording the number of items | |
| * selected \iterator | |
| * | |
| * @tparam SelectFirstPartOp | |
| * **[inferred]** Selection functor type having member | |
| * `bool operator()(const T &a)` | |
| * | |
| * @tparam SelectSecondPartOp | |
| * **[inferred]** Selection functor type having member | |
| * `bool operator()(const T &a)` | |
| * | |
| * @param[in] d_temp_storage | |
| * Device-accessible allocation of temporary storage. When `nullptr`, the | |
| * required allocation size is written to @p temp_storage_bytes and | |
| * no work is done. | |
| * | |
| * @param[in,out] temp_storage_bytes | |
| * Reference to size in bytes of @p d_temp_storage allocation | |
| * | |
| * @param[in] d_in | |
| * Pointer to the input sequence of data items | |
| * | |
| * @param[out] d_first_part_out | |
| * Pointer to the output sequence of data items selected by | |
| * @p select_first_part_op | |
| * | |
| * @param[out] d_second_part_out | |
| * Pointer to the output sequence of data items selected by | |
| * @p select_second_part_op | |
| * | |
| * @param[out] d_unselected_out | |
| * Pointer to the output sequence of unselected data items | |
| * | |
| * @param[out] d_num_selected_out | |
| * Pointer to the output array with two elements, where total number of | |
| * items selected by @p select_first_part_op is stored as | |
| * `d_num_selected_out[0]` and total number of items selected by | |
| * @p select_second_part_op is stored as `d_num_selected_out[1]`, | |
| * respectively | |
| * | |
| * @param[in] num_items | |
| * Total number of items to select from | |
| * | |
| * @param[in] select_first_part_op | |
| * Unary selection operator to select @p d_first_part_out | |
| * | |
| * @param[in] select_second_part_op | |
| * Unary selection operator to select @p d_second_part_out | |
| * | |
| * @param[in] stream | |
| * **[optional]** CUDA stream to launch kernels within. | |
| * Default is stream<sub>0</sub>. | |
| */ | |
| template <typename InputIteratorT, | |
| typename FirstOutputIteratorT, | |
| typename SecondOutputIteratorT, | |
| typename UnselectedOutputIteratorT, | |
| typename NumSelectedIteratorT, | |
| typename SelectFirstPartOp, | |
| typename SelectSecondPartOp> | |
| CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t | |
| If(void *d_temp_storage, | |
| std::size_t &temp_storage_bytes, | |
| InputIteratorT d_in, | |
| FirstOutputIteratorT d_first_part_out, | |
| SecondOutputIteratorT d_second_part_out, | |
| UnselectedOutputIteratorT d_unselected_out, | |
| NumSelectedIteratorT d_num_selected_out, | |
| int num_items, | |
| SelectFirstPartOp select_first_part_op, | |
| SelectSecondPartOp select_second_part_op, | |
| cudaStream_t stream = 0) | |
| { | |
| using OffsetT = int; | |
| using DispatchThreeWayPartitionIfT = | |
| DispatchThreeWayPartitionIf<InputIteratorT, | |
| FirstOutputIteratorT, | |
| SecondOutputIteratorT, | |
| UnselectedOutputIteratorT, | |
| NumSelectedIteratorT, | |
| SelectFirstPartOp, | |
| SelectSecondPartOp, | |
| OffsetT>; | |
| return DispatchThreeWayPartitionIfT::Dispatch(d_temp_storage, | |
| temp_storage_bytes, | |
| d_in, | |
| d_first_part_out, | |
| d_second_part_out, | |
| d_unselected_out, | |
| d_num_selected_out, | |
| select_first_part_op, | |
| select_second_part_op, | |
| num_items, | |
| stream); | |
| } | |
| template <typename InputIteratorT, | |
| typename FirstOutputIteratorT, | |
| typename SecondOutputIteratorT, | |
| typename UnselectedOutputIteratorT, | |
| typename NumSelectedIteratorT, | |
| typename SelectFirstPartOp, | |
| typename SelectSecondPartOp> | |
| CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED | |
| CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t | |
| If(void *d_temp_storage, | |
| std::size_t &temp_storage_bytes, | |
| InputIteratorT d_in, | |
| FirstOutputIteratorT d_first_part_out, | |
| SecondOutputIteratorT d_second_part_out, | |
| UnselectedOutputIteratorT d_unselected_out, | |
| NumSelectedIteratorT d_num_selected_out, | |
| int num_items, | |
| SelectFirstPartOp select_first_part_op, | |
| SelectSecondPartOp select_second_part_op, | |
| cudaStream_t stream, | |
| bool debug_synchronous) | |
| { | |
| CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG | |
| return If<InputIteratorT, | |
| FirstOutputIteratorT, | |
| SecondOutputIteratorT, | |
| UnselectedOutputIteratorT, | |
| NumSelectedIteratorT, | |
| SelectFirstPartOp, | |
| SelectSecondPartOp>(d_temp_storage, | |
| temp_storage_bytes, | |
| d_in, | |
| d_first_part_out, | |
| d_second_part_out, | |
| d_unselected_out, | |
| d_num_selected_out, | |
| num_items, | |
| select_first_part_op, | |
| select_second_part_op, | |
| stream); | |
| } | |
| }; | |
| /** | |
| * @example example_device_partition_flagged.cu | |
| * @example example_device_partition_if.cu | |
| */ | |
| CUB_NAMESPACE_END | |