/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file * cub::DeviceRle provides device-wide, parallel operations for run-length-encoding sequences of * data items residing within device-accessible memory. */ #pragma once #include #include #include #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN /****************************************************************************** * Kernel entry points *****************************************************************************/ /** * Select kernel entry point (multi-block) * * Performs functor-based selection if SelectOp functor type != NullType * Otherwise performs flag-based selection if FlagIterator's value type != NullType * Otherwise performs discontinuity selection (keep unique) * * @tparam AgentRlePolicyT * Parameterized AgentRlePolicyT tuning policy type * * @tparam InputIteratorT * Random-access input iterator type for reading input items \iterator * * @tparam OffsetsOutputIteratorT * Random-access output iterator type for writing run-offset values \iterator * * @tparam LengthsOutputIteratorT * Random-access output iterator type for writing run-length values \iterator * * @tparam NumRunsOutputIteratorT * Output iterator type for recording the number of runs encountered \iterator * * @tparam ScanTileStateT * Tile status interface type * * @tparam EqualityOpT * T equality operator type * * @tparam OffsetT * Signed integer type for global offsets * * @param d_in * Pointer to input sequence of data items * * @param d_offsets_out * Pointer to output sequence of run-offsets * * @param d_lengths_out * Pointer to output sequence of run-lengths * * @param d_num_runs_out * Pointer to total number of runs (i.e., length of `d_offsets_out`) * * @param tile_status * Tile status interface * * @param equality_op * Equality operator for input items * * @param num_items * Total number of input items (i.e., length of `d_in`) * * @param num_tiles * Total number of tiles for the entire problem */ template __launch_bounds__(int(ChainedPolicyT::ActivePolicy::RleSweepPolicyT::BLOCK_THREADS)) __global__ void DeviceRleSweepKernel(InputIteratorT d_in, OffsetsOutputIteratorT d_offsets_out, LengthsOutputIteratorT d_lengths_out, NumRunsOutputIteratorT d_num_runs_out, ScanTileStateT tile_status, EqualityOpT equality_op, OffsetT num_items, int num_tiles) { using AgentRlePolicyT = typename ChainedPolicyT::ActivePolicy::RleSweepPolicyT; // Thread block type for selecting data from input tiles using AgentRleT = AgentRle; // Shared memory for AgentRle __shared__ typename AgentRleT::TempStorage temp_storage; // Process tiles AgentRleT(temp_storage, d_in, d_offsets_out, d_lengths_out, equality_op, num_items) .ConsumeRange(num_tiles, tile_status, d_num_runs_out); } /****************************************************************************** * Dispatch ******************************************************************************/ namespace detail { template struct device_rle_policy_hub { /// SM35 struct Policy350 : ChainedPolicy<350, Policy350, Policy350> { enum { NOMINAL_4B_ITEMS_PER_THREAD = 15, ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), }; using RleSweepPolicyT = AgentRlePolicy<96, ITEMS_PER_THREAD, BLOCK_LOAD_DIRECT, LOAD_LDG, true, BLOCK_SCAN_WARP_SCANS, detail::default_reduce_by_key_delay_constructor_t>; }; using MaxPolicy = Policy350; }; } // namespace detail /** * Utility class for dispatching the appropriately-tuned kernels for DeviceRle * * @tparam InputIteratorT * Random-access input iterator type for reading input items \iterator * * @tparam OffsetsOutputIteratorT * Random-access output iterator type for writing run-offset values \iterator * * @tparam LengthsOutputIteratorT * Random-access output iterator type for writing run-length values \iterator * * @tparam NumRunsOutputIteratorT * Output iterator type for recording the number of runs encountered \iterator * * @tparam EqualityOpT * T equality operator type * * @tparam OffsetT * Signed integer type for global offsets * * @tparam SelectedPolicy * Implementation detail, do not specify directly, requirements on the * content of this type are subject to breaking change. */ template >> struct DeviceRleDispatch { /****************************************************************************** * Types and constants ******************************************************************************/ // The lengths output value type using LengthT = cub::detail::non_void_value_t; enum { INIT_KERNEL_THREADS = 128, }; // Tile status descriptor interface type using ScanTileStateT = ReduceByKeyScanTileState; void *d_temp_storage; size_t &temp_storage_bytes; InputIteratorT d_in; OffsetsOutputIteratorT d_offsets_out; LengthsOutputIteratorT d_lengths_out; NumRunsOutputIteratorT d_num_runs_out; EqualityOpT equality_op; OffsetT num_items; cudaStream_t stream; CUB_RUNTIME_FUNCTION __forceinline__ DeviceRleDispatch(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OffsetsOutputIteratorT d_offsets_out, LengthsOutputIteratorT d_lengths_out, NumRunsOutputIteratorT d_num_runs_out, EqualityOpT equality_op, OffsetT num_items, cudaStream_t stream) : d_temp_storage(d_temp_storage) , temp_storage_bytes(temp_storage_bytes) , d_in(d_in) , d_offsets_out(d_offsets_out) , d_lengths_out(d_lengths_out) , d_num_runs_out(d_num_runs_out) , equality_op(equality_op) , num_items(num_items) , stream(stream) {} /****************************************************************************** * Dispatch entrypoints ******************************************************************************/ /** * Internal dispatch routine for computing a device-wide run-length-encode using the * specified kernel functions. * * @tparam DeviceScanInitKernelPtr * Function type of cub::DeviceScanInitKernel * * @tparam DeviceRleSweepKernelPtr * Function type of cub::DeviceRleSweepKernelPtr * * @param d_temp_storage * Device-accessible allocation of temporary storage. * When NULL, the required allocation size is written to * `temp_storage_bytes` and no work is done. * * @param temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param d_in * Pointer to the input sequence of data items * * @param d_offsets_out * Pointer to the output sequence of run-offsets * * @param d_lengths_out * Pointer to the output sequence of run-lengths * * @param d_num_runs_out * Pointer to the total number of runs encountered (i.e., length of `d_offsets_out`) * * @param equality_op * Equality operator for input items * * @param num_items * Total number of input items (i.e., length of `d_in`) * * @param stream * CUDA stream to launch kernels within. Default is stream₀. * * @param ptx_version * PTX version of dispatch kernels * * @param device_scan_init_kernel * Kernel function pointer to parameterization of cub::DeviceScanInitKernel * * @param device_rle_sweep_kernel * Kernel function pointer to parameterization of cub::DeviceRleSweepKernel */ template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Invoke(DeviceScanInitKernelPtr device_scan_init_kernel, DeviceRleSweepKernelPtr device_rle_sweep_kernel) { cudaError error = cudaSuccess; const int block_threads = ActivePolicyT::RleSweepPolicyT::BLOCK_THREADS; const int items_per_thread = ActivePolicyT::RleSweepPolicyT::ITEMS_PER_THREAD; do { // Get device ordinal int device_ordinal; if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; // Number of input tiles int tile_size = block_threads * items_per_thread; int num_tiles = static_cast(cub::DivideAndRoundUp(num_items, tile_size)); // Specify temporary storage allocation requirements size_t allocation_sizes[1]; if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) { break; // bytes needed for tile status descriptors } // Compute allocation pointers into the single storage blob (or compute the necessary size of // the blob) void *allocations[1] = {}; if (CubDebug( error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) { break; } if (d_temp_storage == nullptr) { // Return if the caller is simply requesting the size of the storage allocation break; } // Construct the tile status interface ScanTileStateT tile_status; if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) { break; } // Log device_scan_init_kernel configuration int init_grid_size = CUB_MAX(1, cub::DivideAndRoundUp(num_tiles, INIT_KERNEL_THREADS)); #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking device_scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long)stream); #endif // Invoke device_scan_init_kernel to initialize tile descriptors and queue descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) .doit(device_scan_init_kernel, tile_status, num_tiles, d_num_runs_out); // Check for failure to launch if (CubDebug(error = cudaPeekAtLastError())) { break; } // Sync the stream if specified to flush runtime errors error = detail::DebugSyncStream(stream); if (CubDebug(error)) { break; } // Return if empty problem if (num_items == 0) { break; } // Get SM occupancy for device_rle_sweep_kernel int device_rle_kernel_sm_occupancy; if (CubDebug(error = MaxSmOccupancy(device_rle_kernel_sm_occupancy, // out device_rle_sweep_kernel, block_threads))) { break; } // Get max x-dimension of grid int max_dim_x; if (CubDebug( error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) { break; } // Get grid size for scanning tiles dim3 scan_grid_size; scan_grid_size.z = 1; scan_grid_size.y = cub::DivideAndRoundUp(num_tiles, max_dim_x); scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x); // Log device_rle_sweep_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking device_rle_sweep_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per " "thread, %d SM occupancy\n", scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, block_threads, (long long)stream, items_per_thread, device_rle_kernel_sm_occupancy); #endif // Invoke device_rle_sweep_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, block_threads, 0, stream) .doit(device_rle_sweep_kernel, d_in, d_offsets_out, d_lengths_out, d_num_runs_out, tile_status, equality_op, num_items, num_tiles); // Check for failure to launch if (CubDebug(error = cudaPeekAtLastError())) { break; } // Sync the stream if specified to flush runtime errors error = detail::DebugSyncStream(stream); if (CubDebug(error)) { break; } } while (0); return error; } template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Invoke() { using MaxPolicyT = typename SelectedPolicy::MaxPolicy; return Invoke(DeviceCompactInitKernel, DeviceRleSweepKernel); } /** * Internal dispatch routine * * @param d_temp_storage * Device-accessible allocation of temporary storage. * When NULL, the required allocation size is written to * `temp_storage_bytes` and no work is done. * * @param temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param d_in * Pointer to input sequence of data items * * @param d_offsets_out * Pointer to output sequence of run-offsets * * @param d_lengths_out * Pointer to output sequence of run-lengths * * @param d_num_runs_out * Pointer to total number of runs (i.e., length of `d_offsets_out`) * * @param equality_op * Equality operator for input items * * @param num_items * Total number of input items (i.e., length of `d_in`) * * @param stream * [optional] CUDA stream to launch kernels within. * Default is stream₀. */ CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OffsetsOutputIteratorT d_offsets_out, LengthsOutputIteratorT d_lengths_out, NumRunsOutputIteratorT d_num_runs_out, EqualityOpT equality_op, OffsetT num_items, cudaStream_t stream) { using MaxPolicyT = typename SelectedPolicy::MaxPolicy; cudaError error = cudaSuccess; do { // Get PTX version int ptx_version = 0; if (CubDebug(error = PtxVersion(ptx_version))) { break; } DeviceRleDispatch dispatch(d_temp_storage, temp_storage_bytes, d_in, d_offsets_out, d_lengths_out, d_num_runs_out, equality_op, num_items, stream); // Dispatch if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) { break; } } while (0); return error; } CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, OffsetsOutputIteratorT d_offsets_out, LengthsOutputIteratorT d_lengths_out, NumRunsOutputIteratorT d_num_runs_out, EqualityOpT equality_op, OffsetT num_items, cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG return Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_offsets_out, d_lengths_out, d_num_runs_out, equality_op, num_items, stream); } }; CUB_NAMESPACE_END