File size: 18,949 Bytes
0dc1b04 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 | /******************************************************************************
* Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
*AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
*IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#pragma once
#include "../config.cuh"
#include "../thread/thread_search.cuh"
#include "../util_math.cuh"
#include "../util_namespace.cuh"
#include "../util_ptx.cuh"
#include "../util_type.cuh"
#include "block_scan.cuh"
#include <limits>
#include <type_traits>
CUB_NAMESPACE_BEGIN
/**
* \brief The BlockRunLengthDecode class supports decoding a run-length encoded array of items. That is, given
* the two arrays run_value[N] and run_lengths[N], run_value[i] is repeated run_lengths[i] many times in the output
* array.
* Due to the nature of the run-length decoding algorithm ("decompression"), the output size of the run-length decoded
* array is runtime-dependent and potentially without any upper bound. To address this, BlockRunLengthDecode allows
* retrieving a "window" from the run-length decoded array. The window's offset can be specified and BLOCK_THREADS *
* DECODED_ITEMS_PER_THREAD (i.e., referred to as window_size) decoded items from the specified window will be returned.
*
* \note: Trailing runs of length 0 are supported (i.e., they may only appear at the end of the run_lengths array).
* A run of length zero may not be followed by a run length that is not zero.
*
* \par
* \code
* __global__ void ExampleKernel(...)
* {
* // Specialising BlockRunLengthDecode to run-length decode items of type uint64_t
* using RunItemT = uint64_t;
* // Type large enough to index into the run-length decoded array
* using RunLengthT = uint32_t;
*
* // Specialising BlockRunLengthDecode for a 1D block of 128 threads
* constexpr int BLOCK_DIM_X = 128;
* // Specialising BlockRunLengthDecode to have each thread contribute 2 run-length encoded runs
* constexpr int RUNS_PER_THREAD = 2;
* // Specialising BlockRunLengthDecode to have each thread hold 4 run-length decoded items
* constexpr int DECODED_ITEMS_PER_THREAD = 4;
*
* // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer items each
* using BlockRunLengthDecodeT =
* cub::BlockRunLengthDecode<RunItemT, BLOCK_DIM_X, RUNS_PER_THREAD, DECODED_ITEMS_PER_THREAD>;
*
* // Allocate shared memory for BlockRunLengthDecode
* __shared__ typename BlockRunLengthDecodeT::TempStorage temp_storage;
*
* // The run-length encoded items and how often they shall be repeated in the run-length decoded output
* RunItemT run_values[RUNS_PER_THREAD];
* RunLengthT run_lengths[RUNS_PER_THREAD];
* ...
*
* // Initialize the BlockRunLengthDecode with the runs that we want to run-length decode
* uint32_t total_decoded_size = 0;
* BlockRunLengthDecodeT block_rld(temp_storage, run_values, run_lengths, total_decoded_size);
*
* // Run-length decode ("decompress") the runs into a window buffer of limited size. This is repeated until all runs
* // have been decoded.
* uint32_t decoded_window_offset = 0U;
* while (decoded_window_offset < total_decoded_size)
* {
* RunLengthT relative_offsets[DECODED_ITEMS_PER_THREAD];
* RunItemT decoded_items[DECODED_ITEMS_PER_THREAD];
*
* // The number of decoded items that are valid within this window (aka pass) of run-length decoding
* uint32_t num_valid_items = total_decoded_size - decoded_window_offset;
* block_rld.RunLengthDecode(decoded_items, relative_offsets, decoded_window_offset);
*
* decoded_window_offset += BLOCK_DIM_X * DECODED_ITEMS_PER_THREAD;
*
* ...
* }
* }
* \endcode
* \par
* Suppose the set of input \p run_values across the block of threads is
* <tt>{ [0, 1], [2, 3], [4, 5], [6, 7], ..., [254, 255] }</tt> and
* \p run_lengths is <tt>{ [1, 2], [3, 4], [5, 1], [2, 3], ..., [5, 1] }</tt>.
* The corresponding output \p decoded_items in those threads will be <tt>{ [0, 1, 1, 2], [2, 2, 3, 3], [3, 3, 4, 4],
* [4, 4, 4, 5], ..., [169, 169, 170, 171] }</tt> and \p relative_offsets will be <tt>{ [0, 0, 1, 0], [1, 2, 0, 1], [2,
* 3, 0, 1], [2, 3, 4, 0], ..., [3, 4, 0, 0] }</tt> during the first iteration of the while loop.
*
* \tparam ItemT The data type of the items being run-length decoded
* \tparam BLOCK_DIM_X The thread block length in threads along the X dimension
* \tparam RUNS_PER_THREAD The number of consecutive runs that each thread contributes
* \tparam DECODED_ITEMS_PER_THREAD The maximum number of decoded items that each thread holds
* \tparam DecodedOffsetT Type used to index into the block's decoded items (large enough to hold the sum over all the
* runs' lengths)
* \tparam BLOCK_DIM_Y The thread block length in threads along the Y dimension
* \tparam BLOCK_DIM_Z The thread block length in threads along the Z dimension
*/
template <typename ItemT,
int BLOCK_DIM_X,
int RUNS_PER_THREAD,
int DECODED_ITEMS_PER_THREAD,
typename DecodedOffsetT = uint32_t,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1>
class BlockRunLengthDecode
{
//---------------------------------------------------------------------
// CONFIGS & TYPE ALIASES
//---------------------------------------------------------------------
private:
/// The thread block size in threads
static constexpr int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z;
/// The number of runs that the block decodes (out-of-bounds items may be padded with run lengths of '0')
static constexpr int BLOCK_RUNS = BLOCK_THREADS * RUNS_PER_THREAD;
/// BlockScan used to determine the beginning of each run (i.e., prefix sum over the runs' length)
using RunOffsetScanT = BlockScan<DecodedOffsetT, BLOCK_DIM_X, BLOCK_SCAN_RAKING_MEMOIZE, BLOCK_DIM_Y, BLOCK_DIM_Z>;
/// Type used to index into the block's runs
using RunOffsetT = uint32_t;
/// Shared memory type required by this thread block
union _TempStorage
{
typename RunOffsetScanT::TempStorage offset_scan;
struct
{
ItemT run_values[BLOCK_RUNS];
DecodedOffsetT run_offsets[BLOCK_RUNS];
} runs;
}; // union TempStorage
/// Internal storage allocator (used when the user does not provide pre-allocated shared memory)
__device__ __forceinline__ _TempStorage &PrivateStorage()
{
__shared__ _TempStorage private_storage;
return private_storage;
}
/// Shared storage reference
_TempStorage &temp_storage;
/// Linear thread-id
uint32_t linear_tid;
public:
struct TempStorage : Uninitialized<_TempStorage>
{};
//---------------------------------------------------------------------
// CONSTRUCTOR
//---------------------------------------------------------------------
/**
* \brief Constructor specialised for user-provided temporary storage, initializing using the runs' lengths. The
* algorithm's temporary storage may not be repurposed between the constructor call and subsequent
* <b>RunLengthDecode</b> calls.
*/
template <typename RunLengthT, typename TotalDecodedSizeT>
__device__ __forceinline__ BlockRunLengthDecode(TempStorage &temp_storage,
ItemT (&run_values)[RUNS_PER_THREAD],
RunLengthT (&run_lengths)[RUNS_PER_THREAD],
TotalDecodedSizeT &total_decoded_size)
: temp_storage(temp_storage.Alias())
, linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{
InitWithRunLengths(run_values, run_lengths, total_decoded_size);
}
/**
* \brief Constructor specialised for user-provided temporary storage, initializing using the runs' offsets. The
* algorithm's temporary storage may not be repurposed between the constructor call and subsequent
* <b>RunLengthDecode</b> calls.
*/
template <typename UserRunOffsetT>
__device__ __forceinline__ BlockRunLengthDecode(TempStorage &temp_storage,
ItemT (&run_values)[RUNS_PER_THREAD],
UserRunOffsetT (&run_offsets)[RUNS_PER_THREAD])
: temp_storage(temp_storage.Alias())
, linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{
InitWithRunOffsets(run_values, run_offsets);
}
/**
* \brief Constructor specialised for static temporary storage, initializing using the runs' lengths.
*/
template <typename RunLengthT, typename TotalDecodedSizeT>
__device__ __forceinline__ BlockRunLengthDecode(ItemT (&run_values)[RUNS_PER_THREAD],
RunLengthT (&run_lengths)[RUNS_PER_THREAD],
TotalDecodedSizeT &total_decoded_size)
: temp_storage(PrivateStorage())
, linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{
InitWithRunLengths(run_values, run_lengths, total_decoded_size);
}
/**
* \brief Constructor specialised for static temporary storage, initializing using the runs' offsets.
*/
template <typename UserRunOffsetT>
__device__ __forceinline__ BlockRunLengthDecode(ItemT (&run_values)[RUNS_PER_THREAD],
UserRunOffsetT (&run_offsets)[RUNS_PER_THREAD])
: temp_storage(PrivateStorage())
, linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{
InitWithRunOffsets(run_values, run_offsets);
}
private:
/**
* \brief Returns the offset of the first value within \p input which compares greater than \p val. This version takes
* \p MAX_NUM_ITEMS, an upper bound of the array size, which will be used to determine the number of binary search
* iterations at compile time.
*/
template <int MAX_NUM_ITEMS,
typename InputIteratorT,
typename OffsetT,
typename T>
__device__ __forceinline__ OffsetT StaticUpperBound(InputIteratorT input, ///< [in] Input sequence
OffsetT num_items, ///< [in] Input sequence length
T val) ///< [in] Search key
{
OffsetT lower_bound = 0;
OffsetT upper_bound = num_items;
#pragma unroll
for (int i = 0; i <= Log2<MAX_NUM_ITEMS>::VALUE; i++)
{
OffsetT mid = cub::MidPoint<OffsetT>(lower_bound, upper_bound);
mid = (cub::min)(mid, num_items - 1);
if (val < input[mid])
{
upper_bound = mid;
}
else
{
lower_bound = mid + 1;
}
}
return lower_bound;
}
template <typename RunOffsetT>
__device__ __forceinline__ void InitWithRunOffsets(ItemT (&run_values)[RUNS_PER_THREAD],
RunOffsetT (&run_offsets)[RUNS_PER_THREAD])
{
// Keep the runs' items and the offsets of each run's beginning in the temporary storage
RunOffsetT thread_dst_offset = static_cast<RunOffsetT>(linear_tid) * static_cast<RunOffsetT>(RUNS_PER_THREAD);
#pragma unroll
for (int i = 0; i < RUNS_PER_THREAD; i++)
{
temp_storage.runs.run_values[thread_dst_offset] = run_values[i];
temp_storage.runs.run_offsets[thread_dst_offset] = run_offsets[i];
thread_dst_offset++;
}
// Ensure run offsets and run values have been writen to shared memory
CTA_SYNC();
}
template <typename RunLengthT, typename TotalDecodedSizeT>
__device__ __forceinline__ void InitWithRunLengths(ItemT (&run_values)[RUNS_PER_THREAD],
RunLengthT (&run_lengths)[RUNS_PER_THREAD],
TotalDecodedSizeT &total_decoded_size)
{
// Compute the offset for the beginning of each run
DecodedOffsetT run_offsets[RUNS_PER_THREAD];
#pragma unroll
for (int i = 0; i < RUNS_PER_THREAD; i++)
{
run_offsets[i] = static_cast<DecodedOffsetT>(run_lengths[i]);
}
DecodedOffsetT decoded_size_aggregate;
RunOffsetScanT(this->temp_storage.offset_scan).ExclusiveSum(run_offsets, run_offsets, decoded_size_aggregate);
total_decoded_size = static_cast<TotalDecodedSizeT>(decoded_size_aggregate);
// Ensure the prefix scan's temporary storage can be reused (may be superfluous, but depends on scan implementation)
CTA_SYNC();
InitWithRunOffsets(run_values, run_offsets);
}
public:
/**
* \brief Run-length decodes the runs previously passed via a call to Init(...) and returns the run-length decoded
* items in a blocked arrangement to \p decoded_items. If the number of run-length decoded items exceeds the
* run-length decode buffer (i.e., <b>DECODED_ITEMS_PER_THREAD * BLOCK_THREADS</b>), only the items that fit within
* the buffer are returned. Subsequent calls to <b>RunLengthDecode</b> adjusting \p from_decoded_offset can be
* used to retrieve the remaining run-length decoded items. Calling __syncthreads() between any two calls to
* <b>RunLengthDecode</b> is not required.
* \p item_offsets can be used to retrieve each run-length decoded item's relative index within its run. E.g., the
* run-length encoded array of `3, 1, 4` with the respective run lengths of `2, 1, 3` would yield the run-length
* decoded array of `3, 3, 1, 4, 4, 4` with the relative offsets of `0, 1, 0, 0, 1, 2`.
* \smemreuse
*
* \param[out] decoded_items The run-length decoded items to be returned in a blocked arrangement
* \param[out] item_offsets The run-length decoded items' relative offset within the run they belong to
* \param[in] from_decoded_offset If invoked with from_decoded_offset that is larger than total_decoded_size results
* in undefined behavior.
*/
template <typename RelativeOffsetT>
__device__ __forceinline__ void RunLengthDecode(ItemT (&decoded_items)[DECODED_ITEMS_PER_THREAD],
RelativeOffsetT (&item_offsets)[DECODED_ITEMS_PER_THREAD],
DecodedOffsetT from_decoded_offset = 0)
{
// The (global) offset of the first item decoded by this thread
DecodedOffsetT thread_decoded_offset = from_decoded_offset + linear_tid * DECODED_ITEMS_PER_THREAD;
// The run that the first decoded item of this thread belongs to
// If this thread's <thread_decoded_offset> is already beyond the total decoded size, it will be assigned to the
// last run
RunOffsetT assigned_run =
StaticUpperBound<BLOCK_RUNS>(temp_storage.runs.run_offsets, BLOCK_RUNS, thread_decoded_offset) -
static_cast<RunOffsetT>(1U);
DecodedOffsetT assigned_run_begin = temp_storage.runs.run_offsets[assigned_run];
// If this thread is getting assigned the last run, we make sure it will not fetch any other run after this
DecodedOffsetT assigned_run_end = (assigned_run == BLOCK_RUNS - 1)
? thread_decoded_offset + DECODED_ITEMS_PER_THREAD
: temp_storage.runs.run_offsets[assigned_run + 1];
ItemT val = temp_storage.runs.run_values[assigned_run];
#pragma unroll
for (DecodedOffsetT i = 0; i < DECODED_ITEMS_PER_THREAD; i++)
{
decoded_items[i] = val;
item_offsets[i] = thread_decoded_offset - assigned_run_begin;
if (thread_decoded_offset == assigned_run_end - 1)
{
// We make sure that a thread is not re-entering this conditional when being assigned to the last run already by
// extending the last run's length to all the thread's item
assigned_run++;
assigned_run_begin = temp_storage.runs.run_offsets[assigned_run];
// If this thread is getting assigned the last run, we make sure it will not fetch any other run after this
assigned_run_end = (assigned_run == BLOCK_RUNS - 1) ? thread_decoded_offset + DECODED_ITEMS_PER_THREAD
: temp_storage.runs.run_offsets[assigned_run + 1];
val = temp_storage.runs.run_values[assigned_run];
}
thread_decoded_offset++;
}
}
/**
* \brief Run-length decodes the runs previously passed via a call to Init(...) and returns the run-length decoded
* items in a blocked arrangement to \p decoded_items. If the number of run-length decoded items exceeds the
* run-length decode buffer (i.e., <b>DECODED_ITEMS_PER_THREAD * BLOCK_THREADS</b>), only the items that fit within
* the buffer are returned. Subsequent calls to <b>RunLengthDecode</b> adjusting \p from_decoded_offset can be
* used to retrieve the remaining run-length decoded items. Calling __syncthreads() between any two calls to
* <b>RunLengthDecode</b> is not required.
*
* \param[out] decoded_items The run-length decoded items to be returned in a blocked arrangement
* \param[in] from_decoded_offset If invoked with from_decoded_offset that is larger than total_decoded_size results
* in undefined behavior.
*/
__device__ __forceinline__ void RunLengthDecode(ItemT (&decoded_items)[DECODED_ITEMS_PER_THREAD],
DecodedOffsetT from_decoded_offset = 0)
{
DecodedOffsetT item_offsets[DECODED_ITEMS_PER_THREAD];
RunLengthDecode(decoded_items, item_offsets, from_decoded_offset);
}
};
CUB_NAMESPACE_END
|