thrust / dependencies /cub /test /catch2_test_block_run_length_decode.cu
camenduru's picture
thanks to nvidia ❤
8ae5fc5
/******************************************************************************
* Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#include <cub/block/block_load.cuh>
#include <cub/block/block_run_length_decode.cuh>
#include <cub/block/block_store.cuh>
#include <cub/device/device_scan.cuh>
#include <cub/iterator/counting_input_iterator.cuh>
#include <cub/iterator/transform_input_iterator.cuh>
#include <cub/util_allocator.cuh>
// Has to go after all cub headers. Otherwise, this test won't catch unused
// variables in cub kernels.
#include "catch2_test_helper.h"
/******************************************************************************
* HELPER CLASS FOR RUN-LENGTH DECODING TESTS
******************************************************************************/
/**
* \brief Class template to facilitate testing the BlockRunLengthDecode algorithm for all its
* template parameter specialisations.
*
* \tparam ItemItT The item type being run-length decoded
* \tparam RunLengthsItT Iterator type providing the runs' lengths
* \tparam RUNS_PER_THREAD The number of runs that each thread is getting assigned to
* \tparam DECODED_ITEMS_PER_THREAD The number of run-length decoded items that each thread is
* decoding \tparam TEST_RELATIVE_OFFSETS_ Whether to also retrieve each decoded item's
* relative offset within its run \tparam TEST_RUN_OFFSETS_ Whether to pass in each run's
* offset instead of each run's length \tparam BLOCK_DIM_X The thread block length in
* threads along the X dimension
* \tparam BLOCK_DIM_Y The thread block length in threads along the Y dimension
* \tparam BLOCK_DIM_Z The thread block length in threads along the Z dimension
*/
template <typename ItemItT,
typename RunLengthsItT,
int RUNS_PER_THREAD,
int DECODED_ITEMS_PER_THREAD,
bool TEST_RELATIVE_OFFSETS_,
bool TEST_RUN_OFFSETS_,
int BLOCK_DIM_X,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1>
class AgentTestBlockRunLengthDecode
{
public:
constexpr static uint32_t BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z;
constexpr static uint32_t RUNS_PER_BLOCK = RUNS_PER_THREAD * BLOCK_THREADS;
constexpr static bool TEST_RELATIVE_OFFSETS = TEST_RELATIVE_OFFSETS_;
private:
using RunItemT = cub::detail::value_t<ItemItT>;
using RunLengthT = cub::detail::value_t<RunLengthsItT>;
using BlockRunOffsetScanT =
cub::BlockScan<RunLengthT, BLOCK_DIM_X, cub::BLOCK_SCAN_RAKING, BLOCK_DIM_Y, BLOCK_DIM_Z>;
using BlockRunLengthDecodeT =
cub::BlockRunLengthDecode<RunItemT, BLOCK_DIM_X, RUNS_PER_THREAD, DECODED_ITEMS_PER_THREAD>;
using BlockLoadRunItemT = cub::BlockLoad<RunItemT,
BLOCK_DIM_X,
RUNS_PER_THREAD,
cub::BLOCK_LOAD_WARP_TRANSPOSE,
BLOCK_DIM_Y,
BLOCK_DIM_Z>;
using BlockLoadRunLengthsT = cub::BlockLoad<RunLengthT,
BLOCK_DIM_X,
RUNS_PER_THREAD,
cub::BLOCK_LOAD_WARP_TRANSPOSE,
BLOCK_DIM_Y,
BLOCK_DIM_Z>;
using BlockStoreDecodedItemT = cub::BlockStore<RunItemT,
BLOCK_DIM_X,
DECODED_ITEMS_PER_THREAD,
cub::BLOCK_STORE_WARP_TRANSPOSE,
BLOCK_DIM_Y,
BLOCK_DIM_Z>;
using BlockStoreRelativeOffsetT = cub::BlockStore<RunLengthT,
BLOCK_DIM_X,
DECODED_ITEMS_PER_THREAD,
cub::BLOCK_STORE_WARP_TRANSPOSE,
BLOCK_DIM_Y,
BLOCK_DIM_Z>;
__device__ __forceinline__ BlockRunLengthDecodeT
InitBlockRunLengthDecode(RunItemT (&unique_items)[RUNS_PER_THREAD],
RunLengthT (&run_lengths)[RUNS_PER_THREAD],
RunLengthT &decoded_size,
cub::Int2Type<true> /*test_run_offsets*/)
{
RunLengthT run_offsets[RUNS_PER_THREAD];
BlockRunOffsetScanT(temp_storage.run_offsets_scan_storage)
.ExclusiveSum(run_lengths, run_offsets, decoded_size);
// Ensure temporary shared memory can be repurposed
cub::CTA_SYNC();
// Construct BlockRunLengthDecode and initialize with the run offsets
return BlockRunLengthDecodeT(temp_storage.decode.run_length_decode_storage,
unique_items,
run_offsets);
}
__device__ __forceinline__ BlockRunLengthDecodeT
InitBlockRunLengthDecode(RunItemT (&unique_items)[RUNS_PER_THREAD],
RunLengthT (&run_lengths)[RUNS_PER_THREAD],
RunLengthT &decoded_size,
cub::Int2Type<false> /*test_run_offsets*/)
{
// Construct BlockRunLengthDecode and initialize with the run lengths
return BlockRunLengthDecodeT(temp_storage.decode.run_length_decode_storage,
unique_items,
run_lengths,
decoded_size);
}
__device__ __forceinline__ void LoadRuns(ItemItT d_block_unique_items,
RunLengthsItT d_block_run_lengths,
RunItemT (&unique_items)[RUNS_PER_THREAD],
RunLengthT (&run_lengths)[RUNS_PER_THREAD],
size_t num_valid_items)
{
if (num_valid_items < RUNS_PER_BLOCK)
{
BlockLoadRunItemT(temp_storage.load_uniques_storage)
.Load(d_block_unique_items, unique_items, num_valid_items);
}
else
{
BlockLoadRunItemT(temp_storage.load_uniques_storage).Load(d_block_unique_items, unique_items);
}
// Ensure BlockLoad's temporary shared memory can be repurposed
cub::CTA_SYNC();
// Load this block's tile of run lengths
if (num_valid_items < RUNS_PER_BLOCK)
BlockLoadRunLengthsT(temp_storage.load_run_lengths_storage)
.Load(d_block_run_lengths, run_lengths, num_valid_items, static_cast<RunLengthT>(0));
else
BlockLoadRunLengthsT(temp_storage.load_run_lengths_storage)
.Load(d_block_run_lengths, run_lengths);
// Ensure temporary shared memory can be repurposed
cub::CTA_SYNC();
}
public:
union TempStorage
{
typename BlockLoadRunItemT::TempStorage load_uniques_storage;
typename BlockLoadRunLengthsT::TempStorage load_run_lengths_storage;
cub::detail::
conditional_t<TEST_RUN_OFFSETS_, typename BlockRunOffsetScanT::TempStorage, cub::NullType>
run_offsets_scan_storage;
struct
{
typename BlockRunLengthDecodeT::TempStorage run_length_decode_storage;
typename BlockStoreDecodedItemT::TempStorage store_decoded_runs_storage;
typename BlockStoreRelativeOffsetT::TempStorage store_relative_offsets;
} decode;
};
TempStorage &temp_storage;
__device__ __forceinline__ AgentTestBlockRunLengthDecode(TempStorage &temp_storage)
: temp_storage(temp_storage)
{}
/**
* \brief Loads the given block (or tile) of runs, and computes their "decompressed" (run-length
* decoded) size.
*/
__device__ __forceinline__ uint32_t GetDecodedSize(ItemItT d_block_unique_items,
RunLengthsItT d_block_run_lengths,
size_t num_valid_runs)
{
// Load this block's tile of encoded runs
RunItemT unique_items[RUNS_PER_THREAD];
RunLengthT run_lengths[RUNS_PER_THREAD];
LoadRuns(d_block_unique_items, d_block_run_lengths, unique_items, run_lengths, num_valid_runs);
// Init the BlockRunLengthDecode and get the total decoded size of this block's tile (i.e., the
// "decompressed" size)
uint32_t decoded_size = 0U;
BlockRunLengthDecodeT run_length_decode =
InitBlockRunLengthDecode(unique_items,
run_lengths,
decoded_size,
cub::Int2Type<TEST_RUN_OFFSETS_>());
return decoded_size;
}
/**
* \brief Loads the given block (or tile) of runs, run-length decodes them, and writes the results
* to \p d_block_decoded_out.
*/
template <typename UniqueItemOutItT, typename RelativeOffsetOutItT>
__device__ __forceinline__ uint32_t WriteDecodedRuns(ItemItT d_block_unique_items,
RunLengthsItT d_block_run_lengths,
UniqueItemOutItT d_block_decoded_out,
RelativeOffsetOutItT d_block_rel_out,
size_t num_valid_runs)
{
// Load this block's tile of encoded runs
RunItemT unique_items[RUNS_PER_THREAD];
RunLengthT run_lengths[RUNS_PER_THREAD];
LoadRuns(d_block_unique_items, d_block_run_lengths, unique_items, run_lengths, num_valid_runs);
// Init the BlockRunLengthDecode and get the total decoded size of this block's tile (i.e., the
// "decompressed" size)
uint32_t decoded_size = 0U;
BlockRunLengthDecodeT run_length_decode =
InitBlockRunLengthDecode(unique_items,
run_lengths,
decoded_size,
cub::Int2Type<TEST_RUN_OFFSETS_>());
// Run-length decode ("decompress") the runs into a window buffer of limited size. This is
// repeated until all runs have been decoded.
uint32_t decoded_window_offset = 0U;
while (decoded_window_offset < decoded_size)
{
RunLengthT relative_offsets[DECODED_ITEMS_PER_THREAD];
RunItemT decoded_items[DECODED_ITEMS_PER_THREAD];
// The number of decoded items that are valid within this window (aka pass) of run-length
// decoding
uint32_t num_valid_items = decoded_size - decoded_window_offset;
run_length_decode.RunLengthDecode(decoded_items, relative_offsets, decoded_window_offset);
BlockStoreDecodedItemT(temp_storage.decode.store_decoded_runs_storage)
.Store(d_block_decoded_out + decoded_window_offset, decoded_items, num_valid_items);
if (TEST_RELATIVE_OFFSETS)
{
BlockStoreRelativeOffsetT(temp_storage.decode.store_relative_offsets)
.Store(d_block_rel_out + decoded_window_offset, relative_offsets, num_valid_items);
}
decoded_window_offset += DECODED_ITEMS_PER_THREAD * BLOCK_THREADS;
}
return decoded_size;
}
};
/******************************************************************************
* [STAGE 1] RUN-LENGTH DECODING TEST KERNEL
******************************************************************************/
template <typename AgentTestBlockRunLengthDecode,
typename ItemItT,
typename RunLengthsItT,
typename OffsetT,
typename DecodedSizesOutT>
__launch_bounds__(AgentTestBlockRunLengthDecode::BLOCK_THREADS) __global__
void BlockRunLengthDecodeGetSizeKernel(const ItemItT d_unique_items,
const RunLengthsItT d_run_lengths,
const OffsetT num_runs,
DecodedSizesOutT d_decoded_sizes)
{
constexpr OffsetT RUNS_PER_BLOCK = AgentTestBlockRunLengthDecode::RUNS_PER_BLOCK;
__shared__ typename AgentTestBlockRunLengthDecode::TempStorage temp_storage;
OffsetT block_offset = blockIdx.x * RUNS_PER_BLOCK;
OffsetT num_valid_runs = (block_offset + RUNS_PER_BLOCK >= num_runs) ? (num_runs - block_offset)
: RUNS_PER_BLOCK;
AgentTestBlockRunLengthDecode run_length_decode_agent(temp_storage);
uint64_t num_decoded_items = run_length_decode_agent.GetDecodedSize(d_unique_items + block_offset,
d_run_lengths + block_offset,
num_valid_runs);
d_decoded_sizes[blockIdx.x] = num_decoded_items;
}
/******************************************************************************
* [STAGE 2] RUN-LENGTH DECODING TEST KERNEL
******************************************************************************/
template <typename AgentTestBlockRunLengthDecode,
typename ItemItT,
typename RunLengthsItT,
typename DecodedSizesOutT,
typename OffsetT,
typename DecodedItemsOutItT,
typename RelativeOffsetOutItT>
__launch_bounds__(AgentTestBlockRunLengthDecode::BLOCK_THREADS) __global__
void BlockRunLengthDecodeTestKernel(const ItemItT d_unique_items,
const RunLengthsItT d_run_lengths,
const DecodedSizesOutT d_decoded_offsets,
const OffsetT num_runs,
DecodedItemsOutItT d_decoded_items,
RelativeOffsetOutItT d_relative_offsets)
{
constexpr OffsetT RUNS_PER_BLOCK = AgentTestBlockRunLengthDecode::RUNS_PER_BLOCK;
__shared__ typename AgentTestBlockRunLengthDecode::TempStorage temp_storage;
OffsetT block_offset = blockIdx.x * RUNS_PER_BLOCK;
OffsetT num_valid_runs = (block_offset + RUNS_PER_BLOCK >= num_runs) ? (num_runs - block_offset)
: RUNS_PER_BLOCK;
AgentTestBlockRunLengthDecode run_length_decode_agent(temp_storage);
run_length_decode_agent.WriteDecodedRuns(d_unique_items + block_offset,
d_run_lengths + block_offset,
d_decoded_items + d_decoded_offsets[blockIdx.x],
d_relative_offsets + d_decoded_offsets[blockIdx.x],
num_valid_runs);
}
struct ModOp
{
using T = uint32_t;
__host__ __device__ __forceinline__ T operator()(const T &x) const { return 1 + (x % 100); }
};
template <uint32_t RUNS_PER_THREAD,
uint32_t DECODED_ITEMS_PER_THREAD,
uint32_t BLOCK_DIM_X,
uint32_t BLOCK_DIM_Y,
uint32_t BLOCK_DIM_Z,
bool TEST_RUN_OFFSETS,
bool TEST_RELATIVE_OFFSETS>
void TestAlgorithmSpecialisation()
{
constexpr uint32_t THREADS_PER_BLOCK = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z;
constexpr uint32_t RUNS_PER_BLOCK = RUNS_PER_THREAD * THREADS_PER_BLOCK;
using RunItemT = float;
using RunLengthT = uint32_t;
using ItemItT = cub::CountingInputIterator<RunItemT>;
using RunLengthsItT =
cub::TransformInputIterator<RunLengthT, ModOp, cub::CountingInputIterator<RunLengthT>>;
ItemItT d_unique_items(1000U);
RunLengthsItT d_run_lengths(cub::CountingInputIterator<RunLengthT>(0), ModOp{});
constexpr uint32_t num_runs = 10000;
constexpr uint32_t num_blocks = (num_runs + (RUNS_PER_BLOCK - 1U)) / RUNS_PER_BLOCK;
size_t temp_storage_bytes = 0ULL;
void *temp_storage = nullptr;
uint32_t *h_num_decoded_total = nullptr;
uint32_t *d_decoded_sizes = nullptr;
uint32_t *d_decoded_offsets = nullptr;
RunItemT *d_decoded_out = nullptr;
RunLengthT *d_relative_offsets = nullptr;
RunItemT *h_decoded_out = nullptr;
RunLengthT *h_relative_offsets = nullptr;
using AgentTestBlockRunLengthDecodeT = AgentTestBlockRunLengthDecode<ItemItT,
RunLengthsItT,
RUNS_PER_THREAD,
DECODED_ITEMS_PER_THREAD,
TEST_RELATIVE_OFFSETS,
TEST_RUN_OFFSETS,
THREADS_PER_BLOCK,
1,
1>;
enum : uint32_t
{
TIMER_SIZE_BEGIN = 0,
TIMER_SIZE_END,
TIMER_DECODE_BEGIN,
TIMER_DECODE_END,
NUM_TIMERS,
};
// Get temporary storage requirements for the scan (for computing offsets for the per-block
// run-length decoded items)
cub::DeviceScan::InclusiveSum(nullptr,
temp_storage_bytes,
d_decoded_sizes,
d_decoded_offsets,
num_blocks);
// Allocate device memory
CubDebugExit(cudaMalloc(&temp_storage, temp_storage_bytes));
CubDebugExit(cudaMalloc(&d_decoded_sizes, num_blocks * sizeof(*d_decoded_sizes)));
// Allocate for the exclusive sum PLUS the overall aggregate
CubDebugExit(cudaMalloc(&d_decoded_offsets, (num_blocks + 1) * sizeof(*d_decoded_offsets)));
CubDebugExit(cudaMallocHost(&h_num_decoded_total, sizeof(*h_num_decoded_total)));
// Get the per-block number of items being decoded (i-th thread block writing size to
// d_decoded_sizes[i])
BlockRunLengthDecodeGetSizeKernel<AgentTestBlockRunLengthDecodeT>
<<<num_blocks, THREADS_PER_BLOCK, 0U>>>(d_unique_items,
d_run_lengths,
num_runs,
d_decoded_sizes);
// Compute offsets for the runs decoded by each block (exclusive sum + aggregate)
CubDebugExit(cudaMemsetAsync(d_decoded_offsets, 0, sizeof(d_decoded_offsets[0])));
CubDebugExit(cub::DeviceScan::InclusiveSum(temp_storage,
temp_storage_bytes,
d_decoded_sizes,
&d_decoded_offsets[1],
num_blocks));
// Copy the total decoded size to CPU in order to allocate just the right amount of device memory
CubDebugExit(cudaMemcpy(h_num_decoded_total,
&d_decoded_offsets[num_blocks],
sizeof(*h_num_decoded_total),
cudaMemcpyDeviceToHost));
// Allocate device memory for the run-length decoded output
CubDebugExit(cudaMallocHost(&h_decoded_out, (*h_num_decoded_total) * sizeof(RunItemT)));
CubDebugExit(cudaMalloc(&d_decoded_out, (*h_num_decoded_total) * sizeof(RunItemT)));
if (TEST_RELATIVE_OFFSETS)
{
CubDebugExit(cudaMalloc(&d_relative_offsets, (*h_num_decoded_total) * sizeof(RunLengthT)));
CubDebugExit(cudaMallocHost(&h_relative_offsets, (*h_num_decoded_total) * sizeof(RunLengthT)));
}
// Perform the block-wise run-length decoding (each block taking its offset from
// d_decoded_offsets)
BlockRunLengthDecodeTestKernel<AgentTestBlockRunLengthDecodeT>
<<<num_blocks, THREADS_PER_BLOCK, 0U>>>(d_unique_items,
d_run_lengths,
d_decoded_offsets,
num_runs,
d_decoded_out,
d_relative_offsets);
// Copy back results for verification
CubDebugExit(cudaMemcpy(h_decoded_out,
d_decoded_out,
(*h_num_decoded_total) * sizeof(*h_decoded_out),
cudaMemcpyDeviceToHost));
if (TEST_RELATIVE_OFFSETS)
{
// Copy back the relative offsets
CubDebugExit(cudaMemcpy(h_relative_offsets,
d_relative_offsets,
(*h_num_decoded_total) * sizeof(*h_relative_offsets),
cudaMemcpyDeviceToHost));
}
// Generate host-side run-length decoded data for verification
std::vector<std::pair<RunItemT, RunLengthT>> host_golden;
host_golden.reserve(*h_num_decoded_total);
for (uint32_t run = 0; run < num_runs; run++)
{
for (RunLengthT i = 0; i < d_run_lengths[run]; i++)
{
host_golden.push_back({d_unique_items[run], i});
}
}
// Verify the total run-length decoded size is correct
REQUIRE(host_golden.size() == h_num_decoded_total[0]);
// Verify the run-length decoded data is correct
bool cmp_eq = true;
for (uint32_t i = 0; i < host_golden.size(); i++)
{
if (host_golden[i].first != h_decoded_out[i])
{
FAIL("Mismatch at #" << i << ": CPU item: " << host_golden[i].first
<< ", GPU: " << h_decoded_out[i] << "\n");
cmp_eq = false;
}
if (TEST_RELATIVE_OFFSETS)
{
if (host_golden[i].second != h_relative_offsets[i])
{
FAIL("Mismatch of relative offset at #" << i
<< ": CPU item: " << host_golden[i].first << ", GPU: " << h_decoded_out[i]
<< "; relative offsets: CPU: " << host_golden[i].second
<< ", GPU: " << h_relative_offsets[i] << "\n");
cmp_eq = false;
break;
}
}
}
REQUIRE(cmp_eq == true);
// Clean up memory allocations
CubDebugExit(cudaFree(temp_storage));
CubDebugExit(cudaFree(d_decoded_sizes));
CubDebugExit(cudaFree(d_decoded_offsets));
CubDebugExit(cudaFree(d_decoded_out));
CubDebugExit(cudaFreeHost(h_num_decoded_total));
CubDebugExit(cudaFreeHost(h_decoded_out));
if (TEST_RELATIVE_OFFSETS)
{
CubDebugExit(cudaFree(d_relative_offsets));
CubDebugExit(cudaFreeHost(h_relative_offsets));
}
}
constexpr bool DO_TEST_RELATIVE_OFFSETS = true;
constexpr bool DO_NOT_TEST_RELATIVE_OFFSETS = false;
constexpr bool TEST_WITH_RUN_OFFSETS = true;
constexpr bool TEST_WITH_RUN_LENGTHS = false;
template <int RunsPerThread,
int DecodedItemsPerThread,
int BlockDimX,
int BlockDimY = 1,
int BlockDimZ = 1>
struct params_t
{
static constexpr int runs_per_thread = RunsPerThread;
static constexpr int decoded_items_per_thread = DecodedItemsPerThread;
static constexpr int block_dim_x = BlockDimX;
static constexpr int block_dim_y = BlockDimY;
static constexpr int block_dim_z = BlockDimZ;
};
CUB_TEST_LIST("Block Run Length Decode works with run lengths and offsets relative to each run",
"[rld][block]",
params_t<1, 1, 64>,
params_t<1, 3, 32, 2, 3>,
params_t<1, 1, 128>,
params_t<1, 8, 128>,
params_t<3, 1, 256>,
params_t<1, 8, 256>,
params_t<8, 1, 256>,
params_t<1, 1, 256>,
params_t<2, 2, 384>)
{
using params = TestType;
TestAlgorithmSpecialisation<params::runs_per_thread,
params::decoded_items_per_thread,
params::block_dim_x,
params::block_dim_y,
params::block_dim_z,
TEST_WITH_RUN_LENGTHS,
DO_TEST_RELATIVE_OFFSETS>();
}
CUB_TEST_LIST("Block Run Length Decode works with run lengths and performs normal run-length "
"decoding",
"[rld][block]",
params_t<1, 1, 64>,
params_t<1, 3, 32, 2, 3>,
params_t<1, 1, 128>,
params_t<1, 8, 128>,
params_t<3, 1, 256>,
params_t<1, 8, 256>,
params_t<8, 1, 256>,
params_t<1, 1, 256>,
params_t<2, 2, 384>)
{
using params = TestType;
TestAlgorithmSpecialisation<params::runs_per_thread,
params::decoded_items_per_thread,
params::block_dim_x,
params::block_dim_y,
params::block_dim_z,
TEST_WITH_RUN_LENGTHS,
DO_NOT_TEST_RELATIVE_OFFSETS>();
}
CUB_TEST_LIST("Block Run Length Decode works with run offsets and generates offsets relative to "
"each run",
"[rld][block]",
params_t<1, 1, 64>,
params_t<1, 3, 32, 2, 3>,
params_t<1, 1, 128>,
params_t<1, 8, 128>,
params_t<3, 1, 256>,
params_t<1, 8, 256>,
params_t<8, 1, 256>,
params_t<1, 1, 256>,
params_t<2, 2, 384>)
{
using params = TestType;
TestAlgorithmSpecialisation<params::runs_per_thread,
params::decoded_items_per_thread,
params::block_dim_x,
params::block_dim_y,
params::block_dim_z,
TEST_WITH_RUN_OFFSETS,
DO_TEST_RELATIVE_OFFSETS>();
}
CUB_TEST_LIST("Block Run Length Decode works with run offsets and performs normal run-length "
"decoding",
"[rld][block]",
params_t<1, 1, 64>,
params_t<1, 3, 32, 2, 3>,
params_t<1, 1, 128>,
params_t<1, 8, 128>,
params_t<3, 1, 256>,
params_t<1, 8, 256>,
params_t<8, 1, 256>,
params_t<1, 1, 256>,
params_t<2, 2, 384>)
{
using params = TestType;
TestAlgorithmSpecialisation<params::runs_per_thread,
params::decoded_items_per_thread,
params::block_dim_x,
params::block_dim_y,
params::block_dim_z,
TEST_WITH_RUN_OFFSETS,
DO_NOT_TEST_RELATIVE_OFFSETS>();
}