thrust / dependencies /cub /test /catch2_test_block_run_length_decode.cu

thanks to nvidia ❤

8ae5fc5 over 2 years ago

28.1 kB

	/******************************************************************************
	* Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions are met:
	* * Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* * Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	* * Neither the name of the NVIDIA CORPORATION nor the
	* names of its contributors may be used to endorse or promote products
	* derived from this software without specific prior written permission.
	*
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
	* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
	* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
	* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
	* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
	* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*
	******************************************************************************/

	#include <cub/block/block_load.cuh>
	#include <cub/block/block_run_length_decode.cuh>
	#include <cub/block/block_store.cuh>
	#include <cub/device/device_scan.cuh>
	#include <cub/iterator/counting_input_iterator.cuh>
	#include <cub/iterator/transform_input_iterator.cuh>
	#include <cub/util_allocator.cuh>

	// Has to go after all cub headers. Otherwise, this test won't catch unused
	// variables in cub kernels.
	#include "catch2_test_helper.h"

	/******************************************************************************
	* HELPER CLASS FOR RUN-LENGTH DECODING TESTS
	******************************************************************************/

	/**
	* \brief Class template to facilitate testing the BlockRunLengthDecode algorithm for all its
	* template parameter specialisations.
	*
	* \tparam ItemItT The item type being run-length decoded
	* \tparam RunLengthsItT Iterator type providing the runs' lengths
	* \tparam RUNS_PER_THREAD The number of runs that each thread is getting assigned to
	* \tparam DECODED_ITEMS_PER_THREAD The number of run-length decoded items that each thread is
	* decoding \tparam TEST_RELATIVE_OFFSETS_ Whether to also retrieve each decoded item's
	* relative offset within its run \tparam TEST_RUN_OFFSETS_ Whether to pass in each run's
	* offset instead of each run's length \tparam BLOCK_DIM_X The thread block length in
	* threads along the X dimension
	* \tparam BLOCK_DIM_Y The thread block length in threads along the Y dimension
	* \tparam BLOCK_DIM_Z The thread block length in threads along the Z dimension
	*/
	template <typename ItemItT,
	typename RunLengthsItT,
	int RUNS_PER_THREAD,
	int DECODED_ITEMS_PER_THREAD,
	bool TEST_RELATIVE_OFFSETS_,
	bool TEST_RUN_OFFSETS_,
	int BLOCK_DIM_X,
	int BLOCK_DIM_Y = 1,
	int BLOCK_DIM_Z = 1>
	class AgentTestBlockRunLengthDecode
	{
	public:
	constexpr static uint32_t BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z;
	constexpr static uint32_t RUNS_PER_BLOCK = RUNS_PER_THREAD * BLOCK_THREADS;
	constexpr static bool TEST_RELATIVE_OFFSETS = TEST_RELATIVE_OFFSETS_;

	private:
	using RunItemT = cub::detail::value_t<ItemItT>;
	using RunLengthT = cub::detail::value_t<RunLengthsItT>;

	using BlockRunOffsetScanT =
	cub::BlockScan<RunLengthT, BLOCK_DIM_X, cub::BLOCK_SCAN_RAKING, BLOCK_DIM_Y, BLOCK_DIM_Z>;

	using BlockRunLengthDecodeT =
	cub::BlockRunLengthDecode<RunItemT, BLOCK_DIM_X, RUNS_PER_THREAD, DECODED_ITEMS_PER_THREAD>;

	using BlockLoadRunItemT = cub::BlockLoad<RunItemT,
	BLOCK_DIM_X,
	RUNS_PER_THREAD,
	cub::BLOCK_LOAD_WARP_TRANSPOSE,
	BLOCK_DIM_Y,
	BLOCK_DIM_Z>;

	using BlockLoadRunLengthsT = cub::BlockLoad<RunLengthT,
	BLOCK_DIM_X,
	RUNS_PER_THREAD,
	cub::BLOCK_LOAD_WARP_TRANSPOSE,
	BLOCK_DIM_Y,
	BLOCK_DIM_Z>;

	using BlockStoreDecodedItemT = cub::BlockStore<RunItemT,
	BLOCK_DIM_X,
	DECODED_ITEMS_PER_THREAD,
	cub::BLOCK_STORE_WARP_TRANSPOSE,
	BLOCK_DIM_Y,
	BLOCK_DIM_Z>;

	using BlockStoreRelativeOffsetT = cub::BlockStore<RunLengthT,
	BLOCK_DIM_X,
	DECODED_ITEMS_PER_THREAD,
	cub::BLOCK_STORE_WARP_TRANSPOSE,
	BLOCK_DIM_Y,
	BLOCK_DIM_Z>;

	__device__ __forceinline__ BlockRunLengthDecodeT
	InitBlockRunLengthDecode(RunItemT (&unique_items)[RUNS_PER_THREAD],
	RunLengthT (&run_lengths)[RUNS_PER_THREAD],
	RunLengthT &decoded_size,
	cub::Int2Type<true> /test_run_offsets/)
	{
	RunLengthT run_offsets[RUNS_PER_THREAD];
	BlockRunOffsetScanT(temp_storage.run_offsets_scan_storage)
	.ExclusiveSum(run_lengths, run_offsets, decoded_size);

	// Ensure temporary shared memory can be repurposed
	cub::CTA_SYNC();

	// Construct BlockRunLengthDecode and initialize with the run offsets
	return BlockRunLengthDecodeT(temp_storage.decode.run_length_decode_storage,
	unique_items,
	run_offsets);
	}

	__device__ __forceinline__ BlockRunLengthDecodeT
	InitBlockRunLengthDecode(RunItemT (&unique_items)[RUNS_PER_THREAD],
	RunLengthT (&run_lengths)[RUNS_PER_THREAD],
	RunLengthT &decoded_size,
	cub::Int2Type<false> /test_run_offsets/)
	{
	// Construct BlockRunLengthDecode and initialize with the run lengths
	return BlockRunLengthDecodeT(temp_storage.decode.run_length_decode_storage,
	unique_items,
	run_lengths,
	decoded_size);
	}

	__device__ __forceinline__ void LoadRuns(ItemItT d_block_unique_items,
	RunLengthsItT d_block_run_lengths,
	RunItemT (&unique_items)[RUNS_PER_THREAD],
	RunLengthT (&run_lengths)[RUNS_PER_THREAD],
	size_t num_valid_items)
	{
	if (num_valid_items < RUNS_PER_BLOCK)
	{
	BlockLoadRunItemT(temp_storage.load_uniques_storage)
	.Load(d_block_unique_items, unique_items, num_valid_items);
	}
	else
	{
	BlockLoadRunItemT(temp_storage.load_uniques_storage).Load(d_block_unique_items, unique_items);
	}

	// Ensure BlockLoad's temporary shared memory can be repurposed
	cub::CTA_SYNC();

	// Load this block's tile of run lengths
	if (num_valid_items < RUNS_PER_BLOCK)
	BlockLoadRunLengthsT(temp_storage.load_run_lengths_storage)
	.Load(d_block_run_lengths, run_lengths, num_valid_items, static_cast<RunLengthT>(0));
	else
	BlockLoadRunLengthsT(temp_storage.load_run_lengths_storage)
	.Load(d_block_run_lengths, run_lengths);

	// Ensure temporary shared memory can be repurposed
	cub::CTA_SYNC();
	}

	public:
	union TempStorage
	{
	typename BlockLoadRunItemT::TempStorage load_uniques_storage;
	typename BlockLoadRunLengthsT::TempStorage load_run_lengths_storage;
	cub::detail::
	conditional_t<TEST_RUN_OFFSETS_, typename BlockRunOffsetScanT::TempStorage, cub::NullType>
	run_offsets_scan_storage;
	struct
	{
	typename BlockRunLengthDecodeT::TempStorage run_length_decode_storage;
	typename BlockStoreDecodedItemT::TempStorage store_decoded_runs_storage;
	typename BlockStoreRelativeOffsetT::TempStorage store_relative_offsets;
	} decode;
	};

	TempStorage &temp_storage;

	__device__ __forceinline__ AgentTestBlockRunLengthDecode(TempStorage &temp_storage)
	: temp_storage(temp_storage)
	{}

	/**
	* \brief Loads the given block (or tile) of runs, and computes their "decompressed" (run-length
	* decoded) size.
	*/
	__device__ __forceinline__ uint32_t GetDecodedSize(ItemItT d_block_unique_items,
	RunLengthsItT d_block_run_lengths,
	size_t num_valid_runs)
	{
	// Load this block's tile of encoded runs
	RunItemT unique_items[RUNS_PER_THREAD];
	RunLengthT run_lengths[RUNS_PER_THREAD];
	LoadRuns(d_block_unique_items, d_block_run_lengths, unique_items, run_lengths, num_valid_runs);

	// Init the BlockRunLengthDecode and get the total decoded size of this block's tile (i.e., the
	// "decompressed" size)
	uint32_t decoded_size = 0U;
	BlockRunLengthDecodeT run_length_decode =
	InitBlockRunLengthDecode(unique_items,
	run_lengths,
	decoded_size,
	cub::Int2Type<TEST_RUN_OFFSETS_>());
	return decoded_size;
	}

	/**
	* \brief Loads the given block (or tile) of runs, run-length decodes them, and writes the results
	* to \p d_block_decoded_out.
	*/
	template <typename UniqueItemOutItT, typename RelativeOffsetOutItT>
	__device__ __forceinline__ uint32_t WriteDecodedRuns(ItemItT d_block_unique_items,
	RunLengthsItT d_block_run_lengths,
	UniqueItemOutItT d_block_decoded_out,
	RelativeOffsetOutItT d_block_rel_out,
	size_t num_valid_runs)
	{
	// Load this block's tile of encoded runs
	RunItemT unique_items[RUNS_PER_THREAD];
	RunLengthT run_lengths[RUNS_PER_THREAD];
	LoadRuns(d_block_unique_items, d_block_run_lengths, unique_items, run_lengths, num_valid_runs);

	// Init the BlockRunLengthDecode and get the total decoded size of this block's tile (i.e., the
	// "decompressed" size)
	uint32_t decoded_size = 0U;
	BlockRunLengthDecodeT run_length_decode =
	InitBlockRunLengthDecode(unique_items,
	run_lengths,
	decoded_size,
	cub::Int2Type<TEST_RUN_OFFSETS_>());

	// Run-length decode ("decompress") the runs into a window buffer of limited size. This is
	// repeated until all runs have been decoded.
	uint32_t decoded_window_offset = 0U;
	while (decoded_window_offset < decoded_size)
	{
	RunLengthT relative_offsets[DECODED_ITEMS_PER_THREAD];
	RunItemT decoded_items[DECODED_ITEMS_PER_THREAD];

	// The number of decoded items that are valid within this window (aka pass) of run-length
	// decoding
	uint32_t num_valid_items = decoded_size - decoded_window_offset;
	run_length_decode.RunLengthDecode(decoded_items, relative_offsets, decoded_window_offset);
	BlockStoreDecodedItemT(temp_storage.decode.store_decoded_runs_storage)
	.Store(d_block_decoded_out + decoded_window_offset, decoded_items, num_valid_items);

	if (TEST_RELATIVE_OFFSETS)
	{
	BlockStoreRelativeOffsetT(temp_storage.decode.store_relative_offsets)
	.Store(d_block_rel_out + decoded_window_offset, relative_offsets, num_valid_items);
	}

	decoded_window_offset += DECODED_ITEMS_PER_THREAD * BLOCK_THREADS;
	}
	return decoded_size;
	}
	};

	/******************************************************************************
	* [STAGE 1] RUN-LENGTH DECODING TEST KERNEL
	******************************************************************************/
	template <typename AgentTestBlockRunLengthDecode,
	typename ItemItT,
	typename RunLengthsItT,
	typename OffsetT,
	typename DecodedSizesOutT>
	__launch_bounds__(AgentTestBlockRunLengthDecode::BLOCK_THREADS) __global__
	void BlockRunLengthDecodeGetSizeKernel(const ItemItT d_unique_items,
	const RunLengthsItT d_run_lengths,
	const OffsetT num_runs,
	DecodedSizesOutT d_decoded_sizes)
	{
	constexpr OffsetT RUNS_PER_BLOCK = AgentTestBlockRunLengthDecode::RUNS_PER_BLOCK;

	__shared__ typename AgentTestBlockRunLengthDecode::TempStorage temp_storage;

	OffsetT block_offset = blockIdx.x * RUNS_PER_BLOCK;
	OffsetT num_valid_runs = (block_offset + RUNS_PER_BLOCK >= num_runs) ? (num_runs - block_offset)
	: RUNS_PER_BLOCK;

	AgentTestBlockRunLengthDecode run_length_decode_agent(temp_storage);
	uint64_t num_decoded_items = run_length_decode_agent.GetDecodedSize(d_unique_items + block_offset,
	d_run_lengths + block_offset,
	num_valid_runs);

	d_decoded_sizes[blockIdx.x] = num_decoded_items;
	}

	/******************************************************************************
	* [STAGE 2] RUN-LENGTH DECODING TEST KERNEL
	******************************************************************************/
	template <typename AgentTestBlockRunLengthDecode,
	typename ItemItT,
	typename RunLengthsItT,
	typename DecodedSizesOutT,
	typename OffsetT,
	typename DecodedItemsOutItT,
	typename RelativeOffsetOutItT>
	__launch_bounds__(AgentTestBlockRunLengthDecode::BLOCK_THREADS) __global__
	void BlockRunLengthDecodeTestKernel(const ItemItT d_unique_items,
	const RunLengthsItT d_run_lengths,
	const DecodedSizesOutT d_decoded_offsets,
	const OffsetT num_runs,
	DecodedItemsOutItT d_decoded_items,
	RelativeOffsetOutItT d_relative_offsets)

	{
	constexpr OffsetT RUNS_PER_BLOCK = AgentTestBlockRunLengthDecode::RUNS_PER_BLOCK;

	__shared__ typename AgentTestBlockRunLengthDecode::TempStorage temp_storage;

	OffsetT block_offset = blockIdx.x * RUNS_PER_BLOCK;
	OffsetT num_valid_runs = (block_offset + RUNS_PER_BLOCK >= num_runs) ? (num_runs - block_offset)
	: RUNS_PER_BLOCK;

	AgentTestBlockRunLengthDecode run_length_decode_agent(temp_storage);
	run_length_decode_agent.WriteDecodedRuns(d_unique_items + block_offset,
	d_run_lengths + block_offset,
	d_decoded_items + d_decoded_offsets[blockIdx.x],
	d_relative_offsets + d_decoded_offsets[blockIdx.x],
	num_valid_runs);
	}

	struct ModOp
	{
	using T = uint32_t;
	__host__ __device__ __forceinline__ T operator()(const T &x) const { return 1 + (x % 100); }
	};

	template <uint32_t RUNS_PER_THREAD,
	uint32_t DECODED_ITEMS_PER_THREAD,
	uint32_t BLOCK_DIM_X,
	uint32_t BLOCK_DIM_Y,
	uint32_t BLOCK_DIM_Z,
	bool TEST_RUN_OFFSETS,
	bool TEST_RELATIVE_OFFSETS>
	void TestAlgorithmSpecialisation()
	{
	constexpr uint32_t THREADS_PER_BLOCK = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z;
	constexpr uint32_t RUNS_PER_BLOCK = RUNS_PER_THREAD * THREADS_PER_BLOCK;

	using RunItemT = float;
	using RunLengthT = uint32_t;
	using ItemItT = cub::CountingInputIterator<RunItemT>;
	using RunLengthsItT =
	cub::TransformInputIterator<RunLengthT, ModOp, cub::CountingInputIterator<RunLengthT>>;

	ItemItT d_unique_items(1000U);
	RunLengthsItT d_run_lengths(cub::CountingInputIterator<RunLengthT>(0), ModOp{});

	constexpr uint32_t num_runs = 10000;
	constexpr uint32_t num_blocks = (num_runs + (RUNS_PER_BLOCK - 1U)) / RUNS_PER_BLOCK;

	size_t temp_storage_bytes = 0ULL;
	void *temp_storage = nullptr;
	uint32_t *h_num_decoded_total = nullptr;
	uint32_t *d_decoded_sizes = nullptr;
	uint32_t *d_decoded_offsets = nullptr;
	RunItemT *d_decoded_out = nullptr;
	RunLengthT *d_relative_offsets = nullptr;
	RunItemT *h_decoded_out = nullptr;
	RunLengthT *h_relative_offsets = nullptr;

	using AgentTestBlockRunLengthDecodeT = AgentTestBlockRunLengthDecode<ItemItT,
	RunLengthsItT,
	RUNS_PER_THREAD,
	DECODED_ITEMS_PER_THREAD,
	TEST_RELATIVE_OFFSETS,
	TEST_RUN_OFFSETS,
	THREADS_PER_BLOCK,
	1,
	1>;

	enum : uint32_t
	{
	TIMER_SIZE_BEGIN = 0,
	TIMER_SIZE_END,
	TIMER_DECODE_BEGIN,
	TIMER_DECODE_END,
	NUM_TIMERS,
	};

	// Get temporary storage requirements for the scan (for computing offsets for the per-block
	// run-length decoded items)
	cub::DeviceScan::InclusiveSum(nullptr,
	temp_storage_bytes,
	d_decoded_sizes,
	d_decoded_offsets,
	num_blocks);

	// Allocate device memory
	CubDebugExit(cudaMalloc(&temp_storage, temp_storage_bytes));
	CubDebugExit(cudaMalloc(&d_decoded_sizes, num_blocks * sizeof(*d_decoded_sizes)));
	// Allocate for the exclusive sum PLUS the overall aggregate
	CubDebugExit(cudaMalloc(&d_decoded_offsets, (num_blocks + 1) * sizeof(*d_decoded_offsets)));
	CubDebugExit(cudaMallocHost(&h_num_decoded_total, sizeof(*h_num_decoded_total)));

	// Get the per-block number of items being decoded (i-th thread block writing size to
	// d_decoded_sizes[i])
	BlockRunLengthDecodeGetSizeKernel<AgentTestBlockRunLengthDecodeT>
	<<<num_blocks, THREADS_PER_BLOCK, 0U>>>(d_unique_items,
	d_run_lengths,
	num_runs,
	d_decoded_sizes);

	// Compute offsets for the runs decoded by each block (exclusive sum + aggregate)
	CubDebugExit(cudaMemsetAsync(d_decoded_offsets, 0, sizeof(d_decoded_offsets[0])));
	CubDebugExit(cub::DeviceScan::InclusiveSum(temp_storage,
	temp_storage_bytes,
	d_decoded_sizes,
	&d_decoded_offsets[1],
	num_blocks));

	// Copy the total decoded size to CPU in order to allocate just the right amount of device memory
	CubDebugExit(cudaMemcpy(h_num_decoded_total,
	&d_decoded_offsets[num_blocks],
	sizeof(*h_num_decoded_total),
	cudaMemcpyDeviceToHost));

	// Allocate device memory for the run-length decoded output
	CubDebugExit(cudaMallocHost(&h_decoded_out, (h_num_decoded_total) sizeof(RunItemT)));
	CubDebugExit(cudaMalloc(&d_decoded_out, (h_num_decoded_total) sizeof(RunItemT)));
	if (TEST_RELATIVE_OFFSETS)
	{
	CubDebugExit(cudaMalloc(&d_relative_offsets, (h_num_decoded_total) sizeof(RunLengthT)));
	CubDebugExit(cudaMallocHost(&h_relative_offsets, (h_num_decoded_total) sizeof(RunLengthT)));
	}

	// Perform the block-wise run-length decoding (each block taking its offset from
	// d_decoded_offsets)
	BlockRunLengthDecodeTestKernel<AgentTestBlockRunLengthDecodeT>
	<<<num_blocks, THREADS_PER_BLOCK, 0U>>>(d_unique_items,
	d_run_lengths,
	d_decoded_offsets,
	num_runs,
	d_decoded_out,
	d_relative_offsets);

	// Copy back results for verification
	CubDebugExit(cudaMemcpy(h_decoded_out,
	d_decoded_out,
	(h_num_decoded_total) sizeof(*h_decoded_out),
	cudaMemcpyDeviceToHost));

	if (TEST_RELATIVE_OFFSETS)
	{
	// Copy back the relative offsets
	CubDebugExit(cudaMemcpy(h_relative_offsets,
	d_relative_offsets,
	(h_num_decoded_total) sizeof(*h_relative_offsets),
	cudaMemcpyDeviceToHost));
	}

	// Generate host-side run-length decoded data for verification
	std::vector<std::pair<RunItemT, RunLengthT>> host_golden;
	host_golden.reserve(*h_num_decoded_total);
	for (uint32_t run = 0; run < num_runs; run++)
	{
	for (RunLengthT i = 0; i < d_run_lengths[run]; i++)
	{
	host_golden.push_back({d_unique_items[run], i});
	}
	}

	// Verify the total run-length decoded size is correct
	REQUIRE(host_golden.size() == h_num_decoded_total[0]);

	// Verify the run-length decoded data is correct
	bool cmp_eq = true;
	for (uint32_t i = 0; i < host_golden.size(); i++)
	{
	if (host_golden[i].first != h_decoded_out[i])
	{
	FAIL("Mismatch at #" << i << ": CPU item: " << host_golden[i].first
	<< ", GPU: " << h_decoded_out[i] << "\n");
	cmp_eq = false;
	}
	if (TEST_RELATIVE_OFFSETS)
	{
	if (host_golden[i].second != h_relative_offsets[i])
	{
	FAIL("Mismatch of relative offset at #" << i
	<< ": CPU item: " << host_golden[i].first << ", GPU: " << h_decoded_out[i]
	<< "; relative offsets: CPU: " << host_golden[i].second
	<< ", GPU: " << h_relative_offsets[i] << "\n");
	cmp_eq = false;
	break;
	}
	}
	}

	REQUIRE(cmp_eq == true);

	// Clean up memory allocations
	CubDebugExit(cudaFree(temp_storage));
	CubDebugExit(cudaFree(d_decoded_sizes));
	CubDebugExit(cudaFree(d_decoded_offsets));
	CubDebugExit(cudaFree(d_decoded_out));
	CubDebugExit(cudaFreeHost(h_num_decoded_total));
	CubDebugExit(cudaFreeHost(h_decoded_out));

	if (TEST_RELATIVE_OFFSETS)
	{
	CubDebugExit(cudaFree(d_relative_offsets));
	CubDebugExit(cudaFreeHost(h_relative_offsets));
	}
	}

	constexpr bool DO_TEST_RELATIVE_OFFSETS = true;
	constexpr bool DO_NOT_TEST_RELATIVE_OFFSETS = false;

	constexpr bool TEST_WITH_RUN_OFFSETS = true;
	constexpr bool TEST_WITH_RUN_LENGTHS = false;

	template <int RunsPerThread,
	int DecodedItemsPerThread,
	int BlockDimX,
	int BlockDimY = 1,
	int BlockDimZ = 1>
	struct params_t
	{
	static constexpr int runs_per_thread = RunsPerThread;
	static constexpr int decoded_items_per_thread = DecodedItemsPerThread;
	static constexpr int block_dim_x = BlockDimX;
	static constexpr int block_dim_y = BlockDimY;
	static constexpr int block_dim_z = BlockDimZ;
	};

	CUB_TEST_LIST("Block Run Length Decode works with run lengths and offsets relative to each run",
	"[rld][block]",
	params_t<1, 1, 64>,
	params_t<1, 3, 32, 2, 3>,
	params_t<1, 1, 128>,
	params_t<1, 8, 128>,
	params_t<3, 1, 256>,
	params_t<1, 8, 256>,
	params_t<8, 1, 256>,
	params_t<1, 1, 256>,
	params_t<2, 2, 384>)
	{
	using params = TestType;

	TestAlgorithmSpecialisation<params::runs_per_thread,
	params::decoded_items_per_thread,
	params::block_dim_x,
	params::block_dim_y,
	params::block_dim_z,
	TEST_WITH_RUN_LENGTHS,
	DO_TEST_RELATIVE_OFFSETS>();
	}

	CUB_TEST_LIST("Block Run Length Decode works with run lengths and performs normal run-length "
	"decoding",
	"[rld][block]",
	params_t<1, 1, 64>,
	params_t<1, 3, 32, 2, 3>,
	params_t<1, 1, 128>,
	params_t<1, 8, 128>,
	params_t<3, 1, 256>,
	params_t<1, 8, 256>,
	params_t<8, 1, 256>,
	params_t<1, 1, 256>,
	params_t<2, 2, 384>)
	{
	using params = TestType;

	TestAlgorithmSpecialisation<params::runs_per_thread,
	params::decoded_items_per_thread,
	params::block_dim_x,
	params::block_dim_y,
	params::block_dim_z,
	TEST_WITH_RUN_LENGTHS,
	DO_NOT_TEST_RELATIVE_OFFSETS>();
	}

	CUB_TEST_LIST("Block Run Length Decode works with run offsets and generates offsets relative to "
	"each run",
	"[rld][block]",
	params_t<1, 1, 64>,
	params_t<1, 3, 32, 2, 3>,
	params_t<1, 1, 128>,
	params_t<1, 8, 128>,
	params_t<3, 1, 256>,
	params_t<1, 8, 256>,
	params_t<8, 1, 256>,
	params_t<1, 1, 256>,
	params_t<2, 2, 384>)
	{
	using params = TestType;

	TestAlgorithmSpecialisation<params::runs_per_thread,
	params::decoded_items_per_thread,
	params::block_dim_x,
	params::block_dim_y,
	params::block_dim_z,
	TEST_WITH_RUN_OFFSETS,
	DO_TEST_RELATIVE_OFFSETS>();
	}

	CUB_TEST_LIST("Block Run Length Decode works with run offsets and performs normal run-length "
	"decoding",
	"[rld][block]",
	params_t<1, 1, 64>,
	params_t<1, 3, 32, 2, 3>,
	params_t<1, 1, 128>,
	params_t<1, 8, 128>,
	params_t<3, 1, 256>,
	params_t<1, 8, 256>,
	params_t<8, 1, 256>,
	params_t<1, 1, 256>,
	params_t<2, 2, 384>)
	{
	using params = TestType;

	TestAlgorithmSpecialisation<params::runs_per_thread,
	params::decoded_items_per_thread,
	params::block_dim_x,
	params::block_dim_y,
	params::block_dim_z,
	TEST_WITH_RUN_OFFSETS,
	DO_NOT_TEST_RELATIVE_OFFSETS>();
	}