thrust / dependencies /cub /test /catch2_test_block_adjacent_difference.cu

thanks to nvidia ❤

8ae5fc5 over 2 years ago

12.7 kB

	/******************************************************************************
	* Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions are met:
	* * Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* * Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	* * Neither the name of the NVIDIA CORPORATION nor the
	* names of its contributors may be used to endorse or promote products
	* derived from this software without specific prior written permission.
	*
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
	* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
	* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
	* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
	* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
	* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*
	******************************************************************************/

	#include <cub/block/block_adjacent_difference.cuh>

	#include <thrust/host_vector.h>

	// Has to go after all cub headers. Otherwise, this test won't catch unused
	// variables in cub kernels.
	#include "catch2_test_helper.h"

	template <int ThreadsInBlock,
	int ItemsPerThread,
	class T,
	class ActionT>
	__global__ void block_adj_diff_kernel(T *data, ActionT action, bool in_place)
	{
	using block_adjacent_differencet_t =
	cub::BlockAdjacentDifference<T, ThreadsInBlock>;
	using temp_storage_t = typename block_adjacent_differencet_t::TempStorage;

	__shared__ temp_storage_t temp_storage;

	T thread_in[ItemsPerThread];
	T thread_out[ItemsPerThread];

	const int thread_offset = static_cast<int>(threadIdx.x) * ItemsPerThread;

	for (int item = 0; item < ItemsPerThread; item++)
	{
	thread_in[item] = data[thread_offset + item];
	}
	__syncthreads();

	block_adjacent_differencet_t adj_diff(temp_storage);

	if (in_place)
	{
	action(adj_diff, thread_in, thread_in);

	for (unsigned int item = 0; item < ItemsPerThread; item++)
	{
	data[thread_offset + item] = thread_in[item];
	}
	}
	else
	{
	action(adj_diff, thread_in, thread_out);

	for (unsigned int item = 0; item < ItemsPerThread; item++)
	{
	data[thread_offset + item] = thread_out[item];
	}
	}
	}

	template <class T>
	struct custom_difference_t
	{
	__host__ __device__ T operator()(const T &lhs, const T &rhs)
	{
	return lhs - rhs;
	}
	};

	template <bool ReadLeft>
	struct base_op_t
	{
	template <int ItemsPerThread, typename T, typename BlockAdjDiff>
	__device__ void operator()(BlockAdjDiff &block_adj_diff,
	T (&input)[ItemsPerThread],
	T (&output)[ItemsPerThread]) const
	{
	if (ReadLeft)
	{
	block_adj_diff.SubtractLeft(input, output, custom_difference_t<T>{});
	}
	else
	{
	block_adj_diff.SubtractRight(input, output, custom_difference_t<T>{});
	}
	}
	};

	template <bool ReadLeft>
	struct last_tile_op_t
	{
	int m_valid_items{};

	__host__ last_tile_op_t(int valid_items)
	: m_valid_items(valid_items)
	{}

	template <int ITEMS_PER_THREAD,
	typename T,
	typename BlockAdjDiff>
	__device__ void operator()(BlockAdjDiff &block_adj_diff,
	T (&input)[ITEMS_PER_THREAD],
	T (&output)[ITEMS_PER_THREAD]) const
	{
	custom_difference_t<T> diff{};

	if (ReadLeft)
	{
	block_adj_diff.SubtractLeftPartialTile(input,
	output,
	diff,
	m_valid_items);
	}
	else
	{
	block_adj_diff.SubtractRightPartialTile(input,
	output,
	diff,
	m_valid_items);
	}
	}
	};

	template <class T,
	bool ReadLeft>
	struct middle_tile_op_t
	{
	T m_neighbour_tile_value;

	__host__ middle_tile_op_t(T neighbour_tile_value)
	: m_neighbour_tile_value(neighbour_tile_value)
	{}

	template <int ITEMS_PER_THREAD, typename BlockAdjDiff>
	__device__ void operator()(BlockAdjDiff &block_adj_diff,
	T (&input)[ITEMS_PER_THREAD],
	T (&output)[ITEMS_PER_THREAD]) const
	{
	custom_difference_t<T> diff{};

	if (ReadLeft)
	{
	block_adj_diff.SubtractLeft(input,
	output,
	diff,
	m_neighbour_tile_value);
	}
	else
	{
	block_adj_diff.SubtractRight(input,
	output,
	diff,
	m_neighbour_tile_value);
	}
	}
	};

	template <typename T>
	struct last_tile_with_pred_op_t
	{
	int m_valid_items;
	T m_neighbour_tile_value;

	__host__ last_tile_with_pred_op_t(
	int valid_items,
	T neighbour_tile_value)
	: m_valid_items(valid_items)
	, m_neighbour_tile_value(neighbour_tile_value)
	{
	}

	template <int ITEMS_PER_THREAD, typename BlockAdjDiff>
	__device__ void operator()(BlockAdjDiff &block_adj_diff,
	T (&input)[ITEMS_PER_THREAD],
	T (&output)[ITEMS_PER_THREAD]) const
	{
	custom_difference_t<T> diff{};
	block_adj_diff.SubtractLeftPartialTile(input,
	output,
	diff,
	m_valid_items,
	m_neighbour_tile_value);
	}
	};

	template <int ItemsPerThread, int ThreadsInBlock, class T, class ActionT>
	void block_adj_diff(thrust::device_vector<T> &data,
	bool in_place,
	ActionT action)
	{
	block_adj_diff_kernel<ThreadsInBlock, ItemsPerThread, T, ActionT>
	<<<1, ThreadsInBlock>>>(thrust::raw_pointer_cast(data.data()),
	action,
	in_place);

	REQUIRE(cudaSuccess == cudaPeekAtLastError());
	REQUIRE(cudaSuccess == cudaDeviceSynchronize());
	}

	template <bool ReadLeft,
	class T>
	void host_adj_diff(thrust::host_vector<T> &h_data, int valid_items)
	{
	custom_difference_t<T> diff{};

	if (ReadLeft)
	{
	for (int i = valid_items - 1; i > 0; i--)
	{
	h_data[i] = diff(h_data[i], h_data[i - 1]);
	}
	}
	else
	{
	for (int i = 0; i < valid_items - 1; i++)
	{
	h_data[i] = diff(h_data[i], h_data[i + 1]);
	}
	}
	}

	template <bool ReadLeft, class T>
	void host_adj_diff(thrust::host_vector<T> &h_data,
	int valid_items,
	T neighbour_value)
	{
	custom_difference_t<T> diff{};

	host_adj_diff<ReadLeft>(h_data, valid_items);

	if (valid_items == 0)
	{
	return;
	}

	if (ReadLeft)
	{
	h_data[0] = diff(h_data[0], neighbour_value);
	}
	else
	{
	h_data[valid_items - 1] = diff(h_data[valid_items - 1], neighbour_value);
	}
	}

	// %PARAM% THREADS_IN_BLOCK bs 64:256

	using key_types =
	c2h::type_list<std::uint16_t, std::int32_t, std::int64_t>;

	using threads_in_block = c2h::enum_type_list<int, THREADS_IN_BLOCK>;
	using items_per_thread = c2h::enum_type_list<int, 1, 2, 10, 15>;
	using directions = c2h::enum_type_list<bool, false, true>;
	using left_only = c2h::enum_type_list<bool, true>;

	template <class TestType>
	struct params_t
	{
	using key_t = typename c2h::get<0, TestType>;

	static constexpr int items_per_thread = c2h::get<1, TestType>::value;
	static constexpr int threads_in_block = c2h::get<2, TestType>::value;
	static constexpr int tile_size = items_per_thread * threads_in_block;
	static constexpr bool read_left = c2h::get<3, TestType>::value;
	};

	CUB_TEST("Block adjacent difference works with full tiles",
	"[adjacent difference][block]",
	key_types,
	items_per_thread,
	threads_in_block,
	directions)
	{
	using params = params_t<TestType>;
	using key_t = typename params::key_t;

	thrust::device_vector<key_t> d_data(params::tile_size);
	c2h::gen(CUB_SEED(10), d_data);

	const bool in_place = GENERATE(false, true);

	thrust::host_vector<key_t> h_data = d_data;
	host_adj_diff<params::read_left>(h_data, params::tile_size);

	block_adj_diff<params::items_per_thread, params::threads_in_block>(
	d_data,
	in_place,
	base_op_t<params::read_left>{});

	REQUIRE(h_data == d_data);
	}

	CUB_TEST("Block adjacent difference works with last tiles",
	"[adjacent difference][block]",
	key_types,
	items_per_thread,
	threads_in_block,
	directions)
	{
	using params = params_t<TestType>;
	using key_t = typename params::key_t;

	thrust::device_vector<key_t> d_data(params::tile_size);
	c2h::gen(CUB_SEED(10), d_data);

	const bool in_place = GENERATE(false, true);
	const int valid_items = GENERATE_COPY(take(10, random(0, params::tile_size)));

	thrust::host_vector<key_t> h_data = d_data;
	host_adj_diff<params::read_left>(h_data, valid_items);

	block_adj_diff<params::items_per_thread, params::threads_in_block>(
	d_data,
	in_place,
	last_tile_op_t<params::read_left>{valid_items});

	REQUIRE(h_data == d_data);
	}

	CUB_TEST("Block adjacent difference works with single tiles",
	"[adjacent difference][block]",
	key_types,
	items_per_thread,
	threads_in_block,
	left_only)
	{
	using params = params_t<TestType>;
	using key_t = typename params::key_t;

	thrust::device_vector<key_t> d_data(params::tile_size);
	c2h::gen(CUB_SEED(10), d_data);

	const bool in_place = GENERATE(false, true);
	const int valid_items = GENERATE_COPY(take(10, random(0, params::tile_size)));
	constexpr bool read_left = true;

	thrust::host_vector<key_t> h_data = d_data;
	key_t neighbour_value = h_data[h_data.size() / 2];

	host_adj_diff<read_left>(h_data, valid_items, neighbour_value);

	block_adj_diff<params::items_per_thread, params::threads_in_block>(
	d_data,
	in_place,
	last_tile_with_pred_op_t<key_t>{valid_items, neighbour_value});

	REQUIRE(h_data == d_data);
	}

	CUB_TEST("Block adjacent difference works with middle tiles",
	"[adjacent difference][block]",
	key_types,
	items_per_thread,
	threads_in_block,
	directions)
	{
	using params = params_t<TestType>;
	using key_t = typename params::key_t;

	thrust::device_vector<key_t> d_data(params::tile_size);
	c2h::gen(CUB_SEED(10), d_data);

	const bool in_place = GENERATE(false, true);

	thrust::host_vector<key_t> h_data = d_data;
	key_t neighbour_value = h_data[h_data.size() / 2];

	host_adj_diff<params::read_left>(h_data, params::tile_size, neighbour_value);

	block_adj_diff<params::items_per_thread, params::threads_in_block>(
	d_data,
	in_place,
	middle_tile_op_t<key_t, params::read_left>{neighbour_value});

	REQUIRE(h_data == d_data);
	}

	CUB_TEST("Block adjacent difference supports custom types",
	"[adjacent difference][block]",
	threads_in_block)
	{
	using key_t = c2h::custom_type_t<c2h::equal_comparable_t,
	c2h::subtractable_t>;

	constexpr int items_per_thread = 2;
	constexpr int threads_in_block = c2h::get<0, TestType>::value;
	constexpr int tile_size = threads_in_block * items_per_thread;
	constexpr bool read_left = true;
	const bool in_place = true;

	thrust::device_vector<key_t> d_data(tile_size);
	c2h::gen(CUB_SEED(10), d_data);

	thrust::host_vector<key_t> h_data = d_data;
	host_adj_diff<read_left>(h_data, tile_size);

	block_adj_diff<items_per_thread, threads_in_block>(d_data,
	in_place,
	base_op_t<read_left>{});

	REQUIRE(h_data == d_data);
	}

	// TODO Test different input/output types