thrust / dependencies /cub /test /catch2_test_block_adjacent_difference.cu
camenduru's picture
thanks to nvidia ❤
8ae5fc5
/******************************************************************************
* Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#include <cub/block/block_adjacent_difference.cuh>
#include <thrust/host_vector.h>
// Has to go after all cub headers. Otherwise, this test won't catch unused
// variables in cub kernels.
#include "catch2_test_helper.h"
template <int ThreadsInBlock,
int ItemsPerThread,
class T,
class ActionT>
__global__ void block_adj_diff_kernel(T *data, ActionT action, bool in_place)
{
using block_adjacent_differencet_t =
cub::BlockAdjacentDifference<T, ThreadsInBlock>;
using temp_storage_t = typename block_adjacent_differencet_t::TempStorage;
__shared__ temp_storage_t temp_storage;
T thread_in[ItemsPerThread];
T thread_out[ItemsPerThread];
const int thread_offset = static_cast<int>(threadIdx.x) * ItemsPerThread;
for (int item = 0; item < ItemsPerThread; item++)
{
thread_in[item] = data[thread_offset + item];
}
__syncthreads();
block_adjacent_differencet_t adj_diff(temp_storage);
if (in_place)
{
action(adj_diff, thread_in, thread_in);
for (unsigned int item = 0; item < ItemsPerThread; item++)
{
data[thread_offset + item] = thread_in[item];
}
}
else
{
action(adj_diff, thread_in, thread_out);
for (unsigned int item = 0; item < ItemsPerThread; item++)
{
data[thread_offset + item] = thread_out[item];
}
}
}
template <class T>
struct custom_difference_t
{
__host__ __device__ T operator()(const T &lhs, const T &rhs)
{
return lhs - rhs;
}
};
template <bool ReadLeft>
struct base_op_t
{
template <int ItemsPerThread, typename T, typename BlockAdjDiff>
__device__ void operator()(BlockAdjDiff &block_adj_diff,
T (&input)[ItemsPerThread],
T (&output)[ItemsPerThread]) const
{
if (ReadLeft)
{
block_adj_diff.SubtractLeft(input, output, custom_difference_t<T>{});
}
else
{
block_adj_diff.SubtractRight(input, output, custom_difference_t<T>{});
}
}
};
template <bool ReadLeft>
struct last_tile_op_t
{
int m_valid_items{};
__host__ last_tile_op_t(int valid_items)
: m_valid_items(valid_items)
{}
template <int ITEMS_PER_THREAD,
typename T,
typename BlockAdjDiff>
__device__ void operator()(BlockAdjDiff &block_adj_diff,
T (&input)[ITEMS_PER_THREAD],
T (&output)[ITEMS_PER_THREAD]) const
{
custom_difference_t<T> diff{};
if (ReadLeft)
{
block_adj_diff.SubtractLeftPartialTile(input,
output,
diff,
m_valid_items);
}
else
{
block_adj_diff.SubtractRightPartialTile(input,
output,
diff,
m_valid_items);
}
}
};
template <class T,
bool ReadLeft>
struct middle_tile_op_t
{
T m_neighbour_tile_value;
__host__ middle_tile_op_t(T neighbour_tile_value)
: m_neighbour_tile_value(neighbour_tile_value)
{}
template <int ITEMS_PER_THREAD, typename BlockAdjDiff>
__device__ void operator()(BlockAdjDiff &block_adj_diff,
T (&input)[ITEMS_PER_THREAD],
T (&output)[ITEMS_PER_THREAD]) const
{
custom_difference_t<T> diff{};
if (ReadLeft)
{
block_adj_diff.SubtractLeft(input,
output,
diff,
m_neighbour_tile_value);
}
else
{
block_adj_diff.SubtractRight(input,
output,
diff,
m_neighbour_tile_value);
}
}
};
template <typename T>
struct last_tile_with_pred_op_t
{
int m_valid_items;
T m_neighbour_tile_value;
__host__ last_tile_with_pred_op_t(
int valid_items,
T neighbour_tile_value)
: m_valid_items(valid_items)
, m_neighbour_tile_value(neighbour_tile_value)
{
}
template <int ITEMS_PER_THREAD, typename BlockAdjDiff>
__device__ void operator()(BlockAdjDiff &block_adj_diff,
T (&input)[ITEMS_PER_THREAD],
T (&output)[ITEMS_PER_THREAD]) const
{
custom_difference_t<T> diff{};
block_adj_diff.SubtractLeftPartialTile(input,
output,
diff,
m_valid_items,
m_neighbour_tile_value);
}
};
template <int ItemsPerThread, int ThreadsInBlock, class T, class ActionT>
void block_adj_diff(thrust::device_vector<T> &data,
bool in_place,
ActionT action)
{
block_adj_diff_kernel<ThreadsInBlock, ItemsPerThread, T, ActionT>
<<<1, ThreadsInBlock>>>(thrust::raw_pointer_cast(data.data()),
action,
in_place);
REQUIRE(cudaSuccess == cudaPeekAtLastError());
REQUIRE(cudaSuccess == cudaDeviceSynchronize());
}
template <bool ReadLeft,
class T>
void host_adj_diff(thrust::host_vector<T> &h_data, int valid_items)
{
custom_difference_t<T> diff{};
if (ReadLeft)
{
for (int i = valid_items - 1; i > 0; i--)
{
h_data[i] = diff(h_data[i], h_data[i - 1]);
}
}
else
{
for (int i = 0; i < valid_items - 1; i++)
{
h_data[i] = diff(h_data[i], h_data[i + 1]);
}
}
}
template <bool ReadLeft, class T>
void host_adj_diff(thrust::host_vector<T> &h_data,
int valid_items,
T neighbour_value)
{
custom_difference_t<T> diff{};
host_adj_diff<ReadLeft>(h_data, valid_items);
if (valid_items == 0)
{
return;
}
if (ReadLeft)
{
h_data[0] = diff(h_data[0], neighbour_value);
}
else
{
h_data[valid_items - 1] = diff(h_data[valid_items - 1], neighbour_value);
}
}
// %PARAM% THREADS_IN_BLOCK bs 64:256
using key_types =
c2h::type_list<std::uint16_t, std::int32_t, std::int64_t>;
using threads_in_block = c2h::enum_type_list<int, THREADS_IN_BLOCK>;
using items_per_thread = c2h::enum_type_list<int, 1, 2, 10, 15>;
using directions = c2h::enum_type_list<bool, false, true>;
using left_only = c2h::enum_type_list<bool, true>;
template <class TestType>
struct params_t
{
using key_t = typename c2h::get<0, TestType>;
static constexpr int items_per_thread = c2h::get<1, TestType>::value;
static constexpr int threads_in_block = c2h::get<2, TestType>::value;
static constexpr int tile_size = items_per_thread * threads_in_block;
static constexpr bool read_left = c2h::get<3, TestType>::value;
};
CUB_TEST("Block adjacent difference works with full tiles",
"[adjacent difference][block]",
key_types,
items_per_thread,
threads_in_block,
directions)
{
using params = params_t<TestType>;
using key_t = typename params::key_t;
thrust::device_vector<key_t> d_data(params::tile_size);
c2h::gen(CUB_SEED(10), d_data);
const bool in_place = GENERATE(false, true);
thrust::host_vector<key_t> h_data = d_data;
host_adj_diff<params::read_left>(h_data, params::tile_size);
block_adj_diff<params::items_per_thread, params::threads_in_block>(
d_data,
in_place,
base_op_t<params::read_left>{});
REQUIRE(h_data == d_data);
}
CUB_TEST("Block adjacent difference works with last tiles",
"[adjacent difference][block]",
key_types,
items_per_thread,
threads_in_block,
directions)
{
using params = params_t<TestType>;
using key_t = typename params::key_t;
thrust::device_vector<key_t> d_data(params::tile_size);
c2h::gen(CUB_SEED(10), d_data);
const bool in_place = GENERATE(false, true);
const int valid_items = GENERATE_COPY(take(10, random(0, params::tile_size)));
thrust::host_vector<key_t> h_data = d_data;
host_adj_diff<params::read_left>(h_data, valid_items);
block_adj_diff<params::items_per_thread, params::threads_in_block>(
d_data,
in_place,
last_tile_op_t<params::read_left>{valid_items});
REQUIRE(h_data == d_data);
}
CUB_TEST("Block adjacent difference works with single tiles",
"[adjacent difference][block]",
key_types,
items_per_thread,
threads_in_block,
left_only)
{
using params = params_t<TestType>;
using key_t = typename params::key_t;
thrust::device_vector<key_t> d_data(params::tile_size);
c2h::gen(CUB_SEED(10), d_data);
const bool in_place = GENERATE(false, true);
const int valid_items = GENERATE_COPY(take(10, random(0, params::tile_size)));
constexpr bool read_left = true;
thrust::host_vector<key_t> h_data = d_data;
key_t neighbour_value = h_data[h_data.size() / 2];
host_adj_diff<read_left>(h_data, valid_items, neighbour_value);
block_adj_diff<params::items_per_thread, params::threads_in_block>(
d_data,
in_place,
last_tile_with_pred_op_t<key_t>{valid_items, neighbour_value});
REQUIRE(h_data == d_data);
}
CUB_TEST("Block adjacent difference works with middle tiles",
"[adjacent difference][block]",
key_types,
items_per_thread,
threads_in_block,
directions)
{
using params = params_t<TestType>;
using key_t = typename params::key_t;
thrust::device_vector<key_t> d_data(params::tile_size);
c2h::gen(CUB_SEED(10), d_data);
const bool in_place = GENERATE(false, true);
thrust::host_vector<key_t> h_data = d_data;
key_t neighbour_value = h_data[h_data.size() / 2];
host_adj_diff<params::read_left>(h_data, params::tile_size, neighbour_value);
block_adj_diff<params::items_per_thread, params::threads_in_block>(
d_data,
in_place,
middle_tile_op_t<key_t, params::read_left>{neighbour_value});
REQUIRE(h_data == d_data);
}
CUB_TEST("Block adjacent difference supports custom types",
"[adjacent difference][block]",
threads_in_block)
{
using key_t = c2h::custom_type_t<c2h::equal_comparable_t,
c2h::subtractable_t>;
constexpr int items_per_thread = 2;
constexpr int threads_in_block = c2h::get<0, TestType>::value;
constexpr int tile_size = threads_in_block * items_per_thread;
constexpr bool read_left = true;
const bool in_place = true;
thrust::device_vector<key_t> d_data(tile_size);
c2h::gen(CUB_SEED(10), d_data);
thrust::host_vector<key_t> h_data = d_data;
host_adj_diff<read_left>(h_data, tile_size);
block_adj_diff<items_per_thread, threads_in_block>(d_data,
in_place,
base_op_t<read_left>{});
REQUIRE(h_data == d_data);
}
// TODO Test different input/output types