/****************************************************************************** * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #include #include // Has to go after all cub headers. Otherwise, this test won't catch unused // variables in cub kernels. #include "catch2_test_helper.h" template __global__ void block_adj_diff_kernel(T *data, ActionT action, bool in_place) { using block_adjacent_differencet_t = cub::BlockAdjacentDifference; using temp_storage_t = typename block_adjacent_differencet_t::TempStorage; __shared__ temp_storage_t temp_storage; T thread_in[ItemsPerThread]; T thread_out[ItemsPerThread]; const int thread_offset = static_cast(threadIdx.x) * ItemsPerThread; for (int item = 0; item < ItemsPerThread; item++) { thread_in[item] = data[thread_offset + item]; } __syncthreads(); block_adjacent_differencet_t adj_diff(temp_storage); if (in_place) { action(adj_diff, thread_in, thread_in); for (unsigned int item = 0; item < ItemsPerThread; item++) { data[thread_offset + item] = thread_in[item]; } } else { action(adj_diff, thread_in, thread_out); for (unsigned int item = 0; item < ItemsPerThread; item++) { data[thread_offset + item] = thread_out[item]; } } } template struct custom_difference_t { __host__ __device__ T operator()(const T &lhs, const T &rhs) { return lhs - rhs; } }; template struct base_op_t { template __device__ void operator()(BlockAdjDiff &block_adj_diff, T (&input)[ItemsPerThread], T (&output)[ItemsPerThread]) const { if (ReadLeft) { block_adj_diff.SubtractLeft(input, output, custom_difference_t{}); } else { block_adj_diff.SubtractRight(input, output, custom_difference_t{}); } } }; template struct last_tile_op_t { int m_valid_items{}; __host__ last_tile_op_t(int valid_items) : m_valid_items(valid_items) {} template __device__ void operator()(BlockAdjDiff &block_adj_diff, T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD]) const { custom_difference_t diff{}; if (ReadLeft) { block_adj_diff.SubtractLeftPartialTile(input, output, diff, m_valid_items); } else { block_adj_diff.SubtractRightPartialTile(input, output, diff, m_valid_items); } } }; template struct middle_tile_op_t { T m_neighbour_tile_value; __host__ middle_tile_op_t(T neighbour_tile_value) : m_neighbour_tile_value(neighbour_tile_value) {} template __device__ void operator()(BlockAdjDiff &block_adj_diff, T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD]) const { custom_difference_t diff{}; if (ReadLeft) { block_adj_diff.SubtractLeft(input, output, diff, m_neighbour_tile_value); } else { block_adj_diff.SubtractRight(input, output, diff, m_neighbour_tile_value); } } }; template struct last_tile_with_pred_op_t { int m_valid_items; T m_neighbour_tile_value; __host__ last_tile_with_pred_op_t( int valid_items, T neighbour_tile_value) : m_valid_items(valid_items) , m_neighbour_tile_value(neighbour_tile_value) { } template __device__ void operator()(BlockAdjDiff &block_adj_diff, T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD]) const { custom_difference_t diff{}; block_adj_diff.SubtractLeftPartialTile(input, output, diff, m_valid_items, m_neighbour_tile_value); } }; template void block_adj_diff(thrust::device_vector &data, bool in_place, ActionT action) { block_adj_diff_kernel <<<1, ThreadsInBlock>>>(thrust::raw_pointer_cast(data.data()), action, in_place); REQUIRE(cudaSuccess == cudaPeekAtLastError()); REQUIRE(cudaSuccess == cudaDeviceSynchronize()); } template void host_adj_diff(thrust::host_vector &h_data, int valid_items) { custom_difference_t diff{}; if (ReadLeft) { for (int i = valid_items - 1; i > 0; i--) { h_data[i] = diff(h_data[i], h_data[i - 1]); } } else { for (int i = 0; i < valid_items - 1; i++) { h_data[i] = diff(h_data[i], h_data[i + 1]); } } } template void host_adj_diff(thrust::host_vector &h_data, int valid_items, T neighbour_value) { custom_difference_t diff{}; host_adj_diff(h_data, valid_items); if (valid_items == 0) { return; } if (ReadLeft) { h_data[0] = diff(h_data[0], neighbour_value); } else { h_data[valid_items - 1] = diff(h_data[valid_items - 1], neighbour_value); } } // %PARAM% THREADS_IN_BLOCK bs 64:256 using key_types = c2h::type_list; using threads_in_block = c2h::enum_type_list; using items_per_thread = c2h::enum_type_list; using directions = c2h::enum_type_list; using left_only = c2h::enum_type_list; template struct params_t { using key_t = typename c2h::get<0, TestType>; static constexpr int items_per_thread = c2h::get<1, TestType>::value; static constexpr int threads_in_block = c2h::get<2, TestType>::value; static constexpr int tile_size = items_per_thread * threads_in_block; static constexpr bool read_left = c2h::get<3, TestType>::value; }; CUB_TEST("Block adjacent difference works with full tiles", "[adjacent difference][block]", key_types, items_per_thread, threads_in_block, directions) { using params = params_t; using key_t = typename params::key_t; thrust::device_vector d_data(params::tile_size); c2h::gen(CUB_SEED(10), d_data); const bool in_place = GENERATE(false, true); thrust::host_vector h_data = d_data; host_adj_diff(h_data, params::tile_size); block_adj_diff( d_data, in_place, base_op_t{}); REQUIRE(h_data == d_data); } CUB_TEST("Block adjacent difference works with last tiles", "[adjacent difference][block]", key_types, items_per_thread, threads_in_block, directions) { using params = params_t; using key_t = typename params::key_t; thrust::device_vector d_data(params::tile_size); c2h::gen(CUB_SEED(10), d_data); const bool in_place = GENERATE(false, true); const int valid_items = GENERATE_COPY(take(10, random(0, params::tile_size))); thrust::host_vector h_data = d_data; host_adj_diff(h_data, valid_items); block_adj_diff( d_data, in_place, last_tile_op_t{valid_items}); REQUIRE(h_data == d_data); } CUB_TEST("Block adjacent difference works with single tiles", "[adjacent difference][block]", key_types, items_per_thread, threads_in_block, left_only) { using params = params_t; using key_t = typename params::key_t; thrust::device_vector d_data(params::tile_size); c2h::gen(CUB_SEED(10), d_data); const bool in_place = GENERATE(false, true); const int valid_items = GENERATE_COPY(take(10, random(0, params::tile_size))); constexpr bool read_left = true; thrust::host_vector h_data = d_data; key_t neighbour_value = h_data[h_data.size() / 2]; host_adj_diff(h_data, valid_items, neighbour_value); block_adj_diff( d_data, in_place, last_tile_with_pred_op_t{valid_items, neighbour_value}); REQUIRE(h_data == d_data); } CUB_TEST("Block adjacent difference works with middle tiles", "[adjacent difference][block]", key_types, items_per_thread, threads_in_block, directions) { using params = params_t; using key_t = typename params::key_t; thrust::device_vector d_data(params::tile_size); c2h::gen(CUB_SEED(10), d_data); const bool in_place = GENERATE(false, true); thrust::host_vector h_data = d_data; key_t neighbour_value = h_data[h_data.size() / 2]; host_adj_diff(h_data, params::tile_size, neighbour_value); block_adj_diff( d_data, in_place, middle_tile_op_t{neighbour_value}); REQUIRE(h_data == d_data); } CUB_TEST("Block adjacent difference supports custom types", "[adjacent difference][block]", threads_in_block) { using key_t = c2h::custom_type_t; constexpr int items_per_thread = 2; constexpr int threads_in_block = c2h::get<0, TestType>::value; constexpr int tile_size = threads_in_block * items_per_thread; constexpr bool read_left = true; const bool in_place = true; thrust::device_vector d_data(tile_size); c2h::gen(CUB_SEED(10), d_data); thrust::host_vector h_data = d_data; host_adj_diff(h_data, tile_size); block_adj_diff(d_data, in_place, base_op_t{}); REQUIRE(h_data == d_data); } // TODO Test different input/output types