| /****************************************************************************** | |
| * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. | |
| * | |
| * Redistribution and use in source and binary forms, with or without | |
| * modification, are permitted provided that the following conditions are met: | |
| * * Redistributions of source code must retain the above copyright | |
| * notice, this list of conditions and the following disclaimer. | |
| * * Redistributions in binary form must reproduce the above copyright | |
| * notice, this list of conditions and the following disclaimer in the | |
| * documentation and/or other materials provided with the distribution. | |
| * * Neither the name of the NVIDIA CORPORATION nor the | |
| * names of its contributors may be used to endorse or promote products | |
| * derived from this software without specific prior written permission. | |
| * | |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND | |
| * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
| * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
| * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY | |
| * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES | |
| * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |
| * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND | |
| * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | |
| * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
| * | |
| ******************************************************************************/ | |
| #include <cub/block/block_load.cuh> | |
| #include <cub/iterator/cache_modified_input_iterator.cuh> | |
| #include <cub/util_allocator.cuh> | |
| // Has to go after all cub headers. Otherwise, this test won't catch unused | |
| // variables in cub kernels. | |
| #include "catch2_test_helper.h" | |
| template <int ItemsPerThread, | |
| int ThreadsInBlock, | |
| cub::BlockLoadAlgorithm LoadAlgorithm> | |
| struct output_idx | |
| { | |
| static __device__ int get(int item) | |
| { | |
| return static_cast<int>(threadIdx.x) * ItemsPerThread + item; | |
| } | |
| }; | |
| template <int ItemsPerThread, int ThreadsInBlock> | |
| struct output_idx<ItemsPerThread, | |
| ThreadsInBlock, | |
| cub::BlockLoadAlgorithm::BLOCK_LOAD_STRIPED> | |
| { | |
| static __device__ int get(int item) | |
| { | |
| return static_cast<int>(threadIdx.x) + ThreadsInBlock * item; | |
| } | |
| }; | |
| template <typename InputIteratorT, | |
| typename OutputIteratorT, | |
| int ItemsPerThread, | |
| int ThreadsInBlock, | |
| cub::BlockLoadAlgorithm LoadAlgorithm> | |
| __global__ void kernel(std::integral_constant<bool, true>, | |
| InputIteratorT input, | |
| OutputIteratorT output, | |
| int num_items) | |
| { | |
| using input_t = cub::detail::value_t<InputIteratorT>; | |
| using block_load_t = cub::BlockLoad<input_t, | |
| ThreadsInBlock, | |
| ItemsPerThread, | |
| LoadAlgorithm>; | |
| using storage_t = | |
| typename block_load_t::TempStorage; | |
| __shared__ storage_t storage; | |
| block_load_t block_load(storage); | |
| input_t data[ItemsPerThread]; | |
| if (ItemsPerThread * ThreadsInBlock == num_items) | |
| { | |
| block_load.Load(input, data); | |
| } | |
| else | |
| { | |
| block_load.Load(input, data, num_items); | |
| } | |
| for (int i = 0; i < ItemsPerThread; i++) | |
| { | |
| const int idx = | |
| output_idx<ItemsPerThread, ThreadsInBlock, LoadAlgorithm>::get(i); | |
| if (idx < num_items) | |
| { | |
| output[idx] = data[i]; | |
| } | |
| } | |
| } | |
| template <typename InputIteratorT, | |
| typename OutputIteratorT, | |
| int ItemsPerThread, | |
| int ThreadsInBlock, | |
| cub::BlockLoadAlgorithm /* LoadAlgorithm */> | |
| __global__ void kernel(std::integral_constant<bool, false>, | |
| InputIteratorT input, | |
| OutputIteratorT output, | |
| int num_items) | |
| { | |
| for (int i = 0; i < ItemsPerThread; i++) | |
| { | |
| const int idx = | |
| output_idx<ItemsPerThread, | |
| ThreadsInBlock, | |
| cub::BlockLoadAlgorithm::BLOCK_LOAD_DIRECT>::get(i); | |
| if (idx < num_items) | |
| { | |
| output[idx] = input[idx]; | |
| } | |
| } | |
| } | |
| template <int ItemsPerThread, | |
| int ThreadsInBlock, | |
| cub::BlockLoadAlgorithm LoadAlgorithm, | |
| typename InputIteratorT, | |
| typename OutputIteratorT> | |
| void block_load( | |
| InputIteratorT input, | |
| OutputIteratorT output, | |
| int num_items) | |
| { | |
| using input_t = cub::detail::value_t<InputIteratorT>; | |
| using block_load_t = cub::BlockLoad<input_t, | |
| ThreadsInBlock, | |
| ItemsPerThread, | |
| LoadAlgorithm>; | |
| using storage_t = typename block_load_t::TempStorage; | |
| constexpr bool sufficient_resources = sizeof(storage_t) <= 1024 * 48; | |
| kernel<InputIteratorT, | |
| OutputIteratorT, | |
| ItemsPerThread, | |
| ThreadsInBlock, | |
| LoadAlgorithm><<<1, ThreadsInBlock>>>( | |
| std::integral_constant<bool, sufficient_resources>{}, | |
| input, output, num_items); | |
| REQUIRE( cudaSuccess == cudaPeekAtLastError() ); | |
| REQUIRE( cudaSuccess == cudaDeviceSynchronize() ); | |
| } | |
| // %PARAM% IPT it 1:11 | |
| using types = c2h::type_list<std::uint8_t, std::int32_t, std::int64_t>; | |
| using vec_types = c2h::type_list<long2, double2>; | |
| using even_threads_in_block = c2h::enum_type_list<int, 32, 128>; | |
| using odd_threads_in_block = c2h::enum_type_list<int, 15, 65>; | |
| using a_block_size = c2h::enum_type_list<int, 256>; | |
| using items_per_thread = c2h::enum_type_list<int, IPT>; | |
| using load_algorithm = c2h::enum_type_list< | |
| cub::BlockLoadAlgorithm, | |
| cub::BlockLoadAlgorithm::BLOCK_LOAD_DIRECT, | |
| cub::BlockLoadAlgorithm::BLOCK_LOAD_STRIPED, | |
| cub::BlockLoadAlgorithm::BLOCK_LOAD_VECTORIZE, | |
| cub::BlockLoadAlgorithm::BLOCK_LOAD_TRANSPOSE, | |
| cub::BlockLoadAlgorithm::BLOCK_LOAD_WARP_TRANSPOSE, | |
| cub::BlockLoadAlgorithm::BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED>; | |
| using odd_load_algorithm = c2h::enum_type_list< | |
| cub::BlockLoadAlgorithm, | |
| cub::BlockLoadAlgorithm::BLOCK_LOAD_DIRECT, | |
| cub::BlockLoadAlgorithm::BLOCK_LOAD_STRIPED, | |
| cub::BlockLoadAlgorithm::BLOCK_LOAD_VECTORIZE, | |
| cub::BlockLoadAlgorithm::BLOCK_LOAD_TRANSPOSE>; | |
| template <class TestType> | |
| struct params_t | |
| { | |
| using type = typename c2h::get<0, TestType>; | |
| static constexpr int items_per_thread = c2h::get<1, TestType>::value; | |
| static constexpr int threads_in_block = c2h::get<2, TestType>::value; | |
| static constexpr int tile_size = items_per_thread * threads_in_block; | |
| static constexpr cub::BlockLoadAlgorithm load_algorithm = | |
| c2h::get<3, TestType>::value; | |
| }; | |
| CUB_TEST("Block load works with even block sizes", | |
| "[load][block]", | |
| types, | |
| items_per_thread, | |
| even_threads_in_block, | |
| load_algorithm) | |
| { | |
| using params = params_t<TestType>; | |
| using type = typename params::type; | |
| thrust::device_vector<type> d_input( | |
| GENERATE_COPY(take(10, random(0, params::tile_size)))); | |
| c2h::gen(CUB_SEED(10), d_input); | |
| thrust::device_vector<type> d_output(d_input.size()); | |
| block_load<params::items_per_thread, | |
| params::threads_in_block, | |
| params::load_algorithm>(thrust::raw_pointer_cast(d_input.data()), | |
| thrust::raw_pointer_cast(d_output.data()), | |
| static_cast<int>(d_input.size())); | |
| REQUIRE( d_input == d_output ); | |
| } | |
| CUB_TEST("Block load works with even odd sizes", | |
| "[load][block]", | |
| types, | |
| items_per_thread, | |
| odd_threads_in_block, | |
| odd_load_algorithm) | |
| { | |
| using params = params_t<TestType>; | |
| using type = typename params::type; | |
| thrust::device_vector<type> d_input( | |
| GENERATE_COPY(take(10, random(0, params::tile_size)))); | |
| c2h::gen(CUB_SEED(10), d_input); | |
| thrust::device_vector<type> d_output(d_input.size()); | |
| block_load<params::items_per_thread, | |
| params::threads_in_block, | |
| params::load_algorithm>(thrust::raw_pointer_cast(d_input.data()), | |
| thrust::raw_pointer_cast(d_output.data()), | |
| static_cast<int>(d_input.size())); | |
| REQUIRE( d_input == d_output ); | |
| } | |
| CUB_TEST("Block load works with even vector types", | |
| "[load][block]", | |
| vec_types, | |
| items_per_thread, | |
| a_block_size, | |
| load_algorithm) | |
| { | |
| using params = params_t<TestType>; | |
| using type = typename params::type; | |
| thrust::device_vector<type> d_input( | |
| GENERATE_COPY(take(10, random(0, params::tile_size)))); | |
| c2h::gen(CUB_SEED(10), d_input); | |
| thrust::device_vector<type> d_output(d_input.size()); | |
| block_load<params::items_per_thread, | |
| params::threads_in_block, | |
| params::load_algorithm>(thrust::raw_pointer_cast(d_input.data()), | |
| thrust::raw_pointer_cast(d_output.data()), | |
| static_cast<int>(d_input.size())); | |
| REQUIRE( d_input == d_output ); | |
| } | |
| CUB_TEST("Block load works with custom types", | |
| "[load][block]", | |
| items_per_thread, | |
| load_algorithm) | |
| { | |
| using type = c2h::custom_type_t<c2h::equal_comparable_t>; | |
| constexpr int items_per_thread = c2h::get<0, TestType>::value; | |
| constexpr int threads_in_block = 64; | |
| constexpr int tile_size = items_per_thread * threads_in_block; | |
| static constexpr cub::BlockLoadAlgorithm load_algorithm = | |
| c2h::get<1, TestType>::value; | |
| thrust::device_vector<type> d_input( | |
| GENERATE_COPY(take(10, random(0, tile_size)))); | |
| c2h::gen(CUB_SEED(10), d_input); | |
| thrust::device_vector<type> d_output(d_input.size()); | |
| block_load<items_per_thread, | |
| threads_in_block, | |
| load_algorithm>(thrust::raw_pointer_cast(d_input.data()), | |
| thrust::raw_pointer_cast(d_output.data()), | |
| static_cast<int>(d_input.size())); | |
| REQUIRE( d_input == d_output ); | |
| } | |
| CUB_TEST("Block load works with caching iterators", | |
| "[load][block]", | |
| items_per_thread, | |
| load_algorithm) | |
| { | |
| using type = int; | |
| constexpr int items_per_thread = c2h::get<0, TestType>::value; | |
| constexpr int threads_in_block = 64; | |
| constexpr int tile_size = items_per_thread * threads_in_block; | |
| static constexpr cub::BlockLoadAlgorithm load_algorithm = | |
| c2h::get<1, TestType>::value; | |
| thrust::device_vector<type> d_input( | |
| GENERATE_COPY(take(10, random(0, tile_size)))); | |
| c2h::gen(CUB_SEED(10), d_input); | |
| cub::CacheModifiedInputIterator<cub::CacheLoadModifier::LOAD_DEFAULT, type> in( | |
| thrust::raw_pointer_cast(d_input.data())); | |
| thrust::device_vector<type> d_output(d_input.size()); | |
| block_load<items_per_thread, | |
| threads_in_block, | |
| load_algorithm>(in, | |
| thrust::raw_pointer_cast(d_output.data()), | |
| static_cast<int>(d_input.size())); | |
| REQUIRE( d_input == d_output ); | |
| } | |