// thrust/dependencies/cub/test/catch2_test_block_load.cu
/******************************************************************************
* Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#include <cub/block/block_load.cuh>
#include <cub/iterator/cache_modified_input_iterator.cuh>
#include <cub/util_allocator.cuh>
// Has to go after all cub headers. Otherwise, this test won't catch unused
// variables in cub kernels.
#include "catch2_test_helper.h"
// Maps a (thread, item-within-thread) pair to the linear position the item
// occupies in the output for blocked arrangements: each thread owns a
// contiguous run of ItemsPerThread elements.
template <int ItemsPerThread,
          int ThreadsInBlock,
          cub::BlockLoadAlgorithm LoadAlgorithm>
struct output_idx
{
  // Linear index of `item` inside the calling thread's contiguous run.
  static __device__ int get(int item)
  {
    const int thread_rank = static_cast<int>(threadIdx.x);
    return thread_rank * ItemsPerThread + item;
  }
};
// Specialization for BLOCK_LOAD_STRIPED: items are interleaved across the
// block, so consecutive items of one thread sit ThreadsInBlock apart.
template <int ItemsPerThread, int ThreadsInBlock>
struct output_idx<ItemsPerThread,
                  ThreadsInBlock,
                  cub::BlockLoadAlgorithm::BLOCK_LOAD_STRIPED>
{
  // Linear index of `item` in the striped arrangement.
  static __device__ int get(int item)
  {
    const int thread_rank = static_cast<int>(threadIdx.x);
    return item * ThreadsInBlock + thread_rank;
  }
};
// Test kernel used when the algorithm's temp storage fits the shared-memory
// budget (see the host-side dispatch): loads one tile via cub::BlockLoad and
// scatters each thread's items back to global memory in the arrangement the
// chosen algorithm produces. Expected launch: a single block of
// ThreadsInBlock threads.
template <typename InputIteratorT,
          typename OutputIteratorT,
          int ItemsPerThread,
          int ThreadsInBlock,
          cub::BlockLoadAlgorithm LoadAlgorithm>
__global__ void kernel(std::integral_constant<bool, true>,
                       InputIteratorT input,
                       OutputIteratorT output,
                       int num_items)
{
  using input_t      = cub::detail::value_t<InputIteratorT>;
  using block_load_t =
    cub::BlockLoad<input_t, ThreadsInBlock, ItemsPerThread, LoadAlgorithm>;

  __shared__ typename block_load_t::TempStorage temp_storage;
  block_load_t loader(temp_storage);

  input_t thread_data[ItemsPerThread];

  // Full tiles may use the unguarded overload; partial tiles must pass
  // num_items so elements past the end are not read.
  constexpr int tile_size = ItemsPerThread * ThreadsInBlock;
  if (tile_size == num_items)
  {
    loader.Load(input, thread_data);
  }
  else
  {
    loader.Load(input, thread_data, num_items);
  }

  for (int item = 0; item < ItemsPerThread; item++)
  {
    const int idx =
      output_idx<ItemsPerThread, ThreadsInBlock, LoadAlgorithm>::get(item);

    // Guard: on partial tiles the trailing per-thread slots are undefined.
    if (idx < num_items)
    {
      output[idx] = thread_data[item];
    }
  }
}
// Fallback test kernel used when BlockLoad's TempStorage would not fit the
// shared-memory budget: performs a plain guarded element copy so the test
// still produces correct output without instantiating the oversized
// BlockLoad specialization.
template <typename InputIteratorT,
          typename OutputIteratorT,
          int ItemsPerThread,
          int ThreadsInBlock,
          cub::BlockLoadAlgorithm /* LoadAlgorithm */>
__global__ void kernel(std::integral_constant<bool, false>,
                       InputIteratorT input,
                       OutputIteratorT output,
                       int num_items)
{
  // Direct (blocked) indexing regardless of the requested algorithm.
  using direct_idx_t = output_idx<ItemsPerThread,
                                  ThreadsInBlock,
                                  cub::BlockLoadAlgorithm::BLOCK_LOAD_DIRECT>;

  for (int item = 0; item < ItemsPerThread; item++)
  {
    const int idx = direct_idx_t::get(item);
    if (idx < num_items)
    {
      output[idx] = input[idx];
    }
  }
}
// Host-side launcher: decides at compile time whether the algorithm's temp
// storage fits the default 48 KiB per-block shared-memory budget and
// tag-dispatches to either the real BlockLoad kernel or the direct-copy
// fallback. Launches a single block and fails the enclosing Catch2 test on
// any launch or execution error.
template <int ItemsPerThread,
          int ThreadsInBlock,
          cub::BlockLoadAlgorithm LoadAlgorithm,
          typename InputIteratorT,
          typename OutputIteratorT>
void block_load(InputIteratorT input, OutputIteratorT output, int num_items)
{
  using input_t      = cub::detail::value_t<InputIteratorT>;
  using block_load_t =
    cub::BlockLoad<input_t, ThreadsInBlock, ItemsPerThread, LoadAlgorithm>;
  using storage_t    = typename block_load_t::TempStorage;

  // 48 KiB: the default static shared-memory limit per block.
  constexpr bool sufficient_resources = sizeof(storage_t) <= 48 * 1024;

  kernel<InputIteratorT,
         OutputIteratorT,
         ItemsPerThread,
         ThreadsInBlock,
         LoadAlgorithm>
    <<<1, ThreadsInBlock>>>(
      std::integral_constant<bool, sufficient_resources>{},
      input,
      output,
      num_items);

  // Peek (not get) so the error state is not cleared before the synchronize.
  REQUIRE( cudaSuccess == cudaPeekAtLastError() );
  REQUIRE( cudaSuccess == cudaDeviceSynchronize() );
}
// %PARAM% IPT it 1:11
// The %PARAM% directive above is consumed by the build harness: this file is
// compiled once per value in [1, 11], each time with IPT defined to that
// value, so every items-per-thread configuration gets its own test binary.

// Element types exercised by the scalar tests.
using types = c2h::type_list<std::uint8_t, std::int32_t, std::int64_t>;
// 16-byte vector types for the vectorized-load path.
using vec_types = c2h::type_list<long2, double2>;
// Block sizes that are multiples of the warp size.
using even_threads_in_block = c2h::enum_type_list<int, 32, 128>;
// Block sizes that are not multiples of the warp size.
using odd_threads_in_block = c2h::enum_type_list<int, 15, 65>;
using a_block_size = c2h::enum_type_list<int, 256>;
// Items per thread comes from the %PARAM%-generated IPT macro.
using items_per_thread = c2h::enum_type_list<int, IPT>;
// All load algorithms (warp-transpose variants require warp-multiple blocks).
using load_algorithm = c2h::enum_type_list<
cub::BlockLoadAlgorithm,
cub::BlockLoadAlgorithm::BLOCK_LOAD_DIRECT,
cub::BlockLoadAlgorithm::BLOCK_LOAD_STRIPED,
cub::BlockLoadAlgorithm::BLOCK_LOAD_VECTORIZE,
cub::BlockLoadAlgorithm::BLOCK_LOAD_TRANSPOSE,
cub::BlockLoadAlgorithm::BLOCK_LOAD_WARP_TRANSPOSE,
cub::BlockLoadAlgorithm::BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED>;
// Subset used with odd block sizes (excludes the warp-transpose variants).
using odd_load_algorithm = c2h::enum_type_list<
cub::BlockLoadAlgorithm,
cub::BlockLoadAlgorithm::BLOCK_LOAD_DIRECT,
cub::BlockLoadAlgorithm::BLOCK_LOAD_STRIPED,
cub::BlockLoadAlgorithm::BLOCK_LOAD_VECTORIZE,
cub::BlockLoadAlgorithm::BLOCK_LOAD_TRANSPOSE>;
// Unpacks the Catch2-generated TestType tuple into the named parameters a
// test case needs: element type, items per thread, block size, tile size,
// and the load algorithm under test.
template <class TestType>
struct params_t
{
using type = typename c2h::get<0, TestType>;
static constexpr int items_per_thread = c2h::get<1, TestType>::value;
static constexpr int threads_in_block = c2h::get<2, TestType>::value;
// Maximum number of elements a single block can load in one tile.
static constexpr int tile_size = items_per_thread * threads_in_block;
static constexpr cub::BlockLoadAlgorithm load_algorithm =
c2h::get<3, TestType>::value;
};
// Round-trips random data through BlockLoad with warp-multiple block sizes
// and every algorithm; output must equal input for full and partial tiles.
CUB_TEST("Block load works with even block sizes",
         "[load][block]",
         types,
         items_per_thread,
         even_threads_in_block,
         load_algorithm)
{
  using params = params_t<TestType>;
  using type   = typename params::type;

  // Problem sizes in [0, tile_size] exercise both guarded and unguarded loads.
  const int num_items = GENERATE_COPY(take(10, random(0, params::tile_size)));

  thrust::device_vector<type> d_input(num_items);
  c2h::gen(CUB_SEED(10), d_input);
  thrust::device_vector<type> d_output(d_input.size());

  block_load<params::items_per_thread,
             params::threads_in_block,
             params::load_algorithm>(thrust::raw_pointer_cast(d_input.data()),
                                     thrust::raw_pointer_cast(d_output.data()),
                                     static_cast<int>(d_input.size()));

  REQUIRE( d_input == d_output );
}
// Round-trips random data through BlockLoad with block sizes that are NOT a
// multiple of the warp size; warp-transpose algorithms are excluded because
// they require warp-multiple blocks.
// Fix: the test name previously read "works with even odd sizes" — a
// copy-paste leftover from the even-block-size test above.
CUB_TEST("Block load works with odd block sizes",
         "[load][block]",
         types,
         items_per_thread,
         odd_threads_in_block,
         odd_load_algorithm)
{
  using params = params_t<TestType>;
  using type   = typename params::type;

  // Problem sizes in [0, tile_size] exercise both guarded and unguarded loads.
  const int num_items = GENERATE_COPY(take(10, random(0, params::tile_size)));

  thrust::device_vector<type> d_input(num_items);
  c2h::gen(CUB_SEED(10), d_input);
  thrust::device_vector<type> d_output(d_input.size());

  block_load<params::items_per_thread,
             params::threads_in_block,
             params::load_algorithm>(thrust::raw_pointer_cast(d_input.data()),
                                     thrust::raw_pointer_cast(d_output.data()),
                                     static_cast<int>(d_input.size()));

  REQUIRE( d_input == d_output );
}
// Round-trips 16-byte vector types (long2, double2) through BlockLoad with a
// fixed 256-thread block; exercises the vectorized-load code paths.
CUB_TEST("Block load works with even vector types",
         "[load][block]",
         vec_types,
         items_per_thread,
         a_block_size,
         load_algorithm)
{
  using params = params_t<TestType>;
  using type   = typename params::type;

  // Problem sizes in [0, tile_size] exercise both guarded and unguarded loads.
  const int num_items = GENERATE_COPY(take(10, random(0, params::tile_size)));

  thrust::device_vector<type> d_input(num_items);
  c2h::gen(CUB_SEED(10), d_input);
  thrust::device_vector<type> d_output(d_input.size());

  block_load<params::items_per_thread,
             params::threads_in_block,
             params::load_algorithm>(thrust::raw_pointer_cast(d_input.data()),
                                     thrust::raw_pointer_cast(d_output.data()),
                                     static_cast<int>(d_input.size()));

  REQUIRE( d_input == d_output );
}
// Round-trips a non-trivial custom type through BlockLoad, verifying that
// the algorithms handle types without special vectorization support.
CUB_TEST("Block load works with custom types",
         "[load][block]",
         items_per_thread,
         load_algorithm)
{
  using type = c2h::custom_type_t<c2h::equal_comparable_t>;

  constexpr int items_per_thread = c2h::get<0, TestType>::value;
  constexpr int threads_in_block = 64;
  constexpr int tile_size        = items_per_thread * threads_in_block;
  static constexpr cub::BlockLoadAlgorithm load_algorithm =
    c2h::get<1, TestType>::value;

  // Problem sizes in [0, tile_size] exercise both guarded and unguarded loads.
  const int num_items = GENERATE_COPY(take(10, random(0, tile_size)));

  thrust::device_vector<type> d_input(num_items);
  c2h::gen(CUB_SEED(10), d_input);
  thrust::device_vector<type> d_output(d_input.size());

  block_load<items_per_thread,
             threads_in_block,
             load_algorithm>(thrust::raw_pointer_cast(d_input.data()),
                             thrust::raw_pointer_cast(d_output.data()),
                             static_cast<int>(d_input.size()));

  REQUIRE( d_input == d_output );
}
// Round-trips data read through a CacheModifiedInputIterator, verifying that
// BlockLoad accepts fancy iterators (not just raw pointers) on the input side.
CUB_TEST("Block load works with caching iterators",
         "[load][block]",
         items_per_thread,
         load_algorithm)
{
  using type = int;

  constexpr int items_per_thread = c2h::get<0, TestType>::value;
  constexpr int threads_in_block = 64;
  constexpr int tile_size        = items_per_thread * threads_in_block;
  static constexpr cub::BlockLoadAlgorithm load_algorithm =
    c2h::get<1, TestType>::value;

  // Problem sizes in [0, tile_size] exercise both guarded and unguarded loads.
  const int num_items = GENERATE_COPY(take(10, random(0, tile_size)));

  thrust::device_vector<type> d_input(num_items);
  c2h::gen(CUB_SEED(10), d_input);

  // Wrap the input in a cache-modified iterator (default load modifier).
  cub::CacheModifiedInputIterator<cub::CacheLoadModifier::LOAD_DEFAULT, type>
    in(thrust::raw_pointer_cast(d_input.data()));

  thrust::device_vector<type> d_output(d_input.size());

  block_load<items_per_thread,
             threads_in_block,
             load_algorithm>(in,
                             thrust::raw_pointer_cast(d_output.data()),
                             static_cast<int>(d_input.size()));

  REQUIRE( d_input == d_output );
}