thrust / dependencies /cub /test /catch2_test_block_histogram.cu
camenduru's picture
thanks to nvidia ❤
8ae5fc5
/******************************************************************************
* Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
* Test of BlockHistogram utilities
******************************************************************************/
// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR
#include <limits>
#include <string>
#include <cub/block/block_histogram.cuh>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
// Has to go after all cub headers. Otherwise, this test won't catch unused
// variables in cub kernels.
#include "catch2_test_helper.h"
template <int BINS,
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
cub::BlockHistogramAlgorithm ALGORITHM,
typename T,
typename HistoCounter>
__global__ void block_histogram_kernel(T *d_samples, HistoCounter *d_histogram)
{
// Parameterize BlockHistogram type for our thread block
using block_histogram_t =
cub::BlockHistogram<T, BLOCK_THREADS, ITEMS_PER_THREAD, BINS, ALGORITHM>;
// Allocate temp storage in shared memory
__shared__ typename block_histogram_t::TempStorage temp_storage;
// Per-thread tile data
T data[ITEMS_PER_THREAD];
cub::LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_samples, data);
// Test histo (writing directly to histogram buffer in global)
block_histogram_t(temp_storage).Histogram(data, d_histogram);
}
template <int ItemsPerThread,
int ThreadsInBlock,
int Bins,
cub::BlockHistogramAlgorithm Algorithm,
typename SampleT>
void block_histogram(thrust::device_vector<SampleT> &d_samples,
thrust::device_vector<int> &d_histogram)
{
block_histogram_kernel<Bins, ThreadsInBlock, ItemsPerThread, Algorithm>
<<<1, ThreadsInBlock>>>(thrust::raw_pointer_cast(d_samples.data()),
thrust::raw_pointer_cast(d_histogram.data()));
REQUIRE(cudaSuccess == cudaPeekAtLastError());
REQUIRE(cudaSuccess == cudaDeviceSynchronize());
}
// %PARAM% TEST_BINS bins 32:256:1024
using types = c2h::type_list<std::uint8_t, std::uint16_t>;
using threads_in_block = c2h::enum_type_list<int, 32, 96, 128>;
using items_per_thread = c2h::enum_type_list<int, 1, 5>;
using bins = c2h::enum_type_list<int, TEST_BINS>;
using algorithms = c2h::enum_type_list<cub::BlockHistogramAlgorithm,
cub::BLOCK_HISTO_SORT,
cub::BLOCK_HISTO_ATOMIC>;
template <class TestType>
struct params_t
{
using sample_t = typename c2h::get<0, TestType>;
static constexpr int items_per_thread = c2h::get<1, TestType>::value;
static constexpr int threads_in_block = c2h::get<2, TestType>::value;
static constexpr int bins = c2h::get<3, TestType>::value;
static constexpr int num_samples = threads_in_block * items_per_thread;
static constexpr cub::BlockHistogramAlgorithm algorithm =
c2h::get<4, TestType>::value;
};
CUB_TEST("Block histogram can be computed with uniform input",
"[histogram][block]",
types,
items_per_thread,
threads_in_block,
bins,
algorithms)
{
using params = params_t<TestType>;
using sample_t = typename params::sample_t;
const sample_t uniform_value =
static_cast<sample_t>(GENERATE_COPY(take(10, random(0, params::bins - 1))));
thrust::host_vector<sample_t> h_samples(params::num_samples, uniform_value);
thrust::host_vector<int> h_reference(params::bins);
h_reference[static_cast<std::size_t>(uniform_value)] = params::num_samples;
// Allocate problem device arrays
thrust::device_vector<sample_t> d_samples = h_samples;
thrust::device_vector<int> d_histogram(params::bins);
// Run kernel
block_histogram<params::items_per_thread,
params::threads_in_block,
params::bins,
params::algorithm>(d_samples, d_histogram);
REQUIRE(h_reference == d_histogram);
}
template <typename SampleT>
thrust::host_vector<int>
compute_host_reference(int bins, const thrust::host_vector<SampleT> &h_samples)
{
thrust::host_vector<int> h_reference(bins);
for (const SampleT &sample : h_samples)
{
h_reference[sample]++;
}
return h_reference;
}
CUB_TEST("Block histogram can be computed with modulo input",
"[histogram][block]",
types,
items_per_thread,
threads_in_block,
bins,
algorithms)
{
using params = params_t<TestType>;
using sample_t = typename params::sample_t;
// Allocate problem device arrays
thrust::device_vector<int> d_histogram(params::bins);
thrust::device_vector<sample_t> d_samples(params::num_samples);
c2h::gen(c2h::modulo_t{params::bins}, d_samples);
thrust::host_vector<sample_t> h_samples = d_samples;
auto h_reference = compute_host_reference(params::bins, h_samples);
// Run kernel
block_histogram<params::items_per_thread,
params::threads_in_block,
params::bins,
params::algorithm>(d_samples, d_histogram);
REQUIRE(h_reference == d_histogram);
}
CUB_TEST("Block histogram can be computed with random input",
"[histogram][block]",
types,
items_per_thread,
threads_in_block,
bins,
algorithms)
{
using params = params_t<TestType>;
using sample_t = typename params::sample_t;
// Allocate problem device arrays
thrust::device_vector<int> d_histogram(params::bins);
thrust::device_vector<sample_t> d_samples(params::num_samples);
const sample_t min_bin = static_cast<sample_t>(0);
const sample_t max_bin =
static_cast<sample_t>(
std::min(static_cast<std::int32_t>(std::numeric_limits<sample_t>::max()),
static_cast<std::int32_t>(params::bins - 1)));
c2h::gen(CUB_SEED(10), d_samples, min_bin, max_bin);
thrust::host_vector<sample_t> h_samples = d_samples;
auto h_reference = compute_host_reference(params::bins, h_samples);
// Run kernel
block_histogram<params::items_per_thread,
params::threads_in_block,
params::bins,
params::algorithm>(d_samples, d_histogram);
REQUIRE(h_reference == d_histogram);
}