/****************************************************************************** * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Test of BlockHistogram utilities ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include #include #include // Has to go after all cub headers. Otherwise, this test won't catch unused // variables in cub kernels. #include "catch2_test_helper.h" template __global__ void block_histogram_kernel(T *d_samples, HistoCounter *d_histogram) { // Parameterize BlockHistogram type for our thread block using block_histogram_t = cub::BlockHistogram; // Allocate temp storage in shared memory __shared__ typename block_histogram_t::TempStorage temp_storage; // Per-thread tile data T data[ITEMS_PER_THREAD]; cub::LoadDirectStriped(threadIdx.x, d_samples, data); // Test histo (writing directly to histogram buffer in global) block_histogram_t(temp_storage).Histogram(data, d_histogram); } template void block_histogram(thrust::device_vector &d_samples, thrust::device_vector &d_histogram) { block_histogram_kernel <<<1, ThreadsInBlock>>>(thrust::raw_pointer_cast(d_samples.data()), thrust::raw_pointer_cast(d_histogram.data())); REQUIRE(cudaSuccess == cudaPeekAtLastError()); REQUIRE(cudaSuccess == cudaDeviceSynchronize()); } // %PARAM% TEST_BINS bins 32:256:1024 using types = c2h::type_list; using threads_in_block = c2h::enum_type_list; using items_per_thread = c2h::enum_type_list; using bins = c2h::enum_type_list; using algorithms = c2h::enum_type_list; template struct params_t { using sample_t = typename c2h::get<0, TestType>; static constexpr int items_per_thread = c2h::get<1, TestType>::value; static constexpr int threads_in_block = c2h::get<2, TestType>::value; static constexpr int bins = c2h::get<3, TestType>::value; static constexpr int num_samples = threads_in_block * items_per_thread; static constexpr cub::BlockHistogramAlgorithm algorithm = c2h::get<4, TestType>::value; }; CUB_TEST("Block histogram can be computed with uniform input", "[histogram][block]", types, items_per_thread, threads_in_block, bins, algorithms) { using params = params_t; using sample_t = typename params::sample_t; const sample_t uniform_value = static_cast(GENERATE_COPY(take(10, random(0, params::bins - 1)))); thrust::host_vector h_samples(params::num_samples, uniform_value); thrust::host_vector h_reference(params::bins); h_reference[static_cast(uniform_value)] = params::num_samples; // Allocate problem device arrays thrust::device_vector d_samples = h_samples; thrust::device_vector d_histogram(params::bins); // Run kernel block_histogram(d_samples, d_histogram); REQUIRE(h_reference == d_histogram); } template thrust::host_vector compute_host_reference(int bins, const thrust::host_vector &h_samples) { thrust::host_vector h_reference(bins); for (const SampleT &sample : h_samples) { h_reference[sample]++; } return h_reference; } CUB_TEST("Block histogram can be computed with modulo input", "[histogram][block]", types, items_per_thread, threads_in_block, bins, algorithms) { using params = params_t; using sample_t = typename params::sample_t; // Allocate problem device arrays thrust::device_vector d_histogram(params::bins); thrust::device_vector d_samples(params::num_samples); c2h::gen(c2h::modulo_t{params::bins}, d_samples); thrust::host_vector h_samples = d_samples; auto h_reference = compute_host_reference(params::bins, h_samples); // Run kernel block_histogram(d_samples, d_histogram); REQUIRE(h_reference == d_histogram); } CUB_TEST("Block histogram can be computed with random input", "[histogram][block]", types, items_per_thread, threads_in_block, bins, algorithms) { using params = params_t; using sample_t = typename params::sample_t; // Allocate problem device arrays thrust::device_vector d_histogram(params::bins); thrust::device_vector d_samples(params::num_samples); const sample_t min_bin = static_cast(0); const sample_t max_bin = static_cast( std::min(static_cast(std::numeric_limits::max()), static_cast(params::bins - 1))); c2h::gen(CUB_SEED(10), d_samples, min_bin, max_bin); thrust::host_vector h_samples = d_samples; auto h_reference = compute_host_reference(params::bins, h_samples); // Run kernel block_histogram(d_samples, d_histogram); REQUIRE(h_reference == d_histogram); }