File size: 7,919 Bytes

8ae5fc5

/******************************************************************************
* Copyright (c) 2011-2022, NVIDIA CORPORATION.  All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*     * Redistributions of source code must retain the above copyright
*       notice, this list of conditions and the following disclaimer.
*     * Redistributions in binary form must reproduce the above copyright
*       notice, this list of conditions and the following disclaimer in the
*       documentation and/or other materials provided with the distribution.
*     * Neither the name of the NVIDIA CORPORATION nor the
*       names of its contributors may be used to endorse or promote products
*       derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/

/******************************************************************************
 * Test of BlockHistogram utilities
 ******************************************************************************/

// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR

#include <limits>
#include <string>

#include <cub/block/block_histogram.cuh>

#include <thrust/device_vector.h>
#include <thrust/host_vector.h>

// Has to go after all cub headers. Otherwise, this test won't catch unused
// variables in cub kernels.
#include "catch2_test_helper.h"

template <int BINS,
          int BLOCK_THREADS,
          int ITEMS_PER_THREAD,
          cub::BlockHistogramAlgorithm ALGORITHM,
          typename T,
          typename HistoCounter>
__global__ void block_histogram_kernel(T *d_samples, HistoCounter *d_histogram)
{
  // Parameterize BlockHistogram type for our thread block
  using block_histogram_t =
    cub::BlockHistogram<T, BLOCK_THREADS, ITEMS_PER_THREAD, BINS, ALGORITHM>;

  // Allocate temp storage in shared memory
  __shared__ typename block_histogram_t::TempStorage temp_storage;

  // Per-thread tile data
  T data[ITEMS_PER_THREAD];
  cub::LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_samples, data);

  // Test histo (writing directly to histogram buffer in global)
  block_histogram_t(temp_storage).Histogram(data, d_histogram);
}

template <int ItemsPerThread,
          int ThreadsInBlock,
          int Bins,
          cub::BlockHistogramAlgorithm Algorithm,
          typename SampleT>
void block_histogram(thrust::device_vector<SampleT> &d_samples,
                     thrust::device_vector<int> &d_histogram)
{
  block_histogram_kernel<Bins, ThreadsInBlock, ItemsPerThread, Algorithm>
    <<<1, ThreadsInBlock>>>(thrust::raw_pointer_cast(d_samples.data()),
                            thrust::raw_pointer_cast(d_histogram.data()));

  REQUIRE(cudaSuccess == cudaPeekAtLastError());
  REQUIRE(cudaSuccess == cudaDeviceSynchronize());
}

// %PARAM% TEST_BINS bins 32:256:1024

using types = c2h::type_list<std::uint8_t, std::uint16_t>;
using threads_in_block = c2h::enum_type_list<int, 32, 96, 128>;
using items_per_thread = c2h::enum_type_list<int, 1, 5>;
using bins             = c2h::enum_type_list<int, TEST_BINS>;
using algorithms = c2h::enum_type_list<cub::BlockHistogramAlgorithm,
                                       cub::BLOCK_HISTO_SORT,
                                       cub::BLOCK_HISTO_ATOMIC>;

template <class TestType>
struct params_t
{
  using sample_t = typename c2h::get<0, TestType>;

  static constexpr int items_per_thread = c2h::get<1, TestType>::value;
  static constexpr int threads_in_block = c2h::get<2, TestType>::value;
  static constexpr int bins = c2h::get<3, TestType>::value;
  static constexpr int num_samples = threads_in_block * items_per_thread;
  static constexpr cub::BlockHistogramAlgorithm algorithm =
    c2h::get<4, TestType>::value;
};

CUB_TEST("Block histogram can be computed with uniform input",
         "[histogram][block]",
         types,
         items_per_thread,
         threads_in_block,
         bins,
         algorithms)
{
  using params   = params_t<TestType>;
  using sample_t = typename params::sample_t;

  const sample_t uniform_value =
    static_cast<sample_t>(GENERATE_COPY(take(10, random(0, params::bins - 1))));

  thrust::host_vector<sample_t> h_samples(params::num_samples, uniform_value);
  thrust::host_vector<int> h_reference(params::bins);
  h_reference[static_cast<std::size_t>(uniform_value)] = params::num_samples;

  // Allocate problem device arrays
  thrust::device_vector<sample_t> d_samples = h_samples;
  thrust::device_vector<int> d_histogram(params::bins);

  // Run kernel
  block_histogram<params::items_per_thread,
                  params::threads_in_block,
                  params::bins,
                  params::algorithm>(d_samples, d_histogram);

  REQUIRE(h_reference == d_histogram);
}

template <typename SampleT>
thrust::host_vector<int>
compute_host_reference(int bins, const thrust::host_vector<SampleT> &h_samples)
{
  thrust::host_vector<int> h_reference(bins);
  for (const SampleT &sample : h_samples)
  {
    h_reference[sample]++;
  }

  return h_reference;
}

CUB_TEST("Block histogram can be computed with modulo input",
         "[histogram][block]",
         types,
         items_per_thread,
         threads_in_block,
         bins,
         algorithms)
{
  using params   = params_t<TestType>;
  using sample_t = typename params::sample_t;

  // Allocate problem device arrays
  thrust::device_vector<int> d_histogram(params::bins);
  thrust::device_vector<sample_t> d_samples(params::num_samples);

  c2h::gen(c2h::modulo_t{params::bins}, d_samples);

  thrust::host_vector<sample_t> h_samples = d_samples;
  auto h_reference = compute_host_reference(params::bins, h_samples);

  // Run kernel
  block_histogram<params::items_per_thread,
                  params::threads_in_block,
                  params::bins,
                  params::algorithm>(d_samples, d_histogram);

  REQUIRE(h_reference == d_histogram);
}

CUB_TEST("Block histogram can be computed with random input",
         "[histogram][block]",
         types,
         items_per_thread,
         threads_in_block,
         bins,
         algorithms)
{
  using params   = params_t<TestType>;
  using sample_t = typename params::sample_t;

  // Allocate problem device arrays
  thrust::device_vector<int> d_histogram(params::bins);
  thrust::device_vector<sample_t> d_samples(params::num_samples);

  const sample_t min_bin = static_cast<sample_t>(0);
  const sample_t max_bin = 
    static_cast<sample_t>(
      std::min(static_cast<std::int32_t>(std::numeric_limits<sample_t>::max()),
               static_cast<std::int32_t>(params::bins - 1)));

  c2h::gen(CUB_SEED(10), d_samples, min_bin, max_bin);

  thrust::host_vector<sample_t> h_samples = d_samples;
  auto h_reference = compute_host_reference(params::bins, h_samples);

  // Run kernel
  block_histogram<params::items_per_thread,
                  params::threads_in_block,
                  params::bins,
                  params::algorithm>(d_samples, d_histogram);

  REQUIRE(h_reference == d_histogram);
}