// Source: thrust/dependencies/cub/test/test_device_histogram.cu (upstream commit 8ae5fc5)
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
* Test of DeviceHistogram utilities
******************************************************************************/
// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR
#include <cub/device/device_histogram.cuh>
#include <cub/iterator/constant_input_iterator.cuh>
#include <cub/util_allocator.cuh>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <cuda/std/type_traits>
#include <algorithm>
#include <limits>
#include <typeinfo>
#include "test_util.h"
#define TEST_HALF_T !_NVHPC_CUDA
#if TEST_HALF_T
#include <cuda_fp16.h>
#endif
using namespace cub;
//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------
// Dispatch types
enum Backend
{
    CUB = 0, // Dispatch directly to the CUB device entrypoints from the host
    CDP = 1, // Dispatch to the CUB entrypoints via GPU-side dynamic parallelism
};
// Print every generated input sample (very chatty; enable for small inputs only)
bool g_verbose_input = false;
// Print per-check diagnostics during result comparison
bool g_verbose = false;
// Number of timed iterations for the performance measurement (0 = skip timing)
int g_timing_iterations = 0;
// Caching allocator shared by all device allocations in this test
// (constructed with `true` — presumably skip_cleanup; see cub::CachingDeviceAllocator)
CachingDeviceAllocator g_allocator(true);
//---------------------------------------------------------------------
// Dispatch to different DeviceHistogram entrypoints
//---------------------------------------------------------------------
// Primary dispatch template, left undefined; specialized below on
// (NUM_ACTIVE_CHANNELS, NUM_CHANNELS, BACKEND) to route to the matching
// DeviceHistogram entrypoints.
template <int NUM_ACTIVE_CHANNELS, int NUM_CHANNELS, int BACKEND>
struct Dispatch;
/**
 * Dispatch helpers routing to the CUB multi-channel DeviceHistogram
 * entrypoints (MultiHistogramRange / MultiHistogramEven). Each method
 * invokes the underlying entrypoint timing_timing_iterations times (for
 * timing runs) and returns the error of the last invocation.
 */
template <int NUM_ACTIVE_CHANNELS, int NUM_CHANNELS>
struct Dispatch<NUM_ACTIVE_CHANNELS, NUM_CHANNELS, CUB>
{
    /**
     * Dispatch to CUB multi histogram-range entrypoint
     */
    template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
    //CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Range(
        int timing_timing_iterations,                   ///< [in] Number of times to invoke the entrypoint
        size_t * /*d_temp_storage_bytes*/,              ///< Unused here (CDP-backend plumbing)
        cudaError_t * /*d_cdp_error*/,                  ///< Unused here (CDP-backend plumbing)
        void* d_temp_storage,                           ///< [in] Device temp storage (NULL to query size)
        size_t& temp_storage_bytes,                     ///< [in,out] Size of temp storage
        SampleIteratorT d_samples,                      ///< [in] Interleaved multi-channel input samples
        CounterT *(&d_histogram)[NUM_ACTIVE_CHANNELS],  ///< [out] Per-channel histogram arrays (num_levels[i] - 1 bins each)
        int *num_levels,                                ///< [in] Per-channel number of level boundaries (bins = num_levels[i] - 1)
        LevelT *(&d_levels)[NUM_ACTIVE_CHANNELS],       ///< [in] Per-channel level-boundary arrays (lower inclusive, upper exclusive)
        OffsetT num_row_pixels,                         ///< [in] Multi-channel pixels per row
        OffsetT num_rows,                               ///< [in] Rows in the region of interest
        OffsetT row_stride_bytes)                       ///< [in] Bytes between starts of consecutive rows
    {
        cudaError_t error = cudaSuccess;
        for (int i = 0; i < timing_timing_iterations; ++i)
        {
            error = DeviceHistogram::MultiHistogramRange<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
                d_temp_storage,
                temp_storage_bytes,
                d_samples,
                d_histogram,
                num_levels,
                d_levels,
                num_row_pixels,
                num_rows,
                row_stride_bytes);
        }
        return error;
    }

#if TEST_HALF_T
    /**
     * Dispatch to CUB multi histogram-range entrypoint — half_t overload.
     * Reinterprets the test's half_t wrapper (defined in test_util.h,
     * presumably layout-compatible with __half) to exercise the native
     * __half entrypoint.
     */
    template <typename CounterT, typename OffsetT>
    //CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Range(
        int timing_timing_iterations,                   ///< [in] Number of times to invoke the entrypoint
        size_t * /*d_temp_storage_bytes*/,              ///< Unused here (CDP-backend plumbing)
        cudaError_t * /*d_cdp_error*/,                  ///< Unused here (CDP-backend plumbing)
        void* d_temp_storage,                           ///< [in] Device temp storage (NULL to query size)
        size_t& temp_storage_bytes,                     ///< [in,out] Size of temp storage
        half_t *d_samples,                              ///< [in] Interleaved multi-channel input samples
        CounterT *(&d_histogram)[NUM_ACTIVE_CHANNELS],  ///< [out] Per-channel histogram arrays
        int *num_levels,                                ///< [in] Per-channel number of level boundaries
        half_t *(&d_levels)[NUM_ACTIVE_CHANNELS],       ///< [in] Per-channel level-boundary arrays
        OffsetT num_row_pixels,                         ///< [in] Multi-channel pixels per row
        OffsetT num_rows,                               ///< [in] Rows in the region of interest
        OffsetT row_stride_bytes)                       ///< [in] Bytes between starts of consecutive rows
    {
        cudaError_t error = cudaSuccess;
        for (int i = 0; i < timing_timing_iterations; ++i)
        {
            error = DeviceHistogram::MultiHistogramRange<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
                d_temp_storage,
                temp_storage_bytes,
                reinterpret_cast<__half*>(d_samples),
                d_histogram,
                num_levels,
                reinterpret_cast<__half *(&)[NUM_ACTIVE_CHANNELS]>(d_levels),
                num_row_pixels,
                num_rows,
                row_stride_bytes);
        }
        return error;
    }
#endif

    /**
     * Dispatch to CUB multi histogram-even entrypoint
     */
    template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
    //CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Even(
        int timing_timing_iterations,                   ///< [in] Number of times to invoke the entrypoint
        size_t * /*d_temp_storage_bytes*/,              ///< Unused here (CDP-backend plumbing)
        cudaError_t * /*d_cdp_error*/,                  ///< Unused here (CDP-backend plumbing)
        void* d_temp_storage,                           ///< [in] Device temp storage (NULL to query size)
        size_t& temp_storage_bytes,                     ///< [in,out] Size of temp storage
        SampleIteratorT d_samples,                      ///< [in] Interleaved multi-channel input samples
        CounterT *(&d_histogram)[NUM_ACTIVE_CHANNELS],  ///< [out] Per-channel histogram arrays (num_levels[i] - 1 bins each)
        int *num_levels,                                ///< [in] Per-channel number of level boundaries (bins = num_levels[i] - 1)
        LevelT *lower_level,                            ///< [in] Per-channel lower sample bound (inclusive) of the lowest bin
        LevelT *upper_level,                            ///< [in] Per-channel upper sample bound (exclusive) of the highest bin
        OffsetT num_row_pixels,                         ///< [in] Multi-channel pixels per row
        OffsetT num_rows,                               ///< [in] Rows in the region of interest
        OffsetT row_stride_bytes)                       ///< [in] Bytes between starts of consecutive rows
    {
        cudaError_t error = cudaSuccess;
        for (int i = 0; i < timing_timing_iterations; ++i)
        {
            error = DeviceHistogram::MultiHistogramEven<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
                d_temp_storage,
                temp_storage_bytes,
                d_samples,
                d_histogram,
                num_levels,
                lower_level,
                upper_level,
                num_row_pixels,
                num_rows,
                row_stride_bytes);
        }
        return error;
    }

#if TEST_HALF_T
    /**
     * Dispatch to CUB multi histogram-even entrypoint — half_t overload.
     * Reinterprets the test's half_t wrapper as native __half (see Range
     * overload above).
     */
    template <typename CounterT, typename OffsetT>
    //CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Even(
        int timing_timing_iterations,                   ///< [in] Number of times to invoke the entrypoint
        size_t * /*d_temp_storage_bytes*/,              ///< Unused here (CDP-backend plumbing)
        cudaError_t * /*d_cdp_error*/,                  ///< Unused here (CDP-backend plumbing)
        void* d_temp_storage,                           ///< [in] Device temp storage (NULL to query size)
        size_t& temp_storage_bytes,                     ///< [in,out] Size of temp storage
        half_t *d_samples,                              ///< [in] Interleaved multi-channel input samples
        CounterT *(&d_histogram)[NUM_ACTIVE_CHANNELS],  ///< [out] Per-channel histogram arrays
        int *num_levels,                                ///< [in] Per-channel number of level boundaries
        half_t *lower_level,                            ///< [in] Per-channel lower sample bound (inclusive)
        half_t *upper_level,                            ///< [in] Per-channel upper sample bound (exclusive)
        OffsetT num_row_pixels,                         ///< [in] Multi-channel pixels per row
        OffsetT num_rows,                               ///< [in] Rows in the region of interest
        OffsetT row_stride_bytes)                       ///< [in] Bytes between starts of consecutive rows
    {
        cudaError_t error = cudaSuccess;
        for (int i = 0; i < timing_timing_iterations; ++i)
        {
            error = DeviceHistogram::MultiHistogramEven<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
                d_temp_storage,
                temp_storage_bytes,
                reinterpret_cast<__half*>(d_samples),
                d_histogram,
                num_levels,
                reinterpret_cast<__half*>(lower_level),
                reinterpret_cast<__half*>(upper_level),
                num_row_pixels,
                num_rows,
                row_stride_bytes);
        }
        return error;
    }
#endif
};
/**
 * Dispatch helpers for the single-channel DeviceHistogram entrypoints
 * (HistogramRange / HistogramEven). Each method invokes the entrypoint
 * timing_timing_iterations times and returns the error of the last
 * invocation.
 */
template <>
struct Dispatch<1, 1, CUB>
{
    /**
     * Dispatch to CUB single histogram-range entrypoint
     */
    template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
    //CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Range(
        int timing_timing_iterations,       ///< [in] Number of times to invoke the entrypoint
        size_t */*d_temp_storage_bytes*/,   ///< Unused here (CDP-backend plumbing)
        cudaError_t */*d_cdp_error*/,       ///< Unused here (CDP-backend plumbing)
        void* d_temp_storage,               ///< [in] Device temp storage (NULL to query size)
        size_t& temp_storage_bytes,         ///< [in,out] Size of temp storage
        SampleIteratorT d_samples,          ///< [in] Input sequence of data samples
        CounterT* (&d_histogram)[1],        ///< [out] Histogram counter array (num_levels[0] - 1 bins)
        int *num_levels,                    ///< [in] Number of level boundaries (bins = num_levels[0] - 1)
        LevelT (&d_levels)[1],              ///< [in] Level boundaries (lower inclusive, upper exclusive)
        OffsetT num_row_pixels,             ///< [in] Pixels per row
        OffsetT num_rows,                   ///< [in] Rows in the region of interest
        OffsetT row_stride_bytes)           ///< [in] Bytes between starts of consecutive rows
    {
        cudaError_t error = cudaSuccess;
        for (int i = 0; i < timing_timing_iterations; ++i)
        {
            error = DeviceHistogram::HistogramRange(
                d_temp_storage,
                temp_storage_bytes,
                d_samples,
                d_histogram[0],
                num_levels[0],
                d_levels[0],
                num_row_pixels,
                num_rows,
                row_stride_bytes);
        }
        return error;
    }

#if TEST_HALF_T
    // half_t overload: samples are reinterpreted as native __half
    template <typename CounterT, typename OffsetT>
    //CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Range(
        int timing_timing_iterations,       ///< [in] Number of times to invoke the entrypoint
        size_t */*d_temp_storage_bytes*/,   ///< Unused here (CDP-backend plumbing)
        cudaError_t */*d_cdp_error*/,       ///< Unused here (CDP-backend plumbing)
        void* d_temp_storage,               ///< [in] Device temp storage (NULL to query size)
        size_t& temp_storage_bytes,         ///< [in,out] Size of temp storage
        half_t *d_samples,                  ///< [in] Input sequence of data samples
        CounterT* (&d_histogram)[1],        ///< [out] Histogram counter array
        int *num_levels,                    ///< [in] Number of level boundaries
        half_t (&d_levels)[1],              ///< [in] Level boundaries
        OffsetT num_row_pixels,             ///< [in] Pixels per row
        OffsetT num_rows,                   ///< [in] Rows in the region of interest
        OffsetT row_stride_bytes)           ///< [in] Bytes between starts of consecutive rows
    {
        cudaError_t error = cudaSuccess;
        for (int i = 0; i < timing_timing_iterations; ++i)
        {
            // NOTE(review): unlike the multi-channel overload, d_levels[0] is
            // converted to an __half *by value* here rather than reinterpreted
            // as a pointer — confirm this matches the HistogramRange overload
            // this test intends to exercise.
            error = DeviceHistogram::HistogramRange(
                d_temp_storage,
                temp_storage_bytes,
                reinterpret_cast<__half*>(d_samples),
                d_histogram[0],
                num_levels[0],
                d_levels[0].operator __half(),
                num_row_pixels,
                num_rows,
                row_stride_bytes);
        }
        return error;
    }
#endif

    /**
     * Dispatch to CUB single histogram-even entrypoint
     */
    template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
    //CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Even(
        int timing_timing_iterations,       ///< [in] Number of times to invoke the entrypoint
        size_t */*d_temp_storage_bytes*/,   ///< Unused here (CDP-backend plumbing)
        cudaError_t */*d_cdp_error*/,       ///< Unused here (CDP-backend plumbing)
        void* d_temp_storage,               ///< [in] Device temp storage (NULL to query size)
        size_t& temp_storage_bytes,         ///< [in,out] Size of temp storage
        SampleIteratorT d_samples,          ///< [in] Input sequence of data samples
        CounterT* (&d_histogram)[1],        ///< [out] Histogram counter array (num_levels[0] - 1 bins)
        int *num_levels,                    ///< [in] Number of level boundaries (bins = num_levels[0] - 1)
        LevelT *lower_level,                ///< [in] Lower sample bound (inclusive) of the lowest bin
        LevelT *upper_level,                ///< [in] Upper sample bound (exclusive) of the highest bin
        OffsetT num_row_pixels,             ///< [in] Pixels per row
        OffsetT num_rows,                   ///< [in] Rows in the region of interest
        OffsetT row_stride_bytes)           ///< [in] Bytes between starts of consecutive rows
    {
        cudaError_t error = cudaSuccess;
        for (int i = 0; i < timing_timing_iterations; ++i)
        {
            error = DeviceHistogram::HistogramEven(
                d_temp_storage,
                temp_storage_bytes,
                d_samples,
                d_histogram[0],
                num_levels[0],
                lower_level[0],
                upper_level[0],
                num_row_pixels,
                num_rows,
                row_stride_bytes);
        }
        return error;
    }

#if TEST_HALF_T
    // half_t overload: samples reinterpreted as __half; bounds converted by value
    template <typename CounterT, typename OffsetT>
    //CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Even(
        int timing_timing_iterations,       ///< [in] Number of times to invoke the entrypoint
        size_t */*d_temp_storage_bytes*/,   ///< Unused here (CDP-backend plumbing)
        cudaError_t */*d_cdp_error*/,       ///< Unused here (CDP-backend plumbing)
        void* d_temp_storage,               ///< [in] Device temp storage (NULL to query size)
        size_t& temp_storage_bytes,         ///< [in,out] Size of temp storage
        half_t *d_samples,                  ///< [in] Input sequence of data samples
        CounterT* (&d_histogram)[1],        ///< [out] Histogram counter array
        int *num_levels,                    ///< [in] Number of level boundaries
        half_t *lower_level,                ///< [in] Lower sample bound (inclusive) of the lowest bin
        half_t *upper_level,                ///< [in] Upper sample bound (exclusive) of the highest bin
        OffsetT num_row_pixels,             ///< [in] Pixels per row
        OffsetT num_rows,                   ///< [in] Rows in the region of interest
        OffsetT row_stride_bytes)           ///< [in] Bytes between starts of consecutive rows
    {
        cudaError_t error = cudaSuccess;
        for (int i = 0; i < timing_timing_iterations; ++i)
        {
            error = DeviceHistogram::HistogramEven(
                d_temp_storage,
                temp_storage_bytes,
                reinterpret_cast<__half*>(d_samples),
                d_histogram[0],
                num_levels[0],
                lower_level[0].operator __half(),
                upper_level[0].operator __half(),
                num_row_pixels,
                num_rows,
                row_stride_bytes);
        }
        return error;
    }
#endif
};
//---------------------------------------------------------------------
// Test generation
//---------------------------------------------------------------------
// Searches for bin given a list of bin-boundary levels
// Maps samples to bin-ids by binary search over an explicit list of
// bin-boundary levels.
template <typename LevelT>
struct SearchTransform
{
    LevelT *levels;  // Pointer to the (sorted) level-boundary array
    int num_levels;  // Number of entries in `levels`

    // Convert a sample to its bin-id. A sample below the first boundary is
    // out of range and maps to num_levels; all other samples map to the index
    // of the last boundary that is <= the sample.
    template <typename SampleT>
    int operator()(SampleT sample)
    {
        const LevelT needle = (LevelT) sample;
        const LevelT *first_greater = std::upper_bound(levels, levels + num_levels, needle);
        const int bin = static_cast<int>(first_greater - levels) - 1;
        return (bin < 0) ? num_levels : bin;
    }
};
// Template to scale samples to evenly-spaced bins
// Primary template left undefined; specialized below for integral level
// types and for (extended) floating-point level types.
template <typename LevelT, typename = void>
struct ScaleTransform;
// [Integral types] Scales samples to evenly-spaced bins
// [Integral types] Scales samples to evenly-spaced bins
template <typename LevelT>
struct ScaleTransform<LevelT,
    typename ::cuda::std::enable_if<::cuda::std::is_integral<LevelT>::value>::type>
{
    int num_levels;  // Number of level boundaries
    LevelT max;      // Max sample level (exclusive)
    LevelT min;      // Min sample level (inclusive)

    // Record the binning parameters for subsequent operator() calls.
    void Init(int num_levels_,  // Number of level boundaries
              LevelT max_,      // Max sample level (exclusive)
              LevelT min_)      // Min sample level (inclusive)
    {
        num_levels = num_levels_;
        max        = max_;
        min        = min_;
    }

    // Convert a sample to its bin-id; out-of-range samples map to num_levels.
    template <typename SampleT>
    int operator()(SampleT sample)
    {
        const bool in_range = (sample >= min) && (sample < max);
        if (!in_range)
        {
            return num_levels;
        }
        // Accurate bin computation following the arithmetic we guarantee in the HistoEven docs
        return static_cast<int>(
            (static_cast<uint64_t>(sample - min) * static_cast<uint64_t>(num_levels - 1)) /
            static_cast<uint64_t>(max - min));
    }
};
// [[Extended] floating point types] Scales samples to evenly-spaced bins
// [[Extended] floating point types] Scales samples to evenly-spaced bins
template <typename LevelT>
struct ScaleTransform<LevelT,
    typename ::cuda::std::enable_if<::cuda::std::is_floating_point<LevelT>::value
#if TEST_HALF_T
                                    || ::cuda::std::is_same<LevelT, half_t>::value
#endif
                                    >::type>
{
    int num_levels;  // Number of level boundaries
    LevelT max;      // Max sample level (exclusive)
    LevelT min;      // Min sample level (inclusive)
    LevelT scale;    // Bin scaling factor (reciprocal of the bin width)

    // Record the binning parameters and precompute the bin-width reciprocal.
    void Init(int _num_levels,  // Number of level boundaries
              LevelT _max,      // Max sample level (exclusive)
              LevelT _min)      // Min sample level (inclusive)
    {
        this->num_levels = _num_levels;
        this->max = _max;
        this->min = _min;
        // scale = 1 / bin_width, where bin_width = (max - min) / (num_levels - 1)
        this->scale = LevelT{1.0f} /
                      static_cast<LevelT>((max - min) / static_cast<LevelT>(num_levels - 1));
    }

    // Functor for converting samples to bin-ids (num_levels is returned if sample is out of range)
    template <typename SampleT>
    int operator()(SampleT sample)
    {
        if ((sample < min) || (sample >= max))
        {
            // Sample out of range
            return num_levels;
        }
        // NOTE(review): the sample is narrowed to float before the subtraction
        // even when LevelT is double — presumably to mirror device-side
        // arithmetic; confirm whether full double precision is ever required
        // for this reference computation.
        return (int)((((float)sample) - min) * scale);
    }
};
/**
* Generate sample
*/
/**
 * Generate a single pseudo-random sample: draw 32 random bits (with the
 * requested entropy reduction), normalize them into [0, 1], and scale by
 * max_level before converting to the target sample type.
 */
template <typename T, typename LevelT>
void Sample(T &datum, LevelT max_level, int entropy_reduction)
{
    unsigned int random_bits;
    RandomBits(random_bits, entropy_reduction);  // RNG helper from test_util.h
    const unsigned int range_max = (unsigned int) -1;
    const float fraction = float(random_bits) / range_max;
    datum = (T) (fraction * max_level);
}
/**
* Initialize histogram samples
*/
/**
 * Fill the host sample buffer with random data for every active channel of
 * every pixel in the region of interest. Samples of inactive channels
 * (channel >= NUM_ACTIVE_CHANNELS) are left untouched. With g_verbose_input
 * set, each generated value is echoed to stdout.
 */
template <
    int NUM_CHANNELS,
    int NUM_ACTIVE_CHANNELS,
    typename LevelT,
    typename SampleT,
    typename OffsetT>
void InitializeSamples(
    LevelT max_level,         ///< [in] Max sample level used for generation
    int entropy_reduction,    ///< [in] Entropy-reduction factor for the RNG
    SampleT *h_samples,       ///< [out] Host sample buffer to fill
    OffsetT num_row_pixels,   ///< [in] Multi-channel pixels per row
    OffsetT num_rows,         ///< [in] Rows in the region of interest
    OffsetT row_stride_bytes) ///< [in] Bytes between starts of consecutive rows
{
    const OffsetT samples_per_row = row_stride_bytes / sizeof(SampleT);
    for (OffsetT row = 0; row < num_rows; ++row)
    {
        for (OffsetT pixel = 0; pixel < num_row_pixels; ++pixel)
        {
            for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
            {
                const OffsetT sample_idx = (row * samples_per_row) + (pixel * NUM_CHANNELS) + channel;
                Sample(h_samples[sample_idx], max_level, entropy_reduction);
                if (g_verbose_input)
                {
                    if (channel > 0)
                    {
                        printf(", ");
                    }
                    std::cout << CoutCast(h_samples[sample_idx]);
                }
            }
        }
    }
}
/**
* Initialize histogram solutions
*/
/**
 * Compute the host-side reference histograms: zero all bins, then walk every
 * active-channel sample in the region of interest, map it to a bin via the
 * per-channel transform functor, and count it when the bin is in range
 * (out-of-range samples — bin < 0 or bin >= num_levels - 1 — are dropped).
 */
template <
    int NUM_CHANNELS,
    int NUM_ACTIVE_CHANNELS,
    typename CounterT,
    typename SampleIteratorT,
    typename TransformOp,
    typename OffsetT>
void InitializeBins(
    SampleIteratorT h_samples,                     ///< [in] Host-side samples
    int num_levels[NUM_ACTIVE_CHANNELS],           ///< [in] Per-channel number of level boundaries (bins = num_levels[i] - 1)
    TransformOp transform_op[NUM_ACTIVE_CHANNELS], ///< [in] Per-channel sample-to-bin functors
    CounterT *h_histogram[NUM_ACTIVE_CHANNELS],    ///< [out] Per-channel histogram arrays to fill
    OffsetT num_row_pixels,                        ///< [in] Multi-channel pixels per row
    OffsetT num_rows,                              ///< [in] Rows in the region of interest
    OffsetT row_stride_bytes)                      ///< [in] Bytes between starts of consecutive rows
{
    using SampleT = cub::detail::value_t<SampleIteratorT>;

    // Zero all bins before counting
    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
    {
        const int bins = num_levels[channel] - 1;
        for (int bin = 0; bin < bins; ++bin)
        {
            h_histogram[channel][bin] = 0;
        }
    }

    if (g_verbose_input)
    {
        printf("Samples: \n");
    }
    for (OffsetT row = 0; row < num_rows; ++row)
    {
        for (OffsetT pixel = 0; pixel < num_row_pixels; ++pixel)
        {
            if (g_verbose_input)
            {
                printf("[");
            }
            for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
            {
                const OffsetT offset = (row * (row_stride_bytes / sizeof(SampleT))) + (pixel * NUM_CHANNELS) + channel;
                const int bin = transform_op[channel](h_samples[offset]);
                if (g_verbose_input)
                {
                    printf(" (%d)", bin);
                }
                // Flushed unconditionally (matching the original's behavior,
                // where the fflush was outside the verbose check)
                fflush(stdout);
                if ((bin >= 0) && (bin < num_levels[channel] - 1))
                {
                    h_histogram[channel][bin]++;  // valid bin
                }
            }
            if (g_verbose_input)
            {
                printf("]");
            }
        }
        if (g_verbose_input)
        {
            printf("\n\n");
        }
    }
}
/**
* Test histogram-even
*/
/**
 * Test histogram-even.
 *
 * Builds a host-side reference histogram, queries and allocates device temp
 * storage with "canary" zones bracketing it (to detect out-of-bounds writes
 * by the dispatch), runs a warmup/correctness iteration, verifies canaries
 * and per-channel results, and optionally times g_timing_iterations runs.
 *
 * Fixes vs. the previous revision:
 *  - the "FAIL" result line now ends with a newline (only "PASS" did before);
 *  - canary_zone (host new[]) is now freed (previously leaked);
 *  - removed the unused local `typedef SampleT Foo;`.
 */
template <
    Backend BACKEND,
    int NUM_CHANNELS,
    int NUM_ACTIVE_CHANNELS,
    typename SampleT,
    typename CounterT,
    typename LevelT,
    typename OffsetT,
    typename SampleIteratorT>
void TestEven(
    LevelT max_level,                        ///< [in] Max sample level used to generate input
    int entropy_reduction,                   ///< [in] Entropy-reduction factor of the input
    int num_levels[NUM_ACTIVE_CHANNELS],     ///< [in] Per-channel number of level boundaries (bins = num_levels[i] - 1)
    LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] Per-channel lower sample bound (inclusive) of the lowest bin
    LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] Per-channel upper sample bound (exclusive) of the highest bin
    OffsetT num_row_pixels,                  ///< [in] Multi-channel pixels per row
    OffsetT num_rows,                        ///< [in] Rows in the region of interest
    OffsetT row_stride_bytes,                ///< [in] Bytes between starts of consecutive rows
    SampleIteratorT h_samples,               ///< [in] Host-side samples (reference computation)
    SampleIteratorT d_samples)               ///< [in] Device-side samples (entrypoint input)
{
    OffsetT total_samples = num_rows * (row_stride_bytes / sizeof(SampleT));

    // Describe the test configuration
    printf("\n----------------------------\n");
    printf("%s cub::DeviceHistogram::Even (%s) "
           "%d pixels (%d height, %d width, %d-byte row stride), "
           "%d %d-byte %s samples (entropy reduction %d), "
           "%s levels, %s counters, %d/%d channels, max sample ",
           (BACKEND == CDP) ? "CDP CUB" : "CUB",
           (std::is_pointer<SampleIteratorT>::value) ? "pointer" : "iterator",
           (int) (num_row_pixels * num_rows),
           (int) num_rows,
           (int) num_row_pixels,
           (int) row_stride_bytes,
           (int) total_samples,
           (int) sizeof(SampleT),
           typeid(SampleT).name(),
           entropy_reduction,
           typeid(LevelT).name(),
           typeid(CounterT).name(),
           NUM_ACTIVE_CHANNELS,
           NUM_CHANNELS);
    std::cout << CoutCast(max_level) << "\n";
    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
    {
        std::cout << "\tChannel " << channel << ": "
                  << num_levels[channel] - 1 << " bins "
                  << "[" << lower_level[channel] << ", "
                  << upper_level[channel] << ")\n";
    }
    fflush(stdout);

    // Compute the host-side reference histograms
    CounterT* h_histogram[NUM_ACTIVE_CHANNELS];
    ScaleTransform<LevelT> transform_op[NUM_ACTIVE_CHANNELS];
    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
    {
        int bins = num_levels[channel] - 1;
        h_histogram[channel] = new CounterT[bins];
        transform_op[channel].Init(num_levels[channel], upper_level[channel], lower_level[channel]);
    }
    InitializeBins<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
        h_samples, num_levels, transform_op, h_histogram, num_row_pixels, num_rows, row_stride_bytes);

    // Allocate and zero the device histograms
    CounterT* d_histogram[NUM_ACTIVE_CHANNELS];
    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
    {
        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_histogram[channel], sizeof(CounterT) * (num_levels[channel] - 1)));
        CubDebugExit(cudaMemset(d_histogram[channel], 0, sizeof(CounterT) * (num_levels[channel] - 1)));
    }

    // Allocate CDP device arrays (part of the dispatch interface; unused by the CUB backend)
    size_t *d_temp_storage_bytes = NULL;
    cudaError_t *d_cdp_error = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes, sizeof(size_t) * 1));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error, sizeof(cudaError_t) * 1));

    // Query temporary storage requirements (NULL temp-storage pointer)
    void *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    Dispatch<NUM_ACTIVE_CHANNELS, NUM_CHANNELS, BACKEND>::Even(
        1, d_temp_storage_bytes, d_cdp_error,
        d_temp_storage, temp_storage_bytes,
        d_samples, d_histogram, num_levels, lower_level, upper_level,
        num_row_pixels, num_rows, row_stride_bytes);

    // Allocate temporary storage with "canary" zones on both sides so that
    // out-of-bounds writes into the temp allocation can be detected
    int canary_bytes = 256;
    char canary_token = 8;
    char* canary_zone = new char[canary_bytes];
    memset(canary_zone, canary_token, canary_bytes);
    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes + (canary_bytes * 2)));
    CubDebugExit(cudaMemset(d_temp_storage, canary_token, temp_storage_bytes + (canary_bytes * 2)));

    // Run warmup/correctness iteration (temp storage offset past the leading canary)
    Dispatch<NUM_ACTIVE_CHANNELS, NUM_CHANNELS, BACKEND>::Even(
        1, d_temp_storage_bytes, d_cdp_error,
        ((char *) d_temp_storage) + canary_bytes, temp_storage_bytes,
        d_samples, d_histogram, num_levels, lower_level, upper_level,
        num_row_pixels, num_rows, row_stride_bytes);

    // Check canary zones
    if (g_verbose)
    {
        printf("Checking leading temp_storage canary zone (token = %d)\n"
               "------------------------------------------------------\n",
               static_cast<int>(canary_token));
    }
    int error = CompareDeviceResults(canary_zone, (char *) d_temp_storage, canary_bytes, true, g_verbose);
    AssertEquals(0, error);
    if (g_verbose)
    {
        printf("Checking trailing temp_storage canary zone (token = %d)\n"
               "-------------------------------------------------------\n",
               static_cast<int>(canary_token));
    }
    error = CompareDeviceResults(canary_zone, ((char *) d_temp_storage) + canary_bytes + temp_storage_bytes, canary_bytes, true, g_verbose);
    AssertEquals(0, error);

    // Flush any stdout/stderr
    CubDebugExit(cudaPeekAtLastError());
    CubDebugExit(cudaDeviceSynchronize());
    fflush(stdout);
    fflush(stderr);

    // Check for correctness (and display results, if specified)
    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
    {
        if (g_verbose)
        {
            printf("Checking histogram result (channel = %d)\n"
                   "----------------------------------------\n",
                   channel);
        }
        int channel_error = CompareDeviceResults(h_histogram[channel], d_histogram[channel], num_levels[channel] - 1, true, g_verbose);
        // Fixed: the newline now applies to both outcomes (FAIL previously
        // had none, so the timing line was appended to it)
        printf("\tChannel %d %s\n", channel, channel_error ? "FAIL" : "PASS");
        error |= channel_error;
    }

    // Performance
    GpuTimer gpu_timer;
    gpu_timer.Start();
    Dispatch<NUM_ACTIVE_CHANNELS, NUM_CHANNELS, BACKEND>::Even(
        g_timing_iterations, d_temp_storage_bytes, d_cdp_error,
        ((char *) d_temp_storage) + canary_bytes, temp_storage_bytes,
        d_samples, d_histogram, num_levels, lower_level, upper_level,
        num_row_pixels, num_rows, row_stride_bytes);
    gpu_timer.Stop();
    float elapsed_millis = gpu_timer.ElapsedMillis();

    // Display performance
    if (g_timing_iterations > 0)
    {
        float avg_millis = elapsed_millis / g_timing_iterations;
        float giga_rate = float(total_samples) / avg_millis / 1000.0f / 1000.0f;
        float giga_bandwidth = giga_rate * sizeof(SampleT);
        printf("\t%.3f avg ms, %.3f billion samples/s, %.3f billion bins/s, %.3f billion pixels/s, %.3f logical GB/s",
               avg_millis,
               giga_rate,
               giga_rate * NUM_ACTIVE_CHANNELS / NUM_CHANNELS,
               giga_rate / NUM_CHANNELS,
               giga_bandwidth);
    }
    printf("\n\n");

    // Cleanup (fixed: canary_zone was previously leaked)
    delete[] canary_zone;
    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
    {
        if (h_histogram[channel])
            delete[] h_histogram[channel];
        if (d_histogram[channel])
            CubDebugExit(g_allocator.DeviceFree(d_histogram[channel]));
    }
    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));

    // Correctness asserts
    AssertEquals(0, error);
}
/**
 * Test histogram-even using device-resident sample data referenced through
 * native pointers.
 */
template <
    Backend     BACKEND,
    int         NUM_CHANNELS,
    int         NUM_ACTIVE_CHANNELS,
    typename    SampleT,
    typename    CounterT,
    typename    LevelT,
    typename    OffsetT>
void TestEvenNative(
    LevelT      max_level,                          ///< [in] Maximum sample level (used for sample generation)
    int         entropy_reduction,                  ///< [in] Entropy reduction factor for sample generation
    int         num_levels[NUM_ACTIVE_CHANNELS],    ///< [in] Number of level boundaries per active channel (bins for channel i = num_levels[i] - 1)
    LevelT      lower_level[NUM_ACTIVE_CHANNELS],   ///< [in] Inclusive lower bound of the lowest bin, per active channel
    LevelT      upper_level[NUM_ACTIVE_CHANNELS],   ///< [in] Exclusive upper bound of the highest bin, per active channel
    OffsetT     num_row_pixels,                     ///< [in] Multi-channel pixels per row in the region of interest
    OffsetT     num_rows,                           ///< [in] Rows in the region of interest
    OffsetT     row_stride_bytes)                   ///< [in] Bytes between starts of consecutive rows
{
    const OffsetT total_samples = num_rows * (row_stride_bytes / sizeof(SampleT));

    // Generate the host-side input samples
    typedef SampleT Foo;    // rename type to quelch gcc warnings (bug?)
    SampleT *h_samples = new Foo[total_samples];
    InitializeSamples<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
        max_level, entropy_reduction, h_samples, num_row_pixels, num_rows, row_stride_bytes);

    // Stage a device-side copy of the samples
    SampleT *d_samples = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**) &d_samples, sizeof(SampleT) * total_samples));
    CubDebugExit(cudaMemcpy(d_samples, h_samples, sizeof(SampleT) * total_samples, cudaMemcpyHostToDevice));

    // Run the pointer-based histogram-even test
    TestEven<BACKEND, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleT, CounterT, LevelT, OffsetT>(
        max_level, entropy_reduction, num_levels, lower_level, upper_level,
        num_row_pixels, num_rows, row_stride_bytes,
        h_samples, d_samples);

    // Release host and device buffers (delete[] of null is a no-op)
    delete[] h_samples;
    if (d_samples) CubDebugExit(g_allocator.DeviceFree(d_samples));
}
/**
 * Test histogram-even with iterator-provided input: samples come from a
 * ConstantInputIterator whose every element equals lower_level[0].
 */
template <
    Backend     BACKEND,
    int         NUM_CHANNELS,
    int         NUM_ACTIVE_CHANNELS,
    typename    SampleT,
    typename    CounterT,
    typename    LevelT,
    typename    OffsetT>
void TestEvenIterator(
    Int2Type<false> /*is_half*/,
    LevelT      max_level,                          ///< [in] Maximum sample level (used for sample generation)
    int         entropy_reduction,                  ///< [in] Entropy reduction factor for sample generation
    int         num_levels[NUM_ACTIVE_CHANNELS],    ///< [in] Number of level boundaries per active channel (bins for channel i = num_levels[i] - 1)
    LevelT      lower_level[NUM_ACTIVE_CHANNELS],   ///< [in] Inclusive lower bound of the lowest bin, per active channel
    LevelT      upper_level[NUM_ACTIVE_CHANNELS],   ///< [in] Exclusive upper bound of the highest bin, per active channel
    OffsetT     num_row_pixels,                     ///< [in] Multi-channel pixels per row in the region of interest
    OffsetT     num_rows,                           ///< [in] Rows in the region of interest
    OffsetT     row_stride_bytes)                   ///< [in] Bytes between starts of consecutive rows
{
    // The constant iterator yields the same value everywhere, so the same
    // iterator serves as both the "host" and "device" sample sequence.
    SampleT uniform_sample = (SampleT) lower_level[0];
    ConstantInputIterator<SampleT> sample_itr(uniform_sample);

    TestEven<BACKEND, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleT, CounterT, LevelT, OffsetT>(
        max_level, entropy_reduction, num_levels, lower_level, upper_level,
        num_row_pixels, num_rows, row_stride_bytes,
        sample_itr, sample_itr);
}
// Overload selected when SampleT is half_t (is_half == true): intentionally a
// no-op, because the iterator path cannot be exercised for half samples.
template <Backend,
int,
int NUM_ACTIVE_CHANNELS,
typename,
typename,
typename LevelT,
typename OffsetT>
void TestEvenIterator(Int2Type<true> /*is_half*/,
LevelT,
int,
int[NUM_ACTIVE_CHANNELS],
LevelT[NUM_ACTIVE_CHANNELS],
LevelT[NUM_ACTIVE_CHANNELS],
OffsetT,
OffsetT,
OffsetT)
{
// We have to reinterpret cast `half_t *` pointer to `__half *` in this test.
// Hence, iterators testing is not supported.
}
/**
 * Test histogram-range: initializes host/device sample data and per-channel
 * level boundaries, computes a host reference histogram, runs the device
 * histogram (with canary zones guarding the temporary storage), verifies
 * correctness, and reports timing.
 *
 * Fixes relative to the previous revision:
 *  - The per-channel PASS/FAIL line always terminates with a newline (the
 *    FAIL arm previously printed no newline before a potential assert abort).
 *  - The host-side canary_zone buffer is now freed (was leaked).
 */
template <
    Backend     BACKEND,
    int         NUM_CHANNELS,
    int         NUM_ACTIVE_CHANNELS,
    typename    SampleT,
    typename    CounterT,
    typename    LevelT,
    typename    OffsetT>
void TestRange(
    LevelT      max_level,                          ///< [in] Maximum sample level (used for sample generation)
    int         entropy_reduction,                  ///< [in] Entropy reduction factor for sample generation
    int         num_levels[NUM_ACTIVE_CHANNELS],    ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
    LevelT*     levels[NUM_ACTIVE_CHANNELS],        ///< [in] The lists of level boundaries (num_levels[i] entries) for each active channel
    OffsetT     num_row_pixels,                     ///< [in] The number of multi-channel pixels per row in the region of interest
    OffsetT     num_rows,                           ///< [in] The number of rows in the region of interest
    OffsetT     row_stride_bytes)                   ///< [in] The number of bytes between starts of consecutive rows in the region of interest
{
    OffsetT total_samples = num_rows * (row_stride_bytes / sizeof(SampleT));

    // Display the test configuration
    printf("\n----------------------------\n");
    printf("%s cub::DeviceHistogram::Range %d pixels "
           "(%d height, %d width, %d-byte row stride), "
           "%d %d-byte %s samples (entropy reduction %d), "
           "%s levels, %s counters, %d/%d channels, max sample ",
        (BACKEND == CDP) ? "CDP CUB" : "CUB",
        (int)(num_row_pixels * num_rows),
        (int)num_rows,
        (int)num_row_pixels,
        (int)row_stride_bytes,
        (int)total_samples,
        (int)sizeof(SampleT),
        typeid(SampleT).name(),
        entropy_reduction,
        typeid(LevelT).name(),
        typeid(CounterT).name(),
        NUM_ACTIVE_CHANNELS,
        NUM_CHANNELS);
    std::cout << CoutCast(max_level) << "\n";

    // Display per-channel bin counts (and the level boundaries when verbose)
    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
    {
        printf("Channel %d: %d bins", channel, num_levels[channel] - 1);
        if (g_verbose)
        {
            std::cout << "[ " << levels[channel][0];
            for (int level = 1; level < num_levels[channel]; ++level)
            {
                std::cout << ", " << levels[channel][level];
            }
            printf("]");
        }
        printf("\n");
    }
    fflush(stdout);

    // Allocate and initialize host and device data
    typedef SampleT Foo;        // rename type to quelch gcc warnings (bug?)
    SampleT* h_samples = new Foo[total_samples];
    CounterT* h_histogram[NUM_ACTIVE_CHANNELS];
    SearchTransform<LevelT> transform_op[NUM_ACTIVE_CHANNELS];
    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
    {
        // Bind each channel's level boundaries to its search transform
        transform_op[channel].levels = levels[channel];
        transform_op[channel].num_levels = num_levels[channel];

        int bins = num_levels[channel] - 1;
        h_histogram[channel] = new CounterT[bins];
    }
    InitializeSamples<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
        max_level, entropy_reduction, h_samples, num_row_pixels, num_rows, row_stride_bytes);

    // Compute the host reference histogram
    InitializeBins<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
        h_samples, num_levels, transform_op, h_histogram, num_row_pixels, num_rows, row_stride_bytes);

    // Allocate and initialize device data
    SampleT* d_samples = NULL;
    LevelT* d_levels[NUM_ACTIVE_CHANNELS];
    CounterT* d_histogram[NUM_ACTIVE_CHANNELS];
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_samples, sizeof(SampleT) * total_samples));
    CubDebugExit(cudaMemcpy(d_samples, h_samples, sizeof(SampleT) * total_samples, cudaMemcpyHostToDevice));
    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
    {
        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_levels[channel], sizeof(LevelT) * num_levels[channel]));
        CubDebugExit(cudaMemcpy(d_levels[channel], levels[channel], sizeof(LevelT) * num_levels[channel], cudaMemcpyHostToDevice));

        int bins = num_levels[channel] - 1;
        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_histogram[channel], sizeof(CounterT) * bins));
        CubDebugExit(cudaMemset(d_histogram[channel], 0, sizeof(CounterT) * bins));
    }

    // Allocate CDP device arrays
    size_t *d_temp_storage_bytes = NULL;
    cudaError_t *d_cdp_error = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes, sizeof(size_t) * 1));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error, sizeof(cudaError_t) * 1));

    // Query temporary storage requirements (d_temp_storage == NULL)
    void *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    Dispatch<NUM_ACTIVE_CHANNELS, NUM_CHANNELS, BACKEND>::Range(
        1, d_temp_storage_bytes, d_cdp_error,
        d_temp_storage, temp_storage_bytes,
        d_samples,
        d_histogram,
        num_levels, d_levels,
        num_row_pixels, num_rows, row_stride_bytes);

    // Allocate temporary storage with "canary" zones on either side to detect
    // out-of-bounds writes by the algorithm
    int canary_bytes = 256;
    char canary_token = 9;
    char* canary_zone = new char[canary_bytes];
    memset(canary_zone, canary_token, canary_bytes);
    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes + (canary_bytes * 2)));
    CubDebugExit(cudaMemset(d_temp_storage, canary_token, temp_storage_bytes + (canary_bytes * 2)));

    // Run warmup/correctness iteration
    Dispatch<NUM_ACTIVE_CHANNELS, NUM_CHANNELS, BACKEND>::Range(
        1, d_temp_storage_bytes, d_cdp_error,
        ((char *) d_temp_storage) + canary_bytes, temp_storage_bytes,
        d_samples,
        d_histogram,
        num_levels, d_levels,
        num_row_pixels, num_rows, row_stride_bytes);

    // Check that the canary zones (before and after the storage) are intact
    int error = CompareDeviceResults(canary_zone, (char *) d_temp_storage, canary_bytes, true, g_verbose);
    AssertEquals(0, error);
    error = CompareDeviceResults(canary_zone, ((char *) d_temp_storage) + canary_bytes + temp_storage_bytes, canary_bytes, true, g_verbose);
    AssertEquals(0, error);

    // Flush any stdout/stderr
    CubDebugExit(cudaPeekAtLastError());
    CubDebugExit(cudaDeviceSynchronize());
    fflush(stdout);
    fflush(stderr);

    // Check for correctness (and display results, if specified)
    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
    {
        int channel_error = CompareDeviceResults(h_histogram[channel], d_histogram[channel], num_levels[channel] - 1, true, g_verbose);
        // Newline lives in the format string so FAIL lines terminate too
        printf("\tChannel %d %s\n", channel, channel_error ? "FAIL" : "PASS");
        error |= channel_error;
    }

    // Performance
    GpuTimer gpu_timer;
    gpu_timer.Start();
    Dispatch<NUM_ACTIVE_CHANNELS, NUM_CHANNELS, BACKEND>::Range(
        g_timing_iterations, d_temp_storage_bytes, d_cdp_error,
        ((char *) d_temp_storage) + canary_bytes, temp_storage_bytes,
        d_samples,
        d_histogram,
        num_levels, d_levels,
        num_row_pixels, num_rows, row_stride_bytes);
    gpu_timer.Stop();
    float elapsed_millis = gpu_timer.ElapsedMillis();

    // Display performance
    if (g_timing_iterations > 0)
    {
        float avg_millis = elapsed_millis / g_timing_iterations;
        float giga_rate = float(total_samples) / avg_millis / 1000.0f / 1000.0f;
        float giga_bandwidth = giga_rate * sizeof(SampleT);
        printf("\t%.3f avg ms, %.3f billion samples/s, %.3f billion bins/s, %.3f billion pixels/s, %.3f logical GB/s",
            avg_millis,
            giga_rate,
            giga_rate * NUM_ACTIVE_CHANNELS / NUM_CHANNELS,
            giga_rate / NUM_CHANNELS,
            giga_bandwidth);
    }
    printf("\n\n");

    // Cleanup
    if (h_samples) delete[] h_samples;
    delete[] canary_zone;       // was previously leaked
    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
    {
        if (h_histogram[channel])
            delete[] h_histogram[channel];
        if (d_histogram[channel])
            CubDebugExit(g_allocator.DeviceFree(d_histogram[channel]));
        if (d_levels[channel])
            CubDebugExit(g_allocator.DeviceFree(d_levels[channel]));
    }
    if (d_samples) CubDebugExit(g_allocator.DeviceFree(d_samples));
    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));

    // Correctness asserts
    AssertEquals(0, error);
}
/**
 * Test histogram-even: derive per-channel [lower, upper) bounds centered on
 * half of max_level, then exercise both pointer-based and iterator-based
 * sample inputs.
 */
template <
    Backend     BACKEND,
    typename    SampleT,
    int         NUM_CHANNELS,
    int         NUM_ACTIVE_CHANNELS,
    typename    CounterT,
    typename    LevelT,
    typename    OffsetT>
void TestEven(
    OffsetT     num_row_pixels,                     ///< [in] Multi-channel pixels per row in the region of interest
    OffsetT     num_rows,                           ///< [in] Rows in the region of interest
    OffsetT     row_stride_bytes,                   ///< [in] Bytes between starts of consecutive rows
    int         entropy_reduction,                  ///< [in] Entropy reduction factor for sample generation
    int         num_levels[NUM_ACTIVE_CHANNELS],    ///< [in] Number of level boundaries per active channel
    LevelT      max_level,                          ///< [in] Maximum sample level
    int         max_num_levels)                     ///< [in] Largest entry in num_levels[]
{
    LevelT lower_level[NUM_ACTIVE_CHANNELS];
    LevelT upper_level[NUM_ACTIVE_CHANNELS];

    // Smallest bin width, implied by the channel with the most bins
    const int max_bins = max_num_levels - 1;
    const LevelT min_level_increment = max_level / static_cast<LevelT>(max_bins);

    // Center each channel's sample range on the midpoint of [0, max_level]
    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
    {
        const int num_bins = num_levels[channel] - 1;
        const LevelT half_extent = static_cast<LevelT>(num_bins) * min_level_increment;
        lower_level[channel] = static_cast<LevelT>((max_level - half_extent) / static_cast<LevelT>(2));
        upper_level[channel] = static_cast<LevelT>((max_level + half_extent) / static_cast<LevelT>(2));
    }

    // Pointer-based samples
    TestEvenNative<BACKEND, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleT, CounterT, LevelT, OffsetT>(
        max_level, entropy_reduction, num_levels, lower_level, upper_level, num_row_pixels, num_rows, row_stride_bytes);

    // Iterator-based samples (CUB backend only; skipped for half_t samples)
    TestEvenIterator<CUB, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleT, CounterT, LevelT, OffsetT>(
        Int2Type<std::is_same<SampleT, half_t>::value>{}, max_level, entropy_reduction, num_levels, lower_level, upper_level, num_row_pixels, num_rows, row_stride_bytes);
}
/**
 * Test histogram-range: construct evenly-spaced level boundaries for each
 * channel (centered on half of max_level) and run the range-binned test.
 */
template <
    Backend     BACKEND,
    typename    SampleT,
    int         NUM_CHANNELS,
    int         NUM_ACTIVE_CHANNELS,
    typename    CounterT,
    typename    LevelT,
    typename    OffsetT>
void TestRange(
    OffsetT     num_row_pixels,                     ///< [in] Multi-channel pixels per row in the region of interest
    OffsetT     num_rows,                           ///< [in] Rows in the region of interest
    OffsetT     row_stride_bytes,                   ///< [in] Bytes between starts of consecutive rows
    int         entropy_reduction,                  ///< [in] Entropy reduction factor for sample generation
    int         num_levels[NUM_ACTIVE_CHANNELS],    ///< [in] Number of level boundaries per active channel
    LevelT      max_level,                          ///< [in] Maximum sample level
    int         max_num_levels)                     ///< [in] Largest entry in num_levels[]
{
    // Smallest level increment, implied by the channel with the most bins
    int max_bins = max_num_levels - 1;
    LevelT min_level_increment = max_level / static_cast<LevelT>(max_bins);

    // Populate evenly-spaced boundaries for each channel
    LevelT* levels[NUM_ACTIVE_CHANNELS];
    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
    {
        levels[channel] = new LevelT[num_levels[channel]];

        int num_bins = num_levels[channel] - 1;
        LevelT lower_level = (max_level - static_cast<LevelT>(num_bins * min_level_increment)) / static_cast<LevelT>(2);

        for (int level = 0; level < num_levels[channel]; ++level)
        {
            levels[channel][level] = lower_level + static_cast<LevelT>(level * min_level_increment);
        }
    }

    TestRange<BACKEND, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleT, CounterT, LevelT, OffsetT>(
        max_level, entropy_reduction, num_levels, levels, num_row_pixels, num_rows, row_stride_bytes);

    // Release the boundary arrays
    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
    {
        delete[] levels[channel];
    }
}
/**
 * Exercise both histogram entrypoints (even-binned and range-binned) with the
 * same problem configuration.
 */
template <
    typename    SampleT,
    int         NUM_CHANNELS,
    int         NUM_ACTIVE_CHANNELS,
    typename    CounterT,
    typename    LevelT,
    typename    OffsetT>
void Test(
    OffsetT     num_row_pixels,
    OffsetT     num_rows,
    OffsetT     row_stride_bytes,
    int         entropy_reduction,
    int         num_levels[NUM_ACTIVE_CHANNELS],
    LevelT      max_level,
    int         max_num_levels)
{
    // Even-binned histogram
    TestEven<CUB, SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
        num_row_pixels, num_rows, row_stride_bytes, entropy_reduction,
        num_levels, max_level, max_num_levels);

    // Range-binned histogram
    TestRange<CUB, SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
        num_row_pixels, num_rows, row_stride_bytes, entropy_reduction,
        num_levels, max_level, max_num_levels);
}
/**
 * Test different numbers of levels per channel: channel 0 gets the maximum
 * level count, and each subsequent channel gets half the previous channel's
 * bin count (plus one boundary).
 */
template <
    typename    SampleT,
    int         NUM_CHANNELS,
    int         NUM_ACTIVE_CHANNELS,
    typename    CounterT,
    typename    LevelT,
    typename    OffsetT>
void Test(
    OffsetT     num_row_pixels,
    OffsetT     num_rows,
    OffsetT     row_stride_bytes,
    int         entropy_reduction,
    LevelT      max_level,
    int         max_num_levels)
{
    int num_levels[NUM_ACTIVE_CHANNELS];

    // All different levels: halve the count for each successive channel
    num_levels[0] = max_num_levels;
    for (int c = 1; c < NUM_ACTIVE_CHANNELS; ++c)
    {
        num_levels[c] = (num_levels[c - 1] / 2) + 1;
    }

    Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
        num_row_pixels, num_rows, row_stride_bytes, entropy_reduction,
        num_levels, max_level, max_num_levels);
}
/**
 * Test different entropy levels for sample generation.
 */
template <
    typename    SampleT,
    int         NUM_CHANNELS,
    int         NUM_ACTIVE_CHANNELS,
    typename    CounterT,
    typename    LevelT,
    typename    OffsetT>
void Test(
    OffsetT     num_row_pixels,
    OffsetT     num_rows,
    OffsetT     row_stride_bytes,
    LevelT      max_level,
    int         max_num_levels)
{
    // Sweep entropy settings: -1 forces all samples to zero; 0 and 5 are
    // progressively reduced-entropy distributions
    const int entropy_settings[] = {-1, 0, 5};
    for (int entropy_reduction : entropy_settings)
    {
        Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
            num_row_pixels, num_rows, row_stride_bytes, entropy_reduction,
            max_level, max_num_levels);
    }
}
/**
 * Test different row strides: tightly-packed rows and rows padded with 13
 * extra samples.
 */
template <
    typename    SampleT,
    int         NUM_CHANNELS,
    int         NUM_ACTIVE_CHANNELS,
    typename    CounterT,
    typename    LevelT,
    typename    OffsetT>
void Test(
    OffsetT     num_row_pixels,
    OffsetT     num_rows,
    LevelT      max_level,
    int         max_num_levels)
{
    const OffsetT packed_stride_bytes = num_row_pixels * NUM_CHANNELS * sizeof(SampleT);

    // No padding
    Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
        num_row_pixels, num_rows, packed_stride_bytes, max_level, max_num_levels);

    // 13 samples of padding per row
    Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
        num_row_pixels, num_rows, packed_stride_bytes + (13 * sizeof(SampleT)), max_level, max_num_levels);
}
/**
 * Test different problem sizes: degenerate (empty) images, tiny inputs, a
 * full-HD frame, and a sweep of aspect ratios.
 */
template <
    typename    SampleT,
    int         NUM_CHANNELS,
    int         NUM_ACTIVE_CHANNELS,
    typename    CounterT,
    typename    LevelT,
    typename    OffsetT>
void Test(
    LevelT      max_level,
    int         max_num_levels)
{
    // Images with zero rows and/or zero columns
    Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
        OffsetT(1920), OffsetT(0), max_level, max_num_levels);
    Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
        OffsetT(0), OffsetT(0), max_level, max_num_levels);

    // A single short row
    Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
        OffsetT(15), OffsetT(1), max_level, max_num_levels);

    // A 1080p frame
    Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
        OffsetT(1920), OffsetT(1080), max_level, max_num_levels);

    // Assorted aspect ratios (rows x cols bounded by ~1M pixels)
    for (OffsetT num_rows = 1; num_rows < 1000000; num_rows *= 1000)
    {
        for (OffsetT num_cols = 1; num_cols < (1000000 / num_rows); num_cols *= 1000)
        {
            Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
                num_cols, num_rows, max_level, max_num_levels);
        }
    }
}
/**
 * Test channel interleavings (valid specialization, baseline configurations
 * only — no extra NUM_CHANNELS == NUM_ACTIVE_CHANNELS cases)
 */
template <typename SampleT, typename CounterT, typename LevelT, typename OffsetT>
void TestChannels(LevelT max_level,
                  int max_num_levels,
                  Int2Type<true> /*is_valid_tag*/,
                  Int2Type<false> /*test_extra_channels*/)
{
    // Single-channel data, one active channel
    Test<SampleT, 1, 1, CounterT, LevelT, OffsetT>(max_level, max_num_levels);

    // Quad-channel data with three active channels (e.g. RGB within RGBA)
    Test<SampleT, 4, 3, CounterT, LevelT, OffsetT>(max_level, max_num_levels);
}
/**
 * Test channel interleavings (valid specialization, including the extra
 * fully-active configurations)
 */
template <typename SampleT, typename CounterT, typename LevelT, typename OffsetT>
void TestChannels(LevelT max_level,
                  int max_num_levels,
                  Int2Type<true> /*is_valid_tag*/,
                  Int2Type<true> /*test_extra_channels*/)
{
    // Single-channel data, one active channel
    Test<SampleT, 1, 1, CounterT, LevelT, OffsetT>(max_level, max_num_levels);

    // Quad-channel data with three active channels (e.g. RGB within RGBA)
    Test<SampleT, 4, 3, CounterT, LevelT, OffsetT>(max_level, max_num_levels);

    // Fully-active interleavings
    Test<SampleT, 3, 3, CounterT, LevelT, OffsetT>(max_level, max_num_levels);
    Test<SampleT, 4, 4, CounterT, LevelT, OffsetT>(max_level, max_num_levels);
}
// Catch-all overload for invalid specializations (is_valid_tag == false):
// compiles to a no-op so unsupported configurations are silently skipped.
template <typename SampleT,
typename CounterT,
typename LevelT,
typename OffsetT,
typename TestExtraChannels>
void TestChannels(LevelT /*max_level*/,
int /*max_num_levels*/,
Int2Type<false> /*is_valid_tag*/,
TestExtraChannels)
{}
// Verifies that HistogramRange tolerates the levels array aliasing the
// samples array: d_levels points at the same device buffer as d_samples,
// whose first num_levels entries are the level boundaries themselves.
void TestLevelsAliasing()
{
constexpr int num_levels = 7;
// num_levels - 1 bins, zero-initialized
int h_histogram[num_levels - 1]{};
// First num_levels entries double as the level boundaries; the rest are
// samples arranged so bin b receives b + 1 non-level samples
int h_samples[]{
0, 2, 4, 6, 8, 10, 12, // levels
1, // bin 0
3, 3, // bin 1
5, 5, 5, // bin 2
7, 7, 7, 7, // bin 3
9, 9, 9, 9, 9, // bin 4
11, 11, 11, 11, 11, 11 // bin 5
};
constexpr int num_samples = sizeof(h_samples) / sizeof(h_samples[0]);
int *d_histogram{};
int *d_samples{};
CubDebugExit(
g_allocator.DeviceAllocate((void **)&d_histogram, sizeof(h_histogram)));
CubDebugExit(
g_allocator.DeviceAllocate((void **)&d_samples, sizeof(h_samples)));
CubDebugExit(
cudaMemcpy(d_samples, h_samples, sizeof(h_samples), cudaMemcpyHostToDevice));
// Alias levels with samples (fancy way to `d_histogram[bin]++`).
int *d_levels = d_samples;
std::uint8_t *d_temp_storage{};
std::size_t temp_storage_bytes{};
// First call (d_temp_storage == NULL) only queries temp storage size
CubDebugExit(cub::DeviceHistogram::HistogramRange(d_temp_storage,
temp_storage_bytes,
d_samples,
d_histogram,
num_levels,
d_levels,
num_samples));
CubDebugExit(
g_allocator.DeviceAllocate((void **)&d_temp_storage, temp_storage_bytes));
// Second call performs the actual histogram
CubDebugExit(cub::DeviceHistogram::HistogramRange(d_temp_storage,
temp_storage_bytes,
d_samples,
d_histogram,
num_levels,
d_levels,
num_samples));
CubDebugExit(cudaMemcpy(h_histogram,
d_histogram,
sizeof(h_histogram),
cudaMemcpyDeviceToHost));
for (int bin = 0; bin < num_levels - 1; bin++)
{
// Each bin should contain `bin + 1` samples. Since samples also contain
// levels, they contribute one extra item to each bin.
AssertEquals(bin + 2, h_histogram[bin]);
}
CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
CubDebugExit(g_allocator.DeviceFree(d_histogram));
// Freeing d_levels also releases d_samples (they alias the same buffer)
CubDebugExit(g_allocator.DeviceFree(d_levels));
}
// Regression test for NVIDIA/cub#489: integer rounding errors lead to incorrect
// bin detection:
void TestIntegerBinCalcs()
{
constexpr int num_levels = 8;
constexpr int num_bins = num_levels - 1;
// Expected counts over [0, 12) split into 7 even bins; the sample 999 lies
// outside [lower_level, upper_level) and must not be counted
int h_histogram[num_bins]{};
const int h_histogram_ref[num_bins]{1, 5, 0, 2, 1, 0, 0};
const int h_samples[]{2, 6, 7, 2, 3, 0, 2, 2, 6, 999};
const int lower_level = 0;
const int upper_level = 12;
constexpr int num_samples = sizeof(h_samples) / sizeof(h_samples[0]);
int *d_histogram{};
int *d_samples{};
CubDebugExit(
g_allocator.DeviceAllocate((void **)&d_histogram, sizeof(h_histogram)));
CubDebugExit(
g_allocator.DeviceAllocate((void **)&d_samples, sizeof(h_samples)));
CubDebugExit(
cudaMemcpy(d_samples, h_samples, sizeof(h_samples), cudaMemcpyHostToDevice));
std::uint8_t *d_temp_storage{};
std::size_t temp_storage_bytes{};
// First call (d_temp_storage == NULL) only queries temp storage size
CubDebugExit(cub::DeviceHistogram::HistogramEven(d_temp_storage,
temp_storage_bytes,
d_samples,
d_histogram,
num_levels,
lower_level,
upper_level,
num_samples));
CubDebugExit(
g_allocator.DeviceAllocate((void **)&d_temp_storage, temp_storage_bytes));
// Second call performs the actual histogram
CubDebugExit(cub::DeviceHistogram::HistogramEven(d_temp_storage,
temp_storage_bytes,
d_samples,
d_histogram,
num_levels,
lower_level,
upper_level,
num_samples));
CubDebugExit(cudaMemcpy(h_histogram,
d_histogram,
sizeof(h_histogram),
cudaMemcpyDeviceToHost));
for (int bin = 0; bin < num_bins; ++bin)
{
AssertEquals(h_histogram_ref[bin], h_histogram[bin]);
}
CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
CubDebugExit(g_allocator.DeviceFree(d_histogram));
CubDebugExit(g_allocator.DeviceFree(d_samples));
}
/**
 * @brief Our bin computation for HistogramEven is guaranteed only for when (max_level - min_level)
 * * num_bins does not overflow when using uint64_t arithmetic. In case bin computation could
 * overflow, we expect cudaErrorInvalidValue to be returned.
 */
template<typename SampleT>
void TestOverflow()
{
using CounterT = uint32_t;
constexpr std::size_t test_cases = 2;
// Test data common across tests
SampleT lower_level = 0;
// Use the full range of SampleT so (upper - lower) * num_bins is maximal
SampleT upper_level = ::cuda::std::numeric_limits<SampleT>::max();
thrust::counting_iterator<SampleT> d_samples{0UL};
thrust::device_vector<CounterT> d_histo_out(1024);
CounterT *d_histogram = thrust::raw_pointer_cast(d_histo_out.data());
int num_samples = 1000;
// Prepare per-test specific data
// Sentinel value: if temp_storage_bytes still equals this after the size
// query, the API failed to initialize it
constexpr std::size_t canary_bytes = 3;
std::array<std::size_t, test_cases> temp_storage_bytes{canary_bytes, canary_bytes};
std::array<int, test_cases> num_bins{1, 2};
// Since test #1 is just a single bin, we expect it to succeed
// Since we promote up to 64-bit integer arithmetic we expect tests to not overflow for types of
// up to 4 bytes. For 64-bit and wider types, we do not perform further promotion to even wider
// types, hence we expect cudaErrorInvalidValue to be returned to indicate of a potential overflow
std::array<cudaError_t, test_cases> expected_status{
cudaSuccess,
sizeof(SampleT) <= 4UL ? cudaSuccess : cudaErrorInvalidValue};
// Verify we always initializes temp_storage_bytes
cudaError_t error{cudaSuccess};
for (std::size_t i = 0; i < test_cases; i++)
{
// Size-query invocation (d_temp_storage == nullptr); the returned status
// is deliberately not asserted here — only the size initialization is
error = cub::DeviceHistogram::HistogramEven(nullptr,
temp_storage_bytes[i],
d_samples,
d_histogram,
num_bins[i] + 1,
lower_level,
upper_level,
num_samples);
// Ensure that temp_storage_bytes has been initialized even in the presence of error
AssertTrue(temp_storage_bytes[i] != canary_bytes);
}
// Allocate sufficient temporary storage
thrust::device_vector<std::uint8_t> temp_storage(
std::max(temp_storage_bytes[0], temp_storage_bytes[1]));
for (std::size_t i = 0; i < test_cases; i++)
{
error = cub::DeviceHistogram::HistogramEven(thrust::raw_pointer_cast(temp_storage.data()),
temp_storage_bytes[i],
d_samples,
d_histogram,
num_bins[i] + 1,
lower_level,
upper_level,
num_samples);
// Ensure we do not return an error on querying temporary storage requirements
AssertEquals(error, expected_status[i]);
}
}
//---------------------------------------------------------------------
// Main
//---------------------------------------------------------------------
/**
 * Main: parses command-line flags, initializes the device, and runs the
 * overflow/aliasing regression tests plus the channel-interleaving sweeps
 * over several sample/level/counter type combinations.
 */
int main(int argc, char** argv)
{
// Initialize command line
CommandLineArgs args(argc, argv);
g_verbose = args.CheckCmdLineFlag("v");
g_verbose_input = args.CheckCmdLineFlag("v2");
args.GetCmdLineArgument("i", g_timing_iterations);
// Print usage
if (args.CheckCmdLineFlag("help"))
{
printf("%s "
"[--i=<timing iterations>] "
"[--device=<device-id>] "
"[--v] "
"[--v2] "
"\n", argv[0]);
exit(0);
}
// Initialize device
CubDebugExit(args.DeviceInit());
// Overflow checks for progressively wider sample types
TestOverflow<uint8_t>();
TestOverflow<uint16_t>();
TestOverflow<uint32_t>();
TestOverflow<uint64_t>();
// Tag types for selecting TestChannels overloads
using true_t = Int2Type<true>;
using false_t = Int2Type<false>;
TestLevelsAliasing();
TestIntegerBinCalcs(); // regression test for NVIDIA/cub#489
#if TEST_HALF_T
TestChannels<half_t, int, half_t, int>(256, 256 + 1, true_t{}, true_t{});
#endif
TestChannels <signed char, int, int, int>(256, 256 + 1, true_t{}, true_t{});
TestChannels <unsigned short, int, int, int>(8192, 8192 + 1, true_t{}, false_t{});
// Make sure bin computation works fine when using int32 arithmetic
TestChannels <unsigned short, int, unsigned short, int>(std::numeric_limits<unsigned short>::max(), std::numeric_limits<unsigned short>::max() + 1, true_t{}, false_t{});
// Make sure bin computation works fine when requiring int64 arithmetic
TestChannels <unsigned int, int, unsigned int, int>(std::numeric_limits<unsigned int>::max(), 8192 + 1, true_t{}, false_t{});
#if !defined(__ICC)
// Fails with ICC for unknown reasons, see #332.
TestChannels <float, int, float, int>(1.0, 256 + 1, true_t{}, false_t{});
#endif
// float samples, int levels, regression test for NVIDIA/cub#479.
TestChannels <float, int, int, int>(12, 7, true_t{}, true_t{});
// Test down-conversion of size_t offsets to int
// (only runs when size_t is wider than int; otherwise the false_t overload
// of TestChannels turns it into a no-op)
TestChannels <unsigned char, int, int, long long>(256, 256 + 1, Int2Type<(sizeof(size_t) != sizeof(int))>{}, false_t{});
return 0;
}