/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Test of DeviceHistogram utilities ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include #include #include #include #include #include #include #include #include #include "test_util.h" #define TEST_HALF_T !_NVHPC_CUDA #if TEST_HALF_T #include #endif using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- // Dispatch types enum Backend { CUB, // CUB method CDP, // GPU-based (dynamic parallelism) dispatch to CUB method }; bool g_verbose_input = false; bool g_verbose = false; int g_timing_iterations = 0; CachingDeviceAllocator g_allocator(true); //--------------------------------------------------------------------- // Dispatch to different DeviceHistogram entrypoints //--------------------------------------------------------------------- template struct Dispatch; template struct Dispatch { /** * Dispatch to CUB multi histogram-range entrypoint */ template //CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Range( int timing_timing_iterations, size_t * /*d_temp_storage_bytes*/, cudaError_t * /*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). CounterT *(&d_histogram)[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_levels[i] - 1. int *num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. LevelT *(&d_levels)[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest OffsetT num_rows, ///< [in] The number of rows in the region of interest OffsetT row_stride_bytes) ///< [in] The number of bytes between starts of consecutive rows in the region of interest { cudaError_t error = cudaSuccess; for (int i = 0; i < timing_timing_iterations; ++i) { error = DeviceHistogram::MultiHistogramRange( d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, row_stride_bytes); } return error; } #if TEST_HALF_T /** * Dispatch to CUB multi histogram-range entrypoint */ template //CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Range( int timing_timing_iterations, size_t * /*d_temp_storage_bytes*/, cudaError_t * /*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, half_t *d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). CounterT *(&d_histogram)[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_levels[i] - 1. int *num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. half_t *(&d_levels)[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest OffsetT num_rows, ///< [in] The number of rows in the region of interest OffsetT row_stride_bytes) ///< [in] The number of bytes between starts of consecutive rows in the region of interest { cudaError_t error = cudaSuccess; for (int i = 0; i < timing_timing_iterations; ++i) { error = DeviceHistogram::MultiHistogramRange( d_temp_storage, temp_storage_bytes, reinterpret_cast<__half*>(d_samples), d_histogram, num_levels, reinterpret_cast<__half *(&)[NUM_ACTIVE_CHANNELS]>(d_levels), num_row_pixels, num_rows, row_stride_bytes); } return error; } #endif /** * Dispatch to CUB multi histogram-even entrypoint */ template //CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Even( int timing_timing_iterations, size_t * /*d_temp_storage_bytes*/, cudaError_t * /*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). CounterT *(&d_histogram)[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_levels[i] - 1. int *num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. LevelT *lower_level, ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. LevelT *upper_level, ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest OffsetT num_rows, ///< [in] The number of rows in the region of interest OffsetT row_stride_bytes) ///< [in] The number of bytes between starts of consecutive rows in the region of interest { cudaError_t error = cudaSuccess; for (int i = 0; i < timing_timing_iterations; ++i) { error = DeviceHistogram::MultiHistogramEven( d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level, num_row_pixels, num_rows, row_stride_bytes); } return error; } #if TEST_HALF_T /** * Dispatch to CUB multi histogram-even entrypoint */ template //CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Even( int timing_timing_iterations, size_t * /*d_temp_storage_bytes*/, cudaError_t * /*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, half_t *d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). CounterT *(&d_histogram)[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_levels[i] - 1. int *num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. half_t *lower_level, ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. half_t *upper_level, ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest OffsetT num_rows, ///< [in] The number of rows in the region of interest OffsetT row_stride_bytes) ///< [in] The number of bytes between starts of consecutive rows in the region of interest { cudaError_t error = cudaSuccess; for (int i = 0; i < timing_timing_iterations; ++i) { error = DeviceHistogram::MultiHistogramEven( d_temp_storage, temp_storage_bytes, reinterpret_cast<__half*>(d_samples), d_histogram, num_levels, reinterpret_cast<__half*>(lower_level), reinterpret_cast<__half*>(upper_level), num_row_pixels, num_rows, row_stride_bytes); } return error; } #endif }; template <> struct Dispatch<1, 1, CUB> { /** * Dispatch to CUB single histogram-range entrypoint */ template //CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Range( int timing_timing_iterations, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). CounterT* (&d_histogram)[1], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_levels[i] - 1. int *num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. LevelT (&d_levels)[1], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest OffsetT num_rows, ///< [in] The number of rows in the region of interest OffsetT row_stride_bytes) ///< [in] The number of bytes between starts of consecutive rows in the region of interest { cudaError_t error = cudaSuccess; for (int i = 0; i < timing_timing_iterations; ++i) { error = DeviceHistogram::HistogramRange( d_temp_storage, temp_storage_bytes, d_samples, d_histogram[0], num_levels[0], d_levels[0], num_row_pixels, num_rows, row_stride_bytes); } return error; } #if TEST_HALF_T template //CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Range( int timing_timing_iterations, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, half_t *d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). CounterT* (&d_histogram)[1], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_levels[i] - 1. int *num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. half_t (&d_levels)[1], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest OffsetT num_rows, ///< [in] The number of rows in the region of interest OffsetT row_stride_bytes) ///< [in] The number of bytes between starts of consecutive rows in the region of interest { cudaError_t error = cudaSuccess; for (int i = 0; i < timing_timing_iterations; ++i) { error = DeviceHistogram::HistogramRange( d_temp_storage, temp_storage_bytes, reinterpret_cast<__half*>(d_samples), d_histogram[0], num_levels[0], d_levels[0].operator __half(), num_row_pixels, num_rows, row_stride_bytes); } return error; } #endif /** * Dispatch to CUB single histogram-even entrypoint */ template //CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Even( int timing_timing_iterations, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). CounterT* (&d_histogram)[1], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_levels[i] - 1. int *num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. LevelT *lower_level, ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. LevelT *upper_level, ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest OffsetT num_rows, ///< [in] The number of rows in the region of interest OffsetT row_stride_bytes) ///< [in] The number of bytes between starts of consecutive rows in the region of interest { cudaError_t error = cudaSuccess; for (int i = 0; i < timing_timing_iterations; ++i) { error = DeviceHistogram::HistogramEven( d_temp_storage, temp_storage_bytes, d_samples, d_histogram[0], num_levels[0], lower_level[0], upper_level[0], num_row_pixels, num_rows, row_stride_bytes); } return error; } #if TEST_HALF_T template //CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Even( int timing_timing_iterations, size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, void* d_temp_storage, size_t& temp_storage_bytes, half_t *d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). CounterT* (&d_histogram)[1], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_levels[i] - 1. int *num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. half_t *lower_level, ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. half_t *upper_level, ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest OffsetT num_rows, ///< [in] The number of rows in the region of interest OffsetT row_stride_bytes) ///< [in] The number of bytes between starts of consecutive rows in the region of interest { cudaError_t error = cudaSuccess; for (int i = 0; i < timing_timing_iterations; ++i) { error = DeviceHistogram::HistogramEven( d_temp_storage, temp_storage_bytes, reinterpret_cast<__half*>(d_samples), d_histogram[0], num_levels[0], lower_level[0].operator __half(), upper_level[0].operator __half(), num_row_pixels, num_rows, row_stride_bytes); } return error; } #endif }; //--------------------------------------------------------------------- // Test generation //--------------------------------------------------------------------- // Searches for bin given a list of bin-boundary levels template struct SearchTransform { LevelT *levels; // Pointer to levels array int num_levels; // Number of levels in array // Functor for converting samples to bin-ids (num_levels is returned if sample is out of range) template int operator()(SampleT sample) { int bin = int(std::upper_bound(levels, levels + num_levels, (LevelT) sample) - levels - 1); if (bin < 0) { // Sample out of range return num_levels; } return bin; } }; // Template to scale samples to evenly-spaced bins template struct ScaleTransform; // [Integral types] Scales samples to evenly-spaced bins template struct ScaleTransform::value>::type> { int num_levels; // Number of levels in array LevelT max; // Max sample level (exclusive) LevelT min; // Min sample level (inclusive) void Init(int num_levels_, // Number of levels in array LevelT max_, // Max sample level (exclusive) LevelT min_) // Min sample level (inclusive) { this->num_levels = num_levels_; this->max = max_; this->min = min_; } // Functor for converting samples to bin-ids (num_levels is returned if sample is out of range) template int operator()(SampleT sample) { if ((sample < min) || (sample >= max)) { // Sample out of range return num_levels; } // Accurate bin computation following the arithmetic we guarantee in the HistoEven docs return static_cast( (static_cast(sample - min) * static_cast(num_levels - 1)) / static_cast(max - min)); } }; // [[Extended] floating point types] Scales samples to evenly-spaced bins template struct ScaleTransform::value #if TEST_HALF_T || ::cuda::std::is_same::value #endif >::type> { int num_levels; // Number of levels in array LevelT max; // Max sample level (exclusive) LevelT min; // Min sample level (inclusive) LevelT scale; // Bin scaling factor void Init(int _num_levels, // Number of levels in array LevelT _max, // Max sample level (exclusive) LevelT _min) // Min sample level (inclusive) { this->num_levels = _num_levels; this->max = _max; this->min = _min; this->scale = LevelT{1.0f} / static_cast((max - min) / static_cast(num_levels - 1)); } // Functor for converting samples to bin-ids (num_levels is returned if sample is out of range) template int operator()(SampleT sample) { if ((sample < min) || (sample >= max)) { // Sample out of range return num_levels; } return (int)((((float)sample) - min) * scale); } }; /** * Generate sample */ template void Sample(T &datum, LevelT max_level, int entropy_reduction) { unsigned int max = (unsigned int) -1; unsigned int bits; RandomBits(bits, entropy_reduction); float fraction = (float(bits) / max); datum = (T) (fraction * max_level); } /** * Initialize histogram samples */ template < int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename LevelT, typename SampleT, typename OffsetT> void InitializeSamples( LevelT max_level, int entropy_reduction, SampleT *h_samples, OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest OffsetT num_rows, ///< [in] The number of rows in the region of interest OffsetT row_stride_bytes) ///< [in] The number of bytes between starts of consecutive rows in the region of interest { // Initialize samples for (OffsetT row = 0; row < num_rows; ++row) { for (OffsetT pixel = 0; pixel < num_row_pixels; ++pixel) { for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) { // Sample offset OffsetT offset = (row * (row_stride_bytes / sizeof(SampleT))) + (pixel * NUM_CHANNELS) + channel; // Init sample value Sample(h_samples[offset], max_level, entropy_reduction); if (g_verbose_input) { if (channel > 0) printf(", "); std::cout << CoutCast(h_samples[offset]); } } } } } /** * Initialize histogram solutions */ template < int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename CounterT, typename SampleIteratorT, typename TransformOp, typename OffsetT> void InitializeBins( SampleIteratorT h_samples, int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. TransformOp transform_op[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. CounterT *h_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_levels[i] - 1. OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest OffsetT num_rows, ///< [in] The number of rows in the region of interest OffsetT row_stride_bytes) ///< [in] The number of bytes between starts of consecutive rows in the region of interest { using SampleT = cub::detail::value_t; // Init bins for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) { for (int bin = 0; bin < num_levels[CHANNEL] - 1; ++bin) { h_histogram[CHANNEL][bin] = 0; } } // Initialize samples if (g_verbose_input) printf("Samples: \n"); for (OffsetT row = 0; row < num_rows; ++row) { for (OffsetT pixel = 0; pixel < num_row_pixels; ++pixel) { if (g_verbose_input) printf("["); for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) { // Sample offset OffsetT offset = (row * (row_stride_bytes / sizeof(SampleT))) + (pixel * NUM_CHANNELS) + channel; // Update sample bin int bin = transform_op[channel](h_samples[offset]); if (g_verbose_input) printf(" (%d)", bin); fflush(stdout); if ((bin >= 0) && (bin < num_levels[channel] - 1)) { // valid bin h_histogram[channel][bin]++; } } if (g_verbose_input) printf("]"); } if (g_verbose_input) printf("\n\n"); } } /** * Test histogram-even */ template < Backend BACKEND, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleT, typename CounterT, typename LevelT, typename OffsetT, typename SampleIteratorT> void TestEven( LevelT max_level, int entropy_reduction, int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest OffsetT num_rows, ///< [in] The number of rows in the region of interest OffsetT row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest SampleIteratorT h_samples, SampleIteratorT d_samples) { OffsetT total_samples = num_rows * (row_stride_bytes / sizeof(SampleT)); printf("\n----------------------------\n"); printf("%s cub::DeviceHistogram::Even (%s) " "%d pixels (%d height, %d width, %d-byte row stride), " "%d %d-byte %s samples (entropy reduction %d), " "%s levels, %s counters, %d/%d channels, max sample ", (BACKEND == CDP) ? "CDP CUB" : "CUB", (std::is_pointer::value) ? "pointer" : "iterator", (int) (num_row_pixels * num_rows), (int) num_rows, (int) num_row_pixels, (int) row_stride_bytes, (int) total_samples, (int) sizeof(SampleT), typeid(SampleT).name(), entropy_reduction, typeid(LevelT).name(), typeid(CounterT).name(), NUM_ACTIVE_CHANNELS, NUM_CHANNELS); std::cout << CoutCast(max_level) << "\n"; for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) { std::cout << "\tChannel " << channel << ": " << num_levels[channel] - 1 << " bins " << "[" << lower_level[channel] << ", " << upper_level[channel] << ")\n"; } fflush(stdout); // Allocate and initialize host and device data typedef SampleT Foo; // rename type to quelch gcc warnings (bug?) CounterT* h_histogram[NUM_ACTIVE_CHANNELS]; ScaleTransform transform_op[NUM_ACTIVE_CHANNELS]; for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) { int bins = num_levels[channel] - 1; h_histogram[channel] = new CounterT[bins]; transform_op[channel].Init(num_levels[channel], upper_level[channel], lower_level[channel]); } InitializeBins( h_samples, num_levels, transform_op, h_histogram, num_row_pixels, num_rows, row_stride_bytes); // Allocate and initialize device data CounterT* d_histogram[NUM_ACTIVE_CHANNELS]; for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) { CubDebugExit(g_allocator.DeviceAllocate((void**)&d_histogram[channel], sizeof(CounterT) * (num_levels[channel] - 1))); CubDebugExit(cudaMemset(d_histogram[channel], 0, sizeof(CounterT) * (num_levels[channel] - 1))); } // Allocate CDP device arrays size_t *d_temp_storage_bytes = NULL; cudaError_t *d_cdp_error = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes, sizeof(size_t) * 1)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error, sizeof(cudaError_t) * 1)); // Allocate temporary storage void *d_temp_storage = NULL; size_t temp_storage_bytes = 0; Dispatch::Even( 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level, num_row_pixels, num_rows, row_stride_bytes); // Allocate temporary storage with "canary" zones int canary_bytes = 256; char canary_token = 8; char* canary_zone = new char[canary_bytes]; memset(canary_zone, canary_token, canary_bytes); CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes + (canary_bytes * 2))); CubDebugExit(cudaMemset(d_temp_storage, canary_token, temp_storage_bytes + (canary_bytes * 2))); // Run warmup/correctness iteration Dispatch::Even( 1, d_temp_storage_bytes, d_cdp_error, ((char *) d_temp_storage) + canary_bytes, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level, num_row_pixels, num_rows, row_stride_bytes); // Check canary zones if (g_verbose) { printf("Checking leading temp_storage canary zone (token = %d)\n" "------------------------------------------------------\n", static_cast(canary_token)); } int error = CompareDeviceResults(canary_zone, (char *) d_temp_storage, canary_bytes, true, g_verbose); AssertEquals(0, error); if (g_verbose) { printf("Checking trailing temp_storage canary zone (token = %d)\n" "-------------------------------------------------------\n", static_cast(canary_token)); } error = CompareDeviceResults(canary_zone, ((char *) d_temp_storage) + canary_bytes + temp_storage_bytes, canary_bytes, true, g_verbose); AssertEquals(0, error); // Flush any stdout/stderr CubDebugExit(cudaPeekAtLastError()); CubDebugExit(cudaDeviceSynchronize()); fflush(stdout); fflush(stderr); // Check for correctness (and display results, if specified) for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) { if (g_verbose) { printf("Checking histogram result (channel = %d)\n" "----------------------------------------\n", channel); } int channel_error = CompareDeviceResults(h_histogram[channel], d_histogram[channel], num_levels[channel] - 1, true, g_verbose); printf("\tChannel %d %s", channel, channel_error ? "FAIL" : "PASS\n"); error |= channel_error; } // Performance GpuTimer gpu_timer; gpu_timer.Start(); Dispatch::Even( g_timing_iterations, d_temp_storage_bytes, d_cdp_error, ((char *) d_temp_storage) + canary_bytes, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level, num_row_pixels, num_rows, row_stride_bytes); gpu_timer.Stop(); float elapsed_millis = gpu_timer.ElapsedMillis(); // Display performance if (g_timing_iterations > 0) { float avg_millis = elapsed_millis / g_timing_iterations; float giga_rate = float(total_samples) / avg_millis / 1000.0f / 1000.0f; float giga_bandwidth = giga_rate * sizeof(SampleT); printf("\t%.3f avg ms, %.3f billion samples/s, %.3f billion bins/s, %.3f billion pixels/s, %.3f logical GB/s", avg_millis, giga_rate, giga_rate * NUM_ACTIVE_CHANNELS / NUM_CHANNELS, giga_rate / NUM_CHANNELS, giga_bandwidth); } printf("\n\n"); for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) { if (h_histogram[channel]) delete[] h_histogram[channel]; if (d_histogram[channel]) CubDebugExit(g_allocator.DeviceFree(d_histogram[channel])); } if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes)); if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error)); if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); // Correctness asserts AssertEquals(0, error); } /** * Test histogram-even (native pointer input) */ template < Backend BACKEND, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleT, typename CounterT, typename LevelT, typename OffsetT> void TestEvenNative( LevelT max_level, int entropy_reduction, int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest OffsetT num_rows, ///< [in] The number of rows in the region of interest OffsetT row_stride_bytes) ///< [in] The number of bytes between starts of consecutive rows in the region of interest { OffsetT total_samples = num_rows * (row_stride_bytes / sizeof(SampleT)); // Allocate and initialize host sample data typedef SampleT Foo; // rename type to quelch gcc warnings (bug?) SampleT* h_samples = new Foo[total_samples]; InitializeSamples( max_level, entropy_reduction, h_samples, num_row_pixels, num_rows, row_stride_bytes); // Allocate and initialize device data SampleT* d_samples = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_samples, sizeof(SampleT) * total_samples)); CubDebugExit(cudaMemcpy(d_samples, h_samples, sizeof(SampleT) * total_samples, cudaMemcpyHostToDevice)); TestEven( max_level, entropy_reduction, num_levels, lower_level, upper_level, num_row_pixels, num_rows, row_stride_bytes, h_samples, d_samples); // Cleanup if (h_samples) delete[] h_samples; if (d_samples) CubDebugExit(g_allocator.DeviceFree(d_samples)); } /** * Test histogram-even (iterator input) */ template < Backend BACKEND, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleT, typename CounterT, typename LevelT, typename OffsetT> void TestEvenIterator( Int2Type /*is_half*/, LevelT max_level, int entropy_reduction, int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest OffsetT num_rows, ///< [in] The number of rows in the region of interest OffsetT row_stride_bytes) ///< [in] The number of bytes between starts of consecutive rows in the region of interest { SampleT sample = (SampleT) lower_level[0]; ConstantInputIterator sample_itr(sample); TestEven( max_level, entropy_reduction, num_levels, lower_level, upper_level, num_row_pixels, num_rows, row_stride_bytes, sample_itr, sample_itr); } template void TestEvenIterator(Int2Type /*is_half*/, LevelT, int, int[NUM_ACTIVE_CHANNELS], LevelT[NUM_ACTIVE_CHANNELS], LevelT[NUM_ACTIVE_CHANNELS], OffsetT, OffsetT, OffsetT) { // We have to reinterpret cast `half_t *` pointer to `__half *` in this test. // Hence, iterators testing is not supported. } /** * Test histogram-range */ template < Backend BACKEND, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleT, typename CounterT, typename LevelT, typename OffsetT> void TestRange( LevelT max_level, int entropy_reduction, int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. LevelT* levels[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest OffsetT num_rows, ///< [in] The number of rows in the region of interest OffsetT row_stride_bytes) ///< [in] The number of bytes between starts of consecutive rows in the region of interest { OffsetT total_samples = num_rows * (row_stride_bytes / sizeof(SampleT)); printf("\n----------------------------\n"); printf("%s cub::DeviceHistogram::Range %d pixels " "(%d height, %d width, %d-byte row stride), " "%d %d-byte %s samples (entropy reduction %d), " "%s levels, %s counters, %d/%d channels, max sample ", (BACKEND == CDP) ? "CDP CUB" : "CUB", (int)(num_row_pixels * num_rows), (int)num_rows, (int)num_row_pixels, (int)row_stride_bytes, (int)total_samples, (int)sizeof(SampleT), typeid(SampleT).name(), entropy_reduction, typeid(LevelT).name(), typeid(CounterT).name(), NUM_ACTIVE_CHANNELS, NUM_CHANNELS); std::cout << CoutCast(max_level) << "\n"; for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) { printf("Channel %d: %d bins", channel, num_levels[channel] - 1); if (g_verbose) { std::cout << "[ " << levels[channel][0]; for (int level = 1; level < num_levels[channel]; ++level) { std::cout << ", " << levels[channel][level]; } printf("]"); } printf("\n"); } fflush(stdout); // Allocate and initialize host and device data typedef SampleT Foo; // rename type to quelch gcc warnings (bug?) SampleT* h_samples = new Foo[total_samples]; CounterT* h_histogram[NUM_ACTIVE_CHANNELS]; SearchTransform transform_op[NUM_ACTIVE_CHANNELS]; for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) { transform_op[channel].levels = levels[channel]; transform_op[channel].num_levels = num_levels[channel]; int bins = num_levels[channel] - 1; h_histogram[channel] = new CounterT[bins]; } InitializeSamples( max_level, entropy_reduction, h_samples, num_row_pixels, num_rows, row_stride_bytes); InitializeBins( h_samples, num_levels, transform_op, h_histogram, num_row_pixels, num_rows, row_stride_bytes); // Allocate and initialize device data SampleT* d_samples = NULL; LevelT* d_levels[NUM_ACTIVE_CHANNELS]; CounterT* d_histogram[NUM_ACTIVE_CHANNELS]; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_samples, sizeof(SampleT) * total_samples)); CubDebugExit(cudaMemcpy(d_samples, h_samples, sizeof(SampleT) * total_samples, cudaMemcpyHostToDevice)); for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) { CubDebugExit(g_allocator.DeviceAllocate((void**)&d_levels[channel], sizeof(LevelT) * num_levels[channel])); CubDebugExit(cudaMemcpy(d_levels[channel], levels[channel], sizeof(LevelT) * num_levels[channel], cudaMemcpyHostToDevice)); int bins = num_levels[channel] - 1; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_histogram[channel], sizeof(CounterT) * bins)); CubDebugExit(cudaMemset(d_histogram[channel], 0, sizeof(CounterT) * bins)); } // Allocate CDP device arrays size_t *d_temp_storage_bytes = NULL; cudaError_t *d_cdp_error = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes, sizeof(size_t) * 1)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error, sizeof(cudaError_t) * 1)); // Allocate temporary storage void *d_temp_storage = NULL; size_t temp_storage_bytes = 0; Dispatch::Range( 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, row_stride_bytes); // Allocate temporary storage with "canary" zones int canary_bytes = 256; char canary_token = 9; char* canary_zone = new char[canary_bytes]; memset(canary_zone, canary_token, canary_bytes); CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes + (canary_bytes * 2))); CubDebugExit(cudaMemset(d_temp_storage, canary_token, temp_storage_bytes + (canary_bytes * 2))); // Run warmup/correctness iteration Dispatch::Range( 1, d_temp_storage_bytes, d_cdp_error, ((char *) d_temp_storage) + canary_bytes, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, row_stride_bytes); // Check canary zones int error = CompareDeviceResults(canary_zone, (char *) d_temp_storage, canary_bytes, true, g_verbose); AssertEquals(0, error); error = CompareDeviceResults(canary_zone, ((char *) d_temp_storage) + canary_bytes + temp_storage_bytes, canary_bytes, true, g_verbose); AssertEquals(0, error); // Flush any stdout/stderr CubDebugExit(cudaPeekAtLastError()); CubDebugExit(cudaDeviceSynchronize()); fflush(stdout); fflush(stderr); // Check for correctness (and display results, if specified) for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) { int channel_error = CompareDeviceResults(h_histogram[channel], d_histogram[channel], num_levels[channel] - 1, true, g_verbose); printf("\tChannel %d %s", channel, channel_error ? "FAIL" : "PASS\n"); error |= channel_error; } // Performance GpuTimer gpu_timer; gpu_timer.Start(); Dispatch::Range( g_timing_iterations, d_temp_storage_bytes, d_cdp_error, ((char *) d_temp_storage) + canary_bytes, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, row_stride_bytes); gpu_timer.Stop(); float elapsed_millis = gpu_timer.ElapsedMillis(); // Display performance if (g_timing_iterations > 0) { float avg_millis = elapsed_millis / g_timing_iterations; float giga_rate = float(total_samples) / avg_millis / 1000.0f / 1000.0f; float giga_bandwidth = giga_rate * sizeof(SampleT); printf("\t%.3f avg ms, %.3f billion samples/s, %.3f billion bins/s, %.3f billion pixels/s, %.3f logical GB/s", avg_millis, giga_rate, giga_rate * NUM_ACTIVE_CHANNELS / NUM_CHANNELS, giga_rate / NUM_CHANNELS, giga_bandwidth); } printf("\n\n"); // Cleanup if (h_samples) delete[] h_samples; for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) { if (h_histogram[channel]) delete[] h_histogram[channel]; if (d_histogram[channel]) CubDebugExit(g_allocator.DeviceFree(d_histogram[channel])); if (d_levels[channel]) CubDebugExit(g_allocator.DeviceFree(d_levels[channel])); } if (d_samples) CubDebugExit(g_allocator.DeviceFree(d_samples)); if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes)); if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error)); if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); // Correctness asserts AssertEquals(0, error); } /** * Test histogram-even */ template < Backend BACKEND, typename SampleT, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename CounterT, typename LevelT, typename OffsetT> void TestEven( OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_bytes, int entropy_reduction, int num_levels[NUM_ACTIVE_CHANNELS], LevelT max_level, int max_num_levels) { LevelT lower_level[NUM_ACTIVE_CHANNELS]; LevelT upper_level[NUM_ACTIVE_CHANNELS]; // Find smallest level increment int max_bins = max_num_levels - 1; LevelT min_level_increment = max_level / static_cast(max_bins); // Set upper and lower levels for each channel for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) { int num_bins = num_levels[channel] - 1; lower_level[channel] = static_cast((max_level - (static_cast(num_bins) * min_level_increment)) / static_cast(2)); upper_level[channel] = static_cast((max_level + (static_cast(num_bins) * min_level_increment)) / static_cast(2)); } // Test pointer-based samples TestEvenNative( max_level, entropy_reduction, num_levels, lower_level, upper_level, num_row_pixels, num_rows, row_stride_bytes); // Test iterator-based samples (CUB-only) TestEvenIterator( Int2Type::value>{}, max_level, entropy_reduction, num_levels, lower_level, upper_level, num_row_pixels, num_rows, row_stride_bytes); } /** * Test histogram-range */ template < Backend BACKEND, typename SampleT, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename CounterT, typename LevelT, typename OffsetT> void TestRange( OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_bytes, int entropy_reduction, int num_levels[NUM_ACTIVE_CHANNELS], LevelT max_level, int max_num_levels) { // Find smallest level increment int max_bins = max_num_levels - 1; LevelT min_level_increment = max_level / static_cast(max_bins); LevelT* levels[NUM_ACTIVE_CHANNELS]; for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) { levels[channel] = new LevelT[num_levels[channel]]; int num_bins = num_levels[channel] - 1; LevelT lower_level = (max_level - static_cast(num_bins * min_level_increment)) / static_cast(2); for (int level = 0; level < num_levels[channel]; ++level) levels[channel][level] = lower_level + static_cast(level * min_level_increment); } TestRange( max_level, entropy_reduction, num_levels, levels, num_row_pixels, num_rows, row_stride_bytes); for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) delete[] levels[channel]; } /** * Test different entrypoints */ template < typename SampleT, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename CounterT, typename LevelT, typename OffsetT> void Test( OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_bytes, int entropy_reduction, int num_levels[NUM_ACTIVE_CHANNELS], LevelT max_level, int max_num_levels) { TestEven( num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, max_num_levels); TestRange( num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, max_num_levels); } /** * Test different number of levels */ template < typename SampleT, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename CounterT, typename LevelT, typename OffsetT> void Test( OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_bytes, int entropy_reduction, LevelT max_level, int max_num_levels) { int num_levels[NUM_ACTIVE_CHANNELS]; // All different levels num_levels[0] = max_num_levels; for (int channel = 1; channel < NUM_ACTIVE_CHANNELS; ++channel) { num_levels[channel] = (num_levels[channel - 1] / 2) + 1; } Test( num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, max_num_levels); } /** * Test different entropy-levels */ template < typename SampleT, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename CounterT, typename LevelT, typename OffsetT> void Test( OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_bytes, LevelT max_level, int max_num_levels) { // entropy_reduction = -1 -> all samples == 0 Test( num_row_pixels, num_rows, row_stride_bytes, -1, max_level, max_num_levels); Test( num_row_pixels, num_rows, row_stride_bytes, 0, max_level, max_num_levels); Test( num_row_pixels, num_rows, row_stride_bytes, 5, max_level, max_num_levels); } /** * Test different row strides */ template < typename SampleT, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename CounterT, typename LevelT, typename OffsetT> void Test( OffsetT num_row_pixels, OffsetT num_rows, LevelT max_level, int max_num_levels) { OffsetT row_stride_bytes = num_row_pixels * NUM_CHANNELS * sizeof(SampleT); // No padding Test( num_row_pixels, num_rows, row_stride_bytes, max_level, max_num_levels); // 13 samples padding Test( num_row_pixels, num_rows, row_stride_bytes + (13 * sizeof(SampleT)), max_level, max_num_levels); } /** * Test different problem sizes */ template < typename SampleT, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename CounterT, typename LevelT, typename OffsetT> void Test( LevelT max_level, int max_num_levels) { // 0 row/col images Test( OffsetT(1920), OffsetT(0), max_level, max_num_levels); Test( OffsetT(0), OffsetT(0), max_level, max_num_levels); // Small inputs Test( OffsetT(15), OffsetT(1), max_level, max_num_levels); // 1080 image Test( OffsetT(1920), OffsetT(1080), max_level, max_num_levels); // Sample different aspect ratios sizes for (OffsetT rows = 1; rows < 1000000; rows *= 1000) { for (OffsetT cols = 1; cols < (1000000 / rows); cols *= 1000) { Test( cols, rows, max_level, max_num_levels); } } } /** * Test different channel interleavings (valid specialiation) */ template void TestChannels(LevelT max_level, int max_num_levels, Int2Type /*is_valid_tag*/, Int2Type /*test_extra_channels*/) { Test(max_level, max_num_levels); Test(max_level, max_num_levels); } template void TestChannels(LevelT max_level, int max_num_levels, Int2Type /*is_valid_tag*/, Int2Type /*test_extra_channels*/) { Test(max_level, max_num_levels); Test(max_level, max_num_levels); Test(max_level, max_num_levels); Test(max_level, max_num_levels); } template void TestChannels(LevelT /*max_level*/, int /*max_num_levels*/, Int2Type /*is_valid_tag*/, TestExtraChannels) {} void TestLevelsAliasing() { constexpr int num_levels = 7; int h_histogram[num_levels - 1]{}; int h_samples[]{ 0, 2, 4, 6, 8, 10, 12, // levels 1, // bin 0 3, 3, // bin 1 5, 5, 5, // bin 2 7, 7, 7, 7, // bin 3 9, 9, 9, 9, 9, // bin 4 11, 11, 11, 11, 11, 11 // bin 5 }; constexpr int num_samples = sizeof(h_samples) / sizeof(h_samples[0]); int *d_histogram{}; int *d_samples{}; CubDebugExit( g_allocator.DeviceAllocate((void **)&d_histogram, sizeof(h_histogram))); CubDebugExit( g_allocator.DeviceAllocate((void **)&d_samples, sizeof(h_samples))); CubDebugExit( cudaMemcpy(d_samples, h_samples, sizeof(h_samples), cudaMemcpyHostToDevice)); // Alias levels with samples (fancy way to `d_histogram[bin]++`). int *d_levels = d_samples; std::uint8_t *d_temp_storage{}; std::size_t temp_storage_bytes{}; CubDebugExit(cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, num_samples)); CubDebugExit( g_allocator.DeviceAllocate((void **)&d_temp_storage, temp_storage_bytes)); CubDebugExit(cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, num_samples)); CubDebugExit(cudaMemcpy(h_histogram, d_histogram, sizeof(h_histogram), cudaMemcpyDeviceToHost)); for (int bin = 0; bin < num_levels - 1; bin++) { // Each bin should contain `bin + 1` samples. Since samples also contain // levels, they contribute one extra item to each bin. AssertEquals(bin + 2, h_histogram[bin]); } CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); CubDebugExit(g_allocator.DeviceFree(d_histogram)); CubDebugExit(g_allocator.DeviceFree(d_levels)); } // Regression test for NVIDIA/cub#489: integer rounding errors lead to incorrect // bin detection: void TestIntegerBinCalcs() { constexpr int num_levels = 8; constexpr int num_bins = num_levels - 1; int h_histogram[num_bins]{}; const int h_histogram_ref[num_bins]{1, 5, 0, 2, 1, 0, 0}; const int h_samples[]{2, 6, 7, 2, 3, 0, 2, 2, 6, 999}; const int lower_level = 0; const int upper_level = 12; constexpr int num_samples = sizeof(h_samples) / sizeof(h_samples[0]); int *d_histogram{}; int *d_samples{}; CubDebugExit( g_allocator.DeviceAllocate((void **)&d_histogram, sizeof(h_histogram))); CubDebugExit( g_allocator.DeviceAllocate((void **)&d_samples, sizeof(h_samples))); CubDebugExit( cudaMemcpy(d_samples, h_samples, sizeof(h_samples), cudaMemcpyHostToDevice)); std::uint8_t *d_temp_storage{}; std::size_t temp_storage_bytes{}; CubDebugExit(cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples)); CubDebugExit( g_allocator.DeviceAllocate((void **)&d_temp_storage, temp_storage_bytes)); CubDebugExit(cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples)); CubDebugExit(cudaMemcpy(h_histogram, d_histogram, sizeof(h_histogram), cudaMemcpyDeviceToHost)); for (int bin = 0; bin < num_bins; ++bin) { AssertEquals(h_histogram_ref[bin], h_histogram[bin]); } CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); CubDebugExit(g_allocator.DeviceFree(d_histogram)); CubDebugExit(g_allocator.DeviceFree(d_samples)); } /** * @brief Our bin computation for HistogramEven is guaranteed only for when (max_level - min_level) * * num_bins does not overflow when using uint64_t arithmetic. In case bin computation could * overflow, we expect cudaErrorInvalidValue to be returned. */ template void TestOverflow() { using CounterT = uint32_t; constexpr std::size_t test_cases = 2; // Test data common across tests SampleT lower_level = 0; SampleT upper_level = ::cuda::std::numeric_limits::max(); thrust::counting_iterator d_samples{0UL}; thrust::device_vector d_histo_out(1024); CounterT *d_histogram = thrust::raw_pointer_cast(d_histo_out.data()); int num_samples = 1000; // Prepare per-test specific data constexpr std::size_t canary_bytes = 3; std::array temp_storage_bytes{canary_bytes, canary_bytes}; std::array num_bins{1, 2}; // Since test #1 is just a single bin, we expect it to succeed // Since we promote up to 64-bit integer arithmetic we expect tests to not overflow for types of // up to 4 bytes. For 64-bit and wider types, we do not perform further promotion to even wider // types, hence we expect cudaErrorInvalidValue to be returned to indicate of a potential overflow std::array expected_status{ cudaSuccess, sizeof(SampleT) <= 4UL ? cudaSuccess : cudaErrorInvalidValue}; // Verify we always initializes temp_storage_bytes cudaError_t error{cudaSuccess}; for (std::size_t i = 0; i < test_cases; i++) { error = cub::DeviceHistogram::HistogramEven(nullptr, temp_storage_bytes[i], d_samples, d_histogram, num_bins[i] + 1, lower_level, upper_level, num_samples); // Ensure that temp_storage_bytes has been initialized even in the presence of error AssertTrue(temp_storage_bytes[i] != canary_bytes); } // Allocate sufficient temporary storage thrust::device_vector temp_storage( std::max(temp_storage_bytes[0], temp_storage_bytes[1])); for (std::size_t i = 0; i < test_cases; i++) { error = cub::DeviceHistogram::HistogramEven(thrust::raw_pointer_cast(temp_storage.data()), temp_storage_bytes[i], d_samples, d_histogram, num_bins[i] + 1, lower_level, upper_level, num_samples); // Ensure we do not return an error on querying temporary storage requirements AssertEquals(error, expected_status[i]); } } //--------------------------------------------------------------------- // Main //--------------------------------------------------------------------- /** * Main */ int main(int argc, char** argv) { // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); g_verbose_input = args.CheckCmdLineFlag("v2"); args.GetCmdLineArgument("i", g_timing_iterations); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--i=] " "[--device=] " "[--v] " "[--v2] " "\n", argv[0]); exit(0); } // Initialize device CubDebugExit(args.DeviceInit()); TestOverflow(); TestOverflow(); TestOverflow(); TestOverflow(); using true_t = Int2Type; using false_t = Int2Type; TestLevelsAliasing(); TestIntegerBinCalcs(); // regression test for NVIDIA/cub#489 #if TEST_HALF_T TestChannels(256, 256 + 1, true_t{}, true_t{}); #endif TestChannels (256, 256 + 1, true_t{}, true_t{}); TestChannels (8192, 8192 + 1, true_t{}, false_t{}); // Make sure bin computation works fine when using int32 arithmetic TestChannels (std::numeric_limits::max(), std::numeric_limits::max() + 1, true_t{}, false_t{}); // Make sure bin computation works fine when requiring int64 arithmetic TestChannels (std::numeric_limits::max(), 8192 + 1, true_t{}, false_t{}); #if !defined(__ICC) // Fails with ICC for unknown reasons, see #332. TestChannels (1.0, 256 + 1, true_t{}, false_t{}); #endif // float samples, int levels, regression test for NVIDIA/cub#479. TestChannels (12, 7, true_t{}, true_t{}); // Test down-conversion of size_t offsets to int TestChannels (256, 256 + 1, Int2Type<(sizeof(size_t) != sizeof(int))>{}, false_t{}); return 0; }