/****************************************************************************** * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #include #include "../histogram_common.cuh" #include // %RANGE% TUNE_ITEMS ipt 7:24:1 // %RANGE% TUNE_THREADS tpb 128:1024:32 // %RANGE% TUNE_RLE_COMPRESS rle 0:1:1 // %RANGE% TUNE_WORK_STEALING ws 0:1:1 // %RANGE% TUNE_MEM_PREFERENCE mem 0:2:1 // %RANGE% TUNE_LOAD ld 0:2:1 template static void histogram(nvbench::state &state, nvbench::type_list) { constexpr int num_channels = 4; constexpr int num_active_channels = 3; using sample_iterator_t = SampleT *; #if !TUNE_BASE using policy_t = policy_hub_t; using dispatch_t = cub::DispatchHistogram; #else // TUNE_BASE using dispatch_t = cub::DispatchHistogram; #endif // TUNE_BASE const auto entropy = str_to_entropy(state.get_string("Entropy")); const auto elements = state.get_int64("Elements{io}"); const auto num_bins = state.get_int64("Bins"); const int num_levels_r = static_cast(num_bins) + 1; const int num_levels_g = num_levels_r; const int num_levels_b = num_levels_g; const SampleT lower_level = 0; const SampleT upper_level = get_upper_level(num_bins, elements); SampleT step = (upper_level - lower_level) / num_bins; thrust::device_vector levels_r(num_bins + 1); // TODO Extract sequence to the helper TU thrust::sequence(levels_r.begin(), levels_r.end(), lower_level, step); thrust::device_vector levels_g = levels_r; thrust::device_vector levels_b = levels_g; SampleT *d_levels_r = thrust::raw_pointer_cast(levels_r.data()); SampleT *d_levels_g = thrust::raw_pointer_cast(levels_g.data()); SampleT *d_levels_b = thrust::raw_pointer_cast(levels_b.data()); thrust::device_vector input(elements * num_channels); thrust::device_vector hist_r(num_bins); thrust::device_vector hist_g(num_bins); thrust::device_vector hist_b(num_bins); gen(seed_t{}, input, entropy, lower_level, upper_level); SampleT *d_input = thrust::raw_pointer_cast(input.data()); CounterT *d_histogram_r = thrust::raw_pointer_cast(hist_r.data()); CounterT *d_histogram_g = thrust::raw_pointer_cast(hist_g.data()); CounterT *d_histogram_b = thrust::raw_pointer_cast(hist_b.data()); CounterT *d_histogram[num_active_channels] = {d_histogram_r, d_histogram_g, d_histogram_b}; int num_levels[num_active_channels] = {num_levels_r, num_levels_g, num_levels_b}; SampleT *d_levels[num_active_channels] = {d_levels_r, d_levels_g, d_levels_b}; std::uint8_t *d_temp_storage = nullptr; std::size_t temp_storage_bytes{}; cub::Int2Type is_byte_sample; OffsetT num_row_pixels = static_cast(elements); OffsetT num_rows = 1; OffsetT row_stride_samples = num_row_pixels; state.add_element_count(elements); state.add_global_memory_reads(elements * num_active_channels); state.add_global_memory_writes(num_bins * num_active_channels); dispatch_t::DispatchRange(d_temp_storage, temp_storage_bytes, d_input, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, row_stride_samples, 0, is_byte_sample); thrust::device_vector tmp(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(tmp.data()); state.exec([&](nvbench::launch &launch) { dispatch_t::DispatchRange(d_temp_storage, temp_storage_bytes, d_input, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, row_stride_samples, launch.get_stream(), is_byte_sample); }); } using bin_types = nvbench::type_list; using some_offset_types = nvbench::type_list; #ifdef TUNE_SampleT using sample_types = nvbench::type_list; #else // !defined(TUNE_SampleT) using sample_types = nvbench::type_list; #endif // TUNE_SampleT NVBENCH_BENCH_TYPES(histogram, NVBENCH_TYPE_AXES(sample_types, bin_types, some_offset_types)) .set_name("cub::DeviceHistogram::MultiHistogramRange") .set_type_axes_names({"SampleT{ct}", "BinT{ct}", "OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)) .add_int64_axis("Bins", {128, 2048, 2097152}) .add_string_axis("Entropy", {"1.000", "0.544", "0.000"});