thanks to nvidia ❤

8ae5fc5 over 2 years ago

7.36 kB

	/******************************************************************************
	* Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions are met:
	* * Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* * Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	* * Neither the name of the NVIDIA CORPORATION nor the
	* names of its contributors may be used to endorse or promote products
	* derived from this software without specific prior written permission.
	*
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
	* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
	* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
	* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
	* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
	* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*
	******************************************************************************/

	#include <thrust/sequence.h>

	#include "../histogram_common.cuh"
	#include <nvbench_helper.cuh>

	// %RANGE% TUNE_ITEMS ipt 7:24:1
	// %RANGE% TUNE_THREADS tpb 128:1024:32
	// %RANGE% TUNE_RLE_COMPRESS rle 0:1:1
	// %RANGE% TUNE_WORK_STEALING ws 0:1:1
	// %RANGE% TUNE_MEM_PREFERENCE mem 0:2:1
	// %RANGE% TUNE_LOAD ld 0:2:1

	template <typename SampleT, typename CounterT, typename OffsetT>
	static void histogram(nvbench::state &state, nvbench::type_list<SampleT, CounterT, OffsetT>)
	{
	constexpr int num_channels = 4;
	constexpr int num_active_channels = 3;

	using sample_iterator_t = SampleT *;

	#if !TUNE_BASE
	using policy_t = policy_hub_t<key_t, num_active_channels>;
	using dispatch_t = cub::DispatchHistogram<num_channels, //
	num_active_channels,
	sample_iterator_t,
	CounterT,
	SampleT,
	OffsetT,
	policy_t>;
	#else // TUNE_BASE
	using dispatch_t = cub::DispatchHistogram<num_channels, //
	num_active_channels,
	sample_iterator_t,
	CounterT,
	SampleT,
	OffsetT>;
	#endif // TUNE_BASE

	const auto entropy = str_to_entropy(state.get_string("Entropy"));
	const auto elements = state.get_int64("Elements{io}");
	const auto num_bins = state.get_int64("Bins");
	const int num_levels_r = static_cast<int>(num_bins) + 1;
	const int num_levels_g = num_levels_r;
	const int num_levels_b = num_levels_g;

	const SampleT lower_level = 0;
	const SampleT upper_level = get_upper_level<SampleT>(num_bins, elements);

	SampleT step = (upper_level - lower_level) / num_bins;
	thrust::device_vector<SampleT> levels_r(num_bins + 1);

	// TODO Extract sequence to the helper TU
	thrust::sequence(levels_r.begin(), levels_r.end(), lower_level, step);
	thrust::device_vector<SampleT> levels_g = levels_r;
	thrust::device_vector<SampleT> levels_b = levels_g;

	SampleT *d_levels_r = thrust::raw_pointer_cast(levels_r.data());
	SampleT *d_levels_g = thrust::raw_pointer_cast(levels_g.data());
	SampleT *d_levels_b = thrust::raw_pointer_cast(levels_b.data());

	thrust::device_vector<SampleT> input(elements * num_channels);
	thrust::device_vector<CounterT> hist_r(num_bins);
	thrust::device_vector<CounterT> hist_g(num_bins);
	thrust::device_vector<CounterT> hist_b(num_bins);
	gen(seed_t{}, input, entropy, lower_level, upper_level);

	SampleT *d_input = thrust::raw_pointer_cast(input.data());
	CounterT *d_histogram_r = thrust::raw_pointer_cast(hist_r.data());
	CounterT *d_histogram_g = thrust::raw_pointer_cast(hist_g.data());
	CounterT *d_histogram_b = thrust::raw_pointer_cast(hist_b.data());

	CounterT *d_histogram[num_active_channels] = {d_histogram_r, d_histogram_g, d_histogram_b};
	int num_levels[num_active_channels] = {num_levels_r, num_levels_g, num_levels_b};
	SampleT *d_levels[num_active_channels] = {d_levels_r, d_levels_g, d_levels_b};

	std::uint8_t *d_temp_storage = nullptr;
	std::size_t temp_storage_bytes{};

	cub::Int2Type<sizeof(SampleT) == 1> is_byte_sample;
	OffsetT num_row_pixels = static_cast<OffsetT>(elements);
	OffsetT num_rows = 1;
	OffsetT row_stride_samples = num_row_pixels;

	state.add_element_count(elements);
	state.add_global_memory_reads<SampleT>(elements * num_active_channels);
	state.add_global_memory_writes<CounterT>(num_bins * num_active_channels);

	dispatch_t::DispatchRange(d_temp_storage,
	temp_storage_bytes,
	d_input,
	d_histogram,
	num_levels,
	d_levels,
	num_row_pixels,
	num_rows,
	row_stride_samples,
	0,
	is_byte_sample);

	thrust::device_vector<nvbench::uint8_t> tmp(temp_storage_bytes);
	d_temp_storage = thrust::raw_pointer_cast(tmp.data());

	state.exec([&](nvbench::launch &launch) {
	dispatch_t::DispatchRange(d_temp_storage,
	temp_storage_bytes,
	d_input,
	d_histogram,
	num_levels,
	d_levels,
	num_row_pixels,
	num_rows,
	row_stride_samples,
	launch.get_stream(),
	is_byte_sample);
	});
	}

	using bin_types = nvbench::type_list<int32_t>;
	using some_offset_types = nvbench::type_list<int32_t>;

	#ifdef TUNE_SampleT
	using sample_types = nvbench::type_list<TUNE_SampleT>;
	#else // !defined(TUNE_SampleT)
	using sample_types = nvbench::type_list<int8_t, int16_t, int32_t, int64_t, float, double>;
	#endif // TUNE_SampleT

	NVBENCH_BENCH_TYPES(histogram, NVBENCH_TYPE_AXES(sample_types, bin_types, some_offset_types))
	.set_name("cub::DeviceHistogram::MultiHistogramRange")
	.set_type_axes_names({"SampleT{ct}", "BinT{ct}", "OffsetT{ct}"})
	.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
	.add_int64_axis("Bins", {128, 2048, 2097152})
	.add_string_axis("Entropy", {"1.000", "0.544", "0.000"});