/******************************************************************************
 * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#include <cub/device/device_reduce.cuh>
#include <cub/device/dispatch/dispatch_reduce_by_key.cuh>

#include <thrust/device_vector.h>

#include <look_back_helper.cuh>
#include <nvbench_helper.cuh>

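// %RANGE% annotations declare the parameter sweep explored by the benchmark tuning
// scripts; each entry reads "<macro> <short name> <start>:<end>:<step>". When TUNE_BASE
// is set, the swept macros are ignored and CUB's default tuning policy is benchmarked.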
// %RANGE% TUNE_ITEMS ipt 7:24:1
// %RANGE% TUNE_THREADS tpb 128:1024:32
// %RANGE% TUNE_TRANSPOSE trp 0:1:1
// %RANGE% TUNE_LOAD ld 0:1:1
// %RANGE% TUNE_MAGIC_NS ns 0:2048:4
// %RANGE% TUNE_DELAY_CONSTRUCTOR_ID dcid 0:7:1
// %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5

#if !TUNE_BASE
#if TUNE_TRANSPOSE == 0
#define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT
#else // TUNE_TRANSPOSE == 1
#define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE
#endif // TUNE_TRANSPOSE

#if TUNE_LOAD == 0
#define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT
#else // TUNE_LOAD == 1
#define TUNE_LOAD_MODIFIER cub::LOAD_CA
#endif // TUNE_LOAD

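// Policy hub that replaces CUB's default ReduceByKey tuning with the swept parameters:
// threads per block, items per thread, load algorithm/modifier, and the delay
// constructor (delay_constructor_t is provided by the benchmark helper headers).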
struct device_reduce_by_key_policy_hub
{
  struct Policy350 : cub::ChainedPolicy<350, Policy350, Policy350>
  {
    using ReduceByKeyPolicyT = cub::AgentReduceByKeyPolicy<TUNE_THREADS,
                                                           TUNE_ITEMS,
                                                           TUNE_LOAD_ALGORITHM,
                                                           TUNE_LOAD_MODIFIER,
                                                           cub::BLOCK_SCAN_WARP_SCANS,
                                                           delay_constructor_t>;
  };

  using MaxPolicy = Policy350;
};
#endif // !TUNE_BASE

template <class KeyT, class ValueT, class OffsetT>
static void reduce(nvbench::state &state, nvbench::type_list<KeyT, ValueT, OffsetT>)
{
  using keys_input_it_t = const KeyT*;
  using unique_output_it_t = KeyT*;
  using vals_input_it_t = const ValueT*;
  using aggregate_output_it_t = ValueT*;
  using num_runs_output_iterator_t = OffsetT*;
  using equality_op_t = cub::Equality;
  using reduction_op_t = cub::Sum;
  using accum_t = ValueT;
  using offset_t = OffsetT;

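  // The tuned variant injects the policy hub defined above into the dispatch layer;
  // the baseline (TUNE_BASE) build falls back to CUB's default tuning policies.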
#if !TUNE_BASE
  using dispatch_t = cub::DispatchReduceByKey<keys_input_it_t,
                                              unique_output_it_t,
                                              vals_input_it_t,
                                              aggregate_output_it_t,
                                              num_runs_output_iterator_t,
                                              equality_op_t,
                                              reduction_op_t,
                                              offset_t,
                                              accum_t,
                                              device_reduce_by_key_policy_hub>;
#else
  using dispatch_t = cub::DispatchReduceByKey<keys_input_it_t,
                                              unique_output_it_t,
                                              vals_input_it_t,
                                              aggregate_output_it_t,
                                              num_runs_output_iterator_t,
                                              equality_op_t,
                                              reduction_op_t,
                                              offset_t,
                                              accum_t>;
#endif

  const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));
  const std::size_t min_segment_size = 1;
  const std::size_t max_segment_size = static_cast<std::size_t>(state.get_int64("MaxSegSize"));

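  // Keys are generated as contiguous runs of equal values whose lengths fall in
  // [min_segment_size, max_segment_size]; reduce-by-key collapses each run into one output.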
  thrust::device_vector<OffsetT> num_runs_out(1);
  thrust::device_vector<ValueT> in_vals(elements);
  thrust::device_vector<ValueT> out_vals(elements);
  thrust::device_vector<KeyT> out_keys(elements);
  thrust::device_vector<KeyT> in_keys =
    gen_uniform_key_segments<KeyT>(seed_t{}, elements, min_segment_size, max_segment_size);

  KeyT *d_in_keys = thrust::raw_pointer_cast(in_keys.data());
  KeyT *d_out_keys = thrust::raw_pointer_cast(out_keys.data());
  ValueT *d_in_vals = thrust::raw_pointer_cast(in_vals.data());
  ValueT *d_out_vals = thrust::raw_pointer_cast(out_vals.data());
  OffsetT *d_num_runs_out = thrust::raw_pointer_cast(num_runs_out.data());

  std::uint8_t *d_temp_storage{};
  std::size_t temp_storage_bytes{};

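  // First call with a null d_temp_storage only queries the required temporary storage size.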
  dispatch_t::Dispatch(d_temp_storage,
                       temp_storage_bytes,
                       d_in_keys,
                       d_out_keys,
                       d_in_vals,
                       d_out_vals,
                       d_num_runs_out,
                       equality_op_t{},
                       reduction_op_t{},
                       elements,
                       0);

  thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
  d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

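  // Run once outside the timed region so num_runs_out is populated; the number of
  // runs determines how much output traffic is reported to nvbench below.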
  dispatch_t::Dispatch(d_temp_storage,
                       temp_storage_bytes,
                       d_in_keys,
                       d_out_keys,
                       d_in_vals,
                       d_out_vals,
                       d_num_runs_out,
                       equality_op_t{},
                       reduction_op_t{},
                       elements,
                       0);
  cudaDeviceSynchronize();
  const OffsetT num_runs = num_runs_out[0];

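  // Report the element count and the estimated global memory traffic to nvbench.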
  state.add_element_count(elements);
  state.add_global_memory_reads<KeyT>(elements);
  state.add_global_memory_reads<ValueT>(elements);
  state.add_global_memory_writes<ValueT>(num_runs);
  state.add_global_memory_writes<KeyT>(num_runs);
  state.add_global_memory_writes<OffsetT>(1);

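  // Timed region: each measurement re-launches the dispatch on the stream provided by nvbench.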
  state.exec([&](nvbench::launch &launch) {
    dispatch_t::Dispatch(d_temp_storage,
                         temp_storage_bytes,
                         d_in_keys,
                         d_out_keys,
                         d_in_vals,
                         d_out_vals,
                         d_num_runs_out,
                         equality_op_t{},
                         reduction_op_t{},
                         elements,
                         launch.get_stream());
  });
}

using some_offset_types = nvbench::type_list<nvbench::int32_t>;

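// During tuning, TUNE_KeyT / TUNE_ValueT pin the benchmark to a single key/value type;
// otherwise the full cross-product of the type lists below is instantiated.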
#ifdef TUNE_KeyT
using key_types = nvbench::type_list<TUNE_KeyT>;
#else // !defined(TUNE_KeyT)
using key_types = nvbench::type_list<int8_t, int16_t, int32_t, int64_t, int128_t>;
#endif // TUNE_KeyT

#ifdef TUNE_ValueT
using value_types = nvbench::type_list<TUNE_ValueT>;
#else // !defined(TUNE_ValueT)
using value_types = all_types;
#endif // TUNE_ValueT

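// Register the benchmark: compile-time type axes for keys, values, and offsets, plus
// power-of-two axes for the problem size and the maximum key-segment size.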
NVBENCH_BENCH_TYPES(reduce, NVBENCH_TYPE_AXES(key_types, value_types, some_offset_types))
  .set_name("cub::DeviceReduce::ReduceByKey")
  .set_type_axes_names({"KeyT{ct}", "ValueT{ct}", "OffsetT{ct}"})
  .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
  .add_int64_power_of_two_axis("MaxSegSize", {1, 4, 8});