File size: 7,869 Bytes

8ae5fc5

/******************************************************************************
 * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#include <nvbench_helper.cuh>
#include <look_back_helper.cuh>
#include <cub/device/device_run_length_encode.cuh>

// %RANGE% TUNE_ITEMS ipt 7:24:1
// %RANGE% TUNE_THREADS tpb 128:1024:32
// %RANGE% TUNE_TRANSPOSE trp 0:1:1
// %RANGE% TUNE_LOAD ld 0:1:1
// %RANGE% TUNE_MAGIC_NS ns 0:2048:4
// %RANGE% TUNE_DELAY_CONSTRUCTOR_ID dcid 0:7:1
// %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5

#if !TUNE_BASE
#if TUNE_TRANSPOSE == 0
#define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT
#else // TUNE_TRANSPOSE == 1
#define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_WARP_TRANSPOSE
#endif // TUNE_TRANSPOSE

#if TUNE_LOAD == 0
#define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT
#else // TUNE_LOAD == 1
#define TUNE_LOAD_MODIFIER cub::LOAD_CA
#endif // TUNE_LOAD

struct device_reduce_by_key_policy_hub
{
  struct Policy350 : cub::ChainedPolicy<350, Policy350, Policy350>
  {
    using ReduceByKeyPolicyT = cub::AgentReduceByKeyPolicy<TUNE_THREADS,
                                                           TUNE_ITEMS,
                                                           TUNE_LOAD_ALGORITHM,
                                                           TUNE_LOAD_MODIFIER,
                                                           cub::BLOCK_SCAN_WARP_SCANS,
                                                           delay_constructor_t>;
  };

  using MaxPolicy = Policy350;
};
#endif // !TUNE_BASE

template <class T, class OffsetT>
static void rle(nvbench::state &state, nvbench::type_list<T, OffsetT>)
{
  using offset_t = OffsetT;
  using keys_input_it_t = const T*;
  using unique_output_it_t = T*;
  using vals_input_it_t = cub::ConstantInputIterator<offset_t, OffsetT>;
  using aggregate_output_it_t = offset_t*;
  using num_runs_output_iterator_t = offset_t*;
  using equality_op_t = cub::Equality;
  using reduction_op_t = cub::Sum;
  using accum_t = offset_t;

  #if !TUNE_BASE
  using dispatch_t = cub::DispatchReduceByKey<keys_input_it_t,
                                              unique_output_it_t,
                                              vals_input_it_t,
                                              aggregate_output_it_t,
                                              num_runs_output_iterator_t,
                                              equality_op_t,
                                              reduction_op_t,
                                              offset_t,
                                              accum_t,
                                              device_reduce_by_key_policy_hub>;
  #else
  using dispatch_t = cub::DispatchReduceByKey<keys_input_it_t,
                                              unique_output_it_t,
                                              vals_input_it_t,
                                              aggregate_output_it_t,
                                              num_runs_output_iterator_t,
                                              equality_op_t,
                                              reduction_op_t,
                                              offset_t,
                                              accum_t>;
  #endif

  const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));
  const std::size_t min_segment_size = 1;
  const std::size_t max_segment_size = static_cast<std::size_t>(state.get_int64("MaxSegSize"));

  thrust::device_vector<offset_t> num_runs_out(1);
  thrust::device_vector<offset_t> out_vals(elements);
  thrust::device_vector<T> out_keys(elements);
  thrust::device_vector<T> in_keys =
    gen_uniform_key_segments<T>(seed_t{}, elements, min_segment_size, max_segment_size);

  T *d_in_keys             = thrust::raw_pointer_cast(in_keys.data());
  T *d_out_keys            = thrust::raw_pointer_cast(out_keys.data());
  offset_t *d_out_vals     = thrust::raw_pointer_cast(out_vals.data());
  offset_t *d_num_runs_out = thrust::raw_pointer_cast(num_runs_out.data());
  vals_input_it_t d_in_vals(offset_t{1});

  std::uint8_t *d_temp_storage{};
  std::size_t temp_storage_bytes{};

  dispatch_t::Dispatch(d_temp_storage,
                       temp_storage_bytes,
                       d_in_keys,
                       d_out_keys,
                       d_in_vals,
                       d_out_vals,
                       d_num_runs_out,
                       equality_op_t{},
                       reduction_op_t{},
                       elements,
                       0);

  thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
  d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

  dispatch_t::Dispatch(d_temp_storage,
                       temp_storage_bytes,
                       d_in_keys,
                       d_out_keys,
                       d_in_vals,
                       d_out_vals,
                       d_num_runs_out,
                       equality_op_t{},
                       reduction_op_t{},
                       elements,
                       0);
  cudaDeviceSynchronize();
  const OffsetT num_runs = num_runs_out[0];

  state.add_element_count(elements);
  state.add_global_memory_reads<T>(elements);
  state.add_global_memory_writes<T>(num_runs);
  state.add_global_memory_writes<OffsetT>(num_runs);
  state.add_global_memory_writes<OffsetT>(1);

  state.exec([&](nvbench::launch &launch) {
    dispatch_t::Dispatch(d_temp_storage,
                         temp_storage_bytes,
                         d_in_keys,
                         d_out_keys,
                         d_in_vals,
                         d_out_vals,
                         d_num_runs_out,
                         equality_op_t{},
                         reduction_op_t{},
                         elements,
                         launch.get_stream());
  });
}

using some_offset_types = nvbench::type_list<nvbench::int32_t>;

NVBENCH_BENCH_TYPES(rle, NVBENCH_TYPE_AXES(all_types, some_offset_types))
  .set_name("cub::DeviceRunLengthEncode::Encode")
  .set_type_axes_names({"T{ct}", "OffsetT{ct}"})
  .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
  .add_int64_power_of_two_axis("MaxSegSize", {1, 4, 8});