|
|
#pragma once
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include <cstdint>
|
|
|
#include <c10/util/C++17.h>
|
|
|
#include <c10/util/Load.h>
|
|
|
#include <c10/util/irange.h>
|
|
|
#include <ATen/detail/FunctionTraits.h>
|
|
|
#include <ATen/native/cpu/IsContiguous.h>
|
|
|
#include <ATen/native/TensorIterator.h>
|
|
|
#include <ATen/native/TensorIteratorDynamicCasting.h>
|
|
|
#include <ATen/cpu/vec/vec.h>
|
|
|
|
|
|
#include <tuple>
|
|
|
#include <utility>
|
|
|
|
|
|
namespace at::native { inline namespace CPU_CAPABILITY {
|
|
|
|
|
|
using namespace vec;
|
|
|
|
|
|
// Loads the scalar arguments for one functor application: for each input
// operand INDEX, reads a value of the functor's INDEX-th parameter type from
// data[INDEX] + i * strides[INDEX], and packs all the values into a tuple.
// c10::load is used instead of a plain typed dereference (presumably to
// normalize types such as bool — see c10/util/Load.h; confirm there).
template <typename traits, std::size_t... INDEX>
typename traits::ArgsTuple
dereference_impl(char* C10_RESTRICT data[], const int64_t* strides, int64_t i,
                 std::index_sequence<INDEX...>) {
  return std::make_tuple(
      c10::load<typename traits::template arg<INDEX>::type>(
          data[INDEX] + i * strides[INDEX])...);
}
|
|
|
|
|
|
template <typename traits>
|
|
|
typename traits::ArgsTuple
|
|
|
dereference(char* C10_RESTRICT data[], const int64_t* strides, int64_t i) {
|
|
|
using Indices = std::make_index_sequence<traits::arity>;
|
|
|
return dereference_impl<traits>(data, strides, i, Indices{});
|
|
|
}
|
|
|
|
|
|
// Loads the vectorized arguments for one functor application.
// `S` is a 1-based operand index: when S == INDEX + 1, operand INDEX is a
// broadcast scalar and the pre-splatted `opt_scalar` vector is used instead
// of a memory load; S == 0 means every operand is loaded with loadu.
// Each loaded operand is assumed densely packed: the byte offset for element
// `i` is i * sizeof(scalar_t).
template <typename traits, std::size_t... INDEX>
typename traits::ArgsTuple
dereference_vec_impl(char* C10_RESTRICT data[],
                     const typename traits::result_type& opt_scalar,
                     size_t S,
                     int64_t i,
                     std::index_sequence<INDEX...>) {
  // For the vectorized functor, result_type is the Vectorized<T> type itself.
  using Vec = typename traits::result_type;
  using scalar_t = typename Vec::value_type;
  return std::make_tuple(
      S == INDEX + 1 ?
      opt_scalar :
      Vec::loadu(data[INDEX] + i * sizeof(scalar_t))...);
}
|
|
|
|
|
|
template <typename traits>
|
|
|
typename traits::ArgsTuple
|
|
|
dereference_vec(char* C10_RESTRICT data[], const typename traits::result_type& opt_scalar, size_t S, int64_t i) {
|
|
|
using Indices = std::make_index_sequence<traits::arity>;
|
|
|
return dereference_vec_impl<traits>(data, opt_scalar, S, i, Indices{});
|
|
|
}
|
|
|
|
|
|
template <typename func_t,
|
|
|
std::enable_if_t<!std::is_void_v<typename function_traits<func_t>::result_type>>* = nullptr>
|
|
|
inline void
|
|
|
execute_op(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, int64_t n, func_t&& op) {
|
|
|
using traits = function_traits<func_t>;
|
|
|
using result_type = typename traits::result_type;
|
|
|
for (; i < n; i++) {
|
|
|
result_type* out_ptr = (result_type*)(data[0] + i * strides[0]);
|
|
|
*out_ptr = c10::guts::apply(op, dereference<traits>(
|
|
|
&data[1],
|
|
|
&strides[1],
|
|
|
i));
|
|
|
}
|
|
|
}
|
|
|
|
|
|
template <typename func_t,
|
|
|
std::enable_if_t<std::is_void_v<typename function_traits<func_t>::result_type>>* = nullptr>
|
|
|
inline void
|
|
|
execute_op(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, int64_t n, func_t&& op) {
|
|
|
using traits = function_traits<func_t>;
|
|
|
for (; i < n; i++) {
|
|
|
c10::guts::apply(op, dereference<traits>(
|
|
|
&data[0],
|
|
|
&strides[0],
|
|
|
i));
|
|
|
}
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
template <typename func_t>
|
|
|
inline void
|
|
|
basic_loop(char* C10_RESTRICT data[], const int64_t* strides_, int64_t i, int64_t n, func_t&& op) {
|
|
|
using traits = function_traits<func_t>;
|
|
|
constexpr int ntensors = traits::arity + 1;
|
|
|
|
|
|
|
|
|
|
|
|
int64_t strides[ntensors];
|
|
|
for (const auto arg : c10::irange(ntensors)) {
|
|
|
strides[arg] = strides_[arg];
|
|
|
}
|
|
|
|
|
|
execute_op(data, strides, i, n, std::forward<func_t>(op));
|
|
|
}
|
|
|
|
|
|
|
|
|
template<class T, size_t N>
|
|
|
struct TupleOutput {
|
|
|
static void handle(char *C10_RESTRICT data[], const int64_t *strides, int64_t i,
|
|
|
const T &tuple) {
|
|
|
TupleOutput<T, N - 1>::handle(data, strides, i, tuple);
|
|
|
|
|
|
auto output = std::get<N - 1>(tuple);
|
|
|
using output_type = decltype(output);
|
|
|
output_type * out_ptr = (output_type *)(data[N - 1] + i * strides[N - 1]);
|
|
|
*out_ptr = output;
|
|
|
}
|
|
|
};
|
|
|
|
|
|
|
|
|
template<class T>
|
|
|
struct TupleOutput<T, 1> {
|
|
|
static void handle(char *C10_RESTRICT data[], const int64_t *strides, int64_t i,
|
|
|
const T &tuple) {
|
|
|
auto output = std::get<0>(tuple);
|
|
|
using output_type = decltype(output);
|
|
|
output_type* out_ptr = (output_type *)(data[0] + i * strides[0]);
|
|
|
*out_ptr = output;
|
|
|
}
|
|
|
};
|
|
|
|
|
|
template<class... Args>
|
|
|
void handle_tuple_outputs(char* C10_RESTRICT data[],
|
|
|
const int64_t* strides,
|
|
|
int64_t i,
|
|
|
const std::tuple<Args...> &tuple) {
|
|
|
TupleOutput<decltype(tuple), sizeof...(Args)>::handle(data, strides, i, tuple);
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Loop implementation for functors that return several values packed in a
// std::tuple. Operand layout: the first num_outputs entries of
// data/strides_ are outputs, followed by traits::arity inputs.
template <typename func_t>
inline void
multiple_outputs_loop(char* C10_RESTRICT data[], const int64_t* strides_, int64_t i, int64_t n, func_t&& op) {
  using traits = function_traits<func_t>;

  using result_type = typename traits::result_type;
  constexpr int num_outputs = std::tuple_size_v<result_type>;
  constexpr int ntensors = traits::arity + num_outputs;

  // Copy strides into a fixed-size local array so the element count is a
  // compile-time constant for the loops below.
  int64_t strides[ntensors];
  for (const auto arg : c10::irange(ntensors)) {
    strides[arg] = strides_[arg];
  }

  for (; i < n; i++) {
    // Inputs start right after the output slots.
    auto output = c10::guts::apply(op, dereference<traits>(
      &data[num_outputs],
      &strides[num_outputs],
      i));
    // Scatter each tuple element into its output operand.
    handle_tuple_outputs(data, strides, i, output);
  }
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Explicitly vectorized inner loop: one output plus traits::arity inputs,
// all densely packed, except that at most one input may instead be a
// broadcast scalar.
//
// `S` is a 1-based operand index: S > 0 means operand S is a scalar whose
// value is splatted into a vector (`opt_scalar`) once up front; S == 0
// means there is no scalar operand. `op` is the scalar functor, `vop` its
// vectorized counterpart; `n` is the element count.
template <typename func_t, typename vec_func_t>
inline void
vectorized_loop(char** C10_RESTRICT data_, int64_t n, int64_t S, func_t&& op, vec_func_t&& vop) {
  using traits = function_traits<vec_func_t>;
  // The element type comes from the scalar functor's return type.
  using scalar_t = typename function_traits<func_t>::result_type;
  using Vec = Vectorized<scalar_t>;
  constexpr int ntensors = traits::arity + 1;

  // Local fixed-size copy of the operand pointers.
  char* C10_RESTRICT data[ntensors];
  for (const auto arg : c10::irange(ntensors)) {
    data[arg] = data_[arg];
  }

  // Splat the scalar operand (if any) into a full vector once, outside the
  // loop; the dummy scalar_t(0) for S == 0 is never consumed.
  Vec opt_scalar = Vec(S > 0 ? c10::load((scalar_t*)data[S]) : scalar_t(0));
  int64_t i = 0;
  // Main loop, unrolled 2x: two vector widths per iteration.
  for (; i <= n - 2 * Vec::size(); i += 2 * Vec::size()) {
    auto args1 = dereference_vec<traits>(&data[1], opt_scalar, S, i);
    auto args2 = dereference_vec<traits>(&data[1], opt_scalar, S, i + Vec::size());
    auto out1 = c10::guts::apply(vop, std::move(args1));
    auto out2 = c10::guts::apply(vop, std::move(args2));
    out1.store(data[0] + i * sizeof(scalar_t));
    out2.store(data[0] + (i + Vec::size()) * sizeof(scalar_t));
  }
  // Scalar tail: fewer than 2 * Vec::size() elements remain. Fall back to
  // basic_loop with contiguous strides (and stride 0 for the scalar
  // operand S, if present).
  if (i < n) {
    int64_t strides[ntensors];
    for (const auto arg : c10::irange(ntensors)) {
      strides[arg] = (S > 0 && arg == S) ? 0 : sizeof(scalar_t);
    }
    basic_loop(data, strides, i, n, std::forward<func_t>(op));
  }
}
|
|
|
|
|
|
|
|
|
// Recursion terminator for the scalar-operand stride-pattern search below:
// no candidate index matched, so invoke the callback with the sentinel 0
// ("no contiguous-with-scalar layout found").
template <typename traits, typename cb_t>
inline void unroll_contiguous_scalar_checks(
    const int64_t* /*strides*/,
    std::index_sequence<>,
    cb_t&& cb) {
  cb(0);
}
|
|
|
|
|
|
// Tries each candidate input index in turn: if the stride pattern matches
// "everything contiguous except operand INDEX0 + 1, which is a broadcast
// scalar" (per is_contiguous_scalar in IsContiguous.h — confirm exact
// semantics there), calls cb with that 1-based operand index. Otherwise
// recurses on the remaining indices; the empty-pack overload finally calls
// cb(0) when nothing matches.
template <typename traits, typename cb_t, size_t INDEX0, size_t ...INDEX>
inline void unroll_contiguous_scalar_checks(
    const int64_t* strides,
    std::index_sequence<INDEX0, INDEX...>,
    cb_t&& cb) {
  if (is_contiguous_scalar<traits, INDEX0 + 1>(strides)) {
    cb(INDEX0 + 1);
  } else {
    unroll_contiguous_scalar_checks<traits>(strides, std::index_sequence<INDEX...>{}, std::forward<cb_t>(cb));
  }
}
|
|
|
|
|
|
// 2-D loop functor handed to TensorIterator's for_each/serial_for_each.
// Each inner row is dispatched either to the explicitly vectorized loop
// (when the inner strides are contiguous, possibly with one broadcast
// scalar input) or to the scalar basic_loop fallback.
template <typename op_t, typename vop_t>
struct VectorizedLoop2d {
  op_t op;    // scalar functor
  vop_t vop;  // vectorized functor

  using traits = function_traits<op_t>;
  static constexpr int ntensors = traits::arity + 1;  // one output + inputs
  using data_t = std::array<char*, ntensors>;

  VectorizedLoop2d(op_t op, vop_t vop):
    op(std::move(op)), vop(std::move(vop)) {}

  // Advance every operand pointer by its outer (second-dimension) stride.
  static void advance(data_t &data, const int64_t *outer_strides) {
    for (const auto arg : c10::irange(data.size())) {
      data[arg] += outer_strides[arg];
    }
  }

  // `strides` holds ntensors inner strides followed by ntensors outer
  // strides. size0 is the inner-loop extent; size1 the number of rows.
  void operator()(char** base, const int64_t *strides, int64_t size0, int64_t size1) {
    data_t data;
    std::copy_n(base, ntensors, data.data());
    const int64_t *outer_strides = &strides[ntensors];

    if (is_contiguous<traits>(strides)) {
      // All operands contiguous: vectorize with no scalar operand (S = 0).
      for ([[maybe_unused]] const auto i : c10::irange(size1)) {
        vectorized_loop(data.data(), size0, 0, op, vop);
        advance(data, outer_strides);
      }
    } else {
      // Look for a "contiguous except one scalar input" pattern. The
      // callback's idx is that operand's 1-based index, or 0 if no
      // pattern matched.
      using Indices = std::make_index_sequence<traits::arity>;
      unroll_contiguous_scalar_checks<traits>(strides, Indices{}, [&](size_t idx) {
        if (idx) {
          for ([[maybe_unused]] const auto i : c10::irange(size1)) {
            vectorized_loop(data.data(), size0, idx, op, vop);
            advance(data, outer_strides);
          }
        } else {
          // Arbitrary strides: scalar fallback row by row.
          for ([[maybe_unused]] const auto i : c10::irange(size1)) {
            basic_loop(data.data(), strides, 0, size0, op);
            advance(data, outer_strides);
          }
        }
      });
    }
  }
};
|
|
|
|
|
|
// Factory giving callers template-argument deduction for VectorizedLoop2d.
// NOTE(review): when called with lvalue functors, op_t/vop_t deduce to
// reference types, so the returned object stores references; the callers in
// this file use the result synchronously, but confirm lifetimes before
// storing it elsewhere.
template <typename op_t, typename vop_t>
VectorizedLoop2d<op_t, vop_t> make_vectorized_loop2d(
    op_t &&op, vop_t &&vop) {
  return VectorizedLoop2d<op_t, vop_t>(std::forward<op_t>(op), std::forward<vop_t>(vop));
}
|
|
|
|
|
|
// Applies `op` elementwise over the iterator, parallelized in chunks of at
// least `grain_size` elements. The functor must take iter.ninputs() scalar
// arguments and produce one output; operand dtypes must already match the
// functor's signature (no dynamic casting).
template <typename func_t>
void cpu_kernel(TensorIteratorBase& iter, func_t&& op, int64_t grain_size = at::internal::GRAIN_SIZE) {
  using traits = function_traits<func_t>;

  TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity);
  TORCH_INTERNAL_ASSERT(iter.noutputs() == 1);

  // Dynamic casting is not supported by this path.
  TORCH_INTERNAL_ASSERT(!needs_dynamic_casting<func_t>::check(iter));

  iter.for_each([&](char** data, const int64_t* strides, int64_t n) {
    // basic_loop handles 1-D slices with arbitrary strides (including 0 for
    // broadcast operands). `op` is intentionally passed as an lvalue rather
    // than forwarded: this lambda may be invoked many times.
    basic_loop(data, strides, 0, n, op);
  }, grain_size);
  iter.cast_outputs();
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <typename func_t>
|
|
|
void cpu_kernel_multiple_outputs(TensorIteratorBase& iter, func_t&& op, int64_t grain_size = at::internal::GRAIN_SIZE) {
|
|
|
using traits = function_traits<func_t>;
|
|
|
TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity);
|
|
|
|
|
|
iter.for_each([&](char** data, const int64_t* strides, int64_t n) {
|
|
|
multiple_outputs_loop(data, strides, 0, n, op);
|
|
|
}, grain_size);
|
|
|
iter.cast_outputs();
|
|
|
}
|
|
|
|
|
|
// Applies `op` elementwise like cpu_kernel, additionally using the
// vectorized functor `vop` on contiguous rows (see VectorizedLoop2d).
// Set check_dynamic_cast = false only when the caller has already
// established that no dynamic dtype casting is needed for this iterator.
template <bool check_dynamic_cast=true, typename func_t, typename vec_func_t>
void cpu_kernel_vec(TensorIteratorBase& iter, func_t&& op, vec_func_t&& vop, int64_t grain_size = at::internal::GRAIN_SIZE) {
  using traits = function_traits<func_t>;

  TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity);
  TORCH_INTERNAL_ASSERT(iter.noutputs() == 1);

  // Dynamic casting is not supported; the check is compile-time skippable.
  if constexpr (check_dynamic_cast) {
    TORCH_INTERNAL_ASSERT(!needs_dynamic_casting<func_t>::check(iter));
  }

  iter.for_each(make_vectorized_loop2d(std::forward<func_t>(op), std::forward<vec_func_t>(vop)), grain_size);
  iter.cast_outputs();
}
|
|
|
|
|
|
// Serial (single-threaded) variant of cpu_kernel restricted to `range`.
// Unlike cpu_kernel, `op` may return void, in which case the iterator must
// have zero outputs (the functor presumably acts through side effects —
// confirm against callers).
template <typename func_t>
void cpu_serial_kernel(TensorIteratorBase& iter, func_t&& op, const Range& range) {
  using traits = function_traits<func_t>;
  constexpr bool result_void = std::is_void_v<typename traits::result_type>;
  // void-returning ops pair with 0 outputs; value-returning ops with exactly 1.
  TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity &&
                        ((result_void && iter.noutputs() == 0) || (!result_void && iter.noutputs() == 1)));

  // Dynamic casting is not supported by this path.
  TORCH_INTERNAL_ASSERT(!needs_dynamic_casting<func_t>::check(iter));

  iter.serial_for_each([&](char** data, const int64_t* strides, int64_t n) {
    // `op` is passed as an lvalue: the lambda may run once per 1-D slice.
    basic_loop(data, strides, 0, n, op);
  }, range);
  iter.cast_outputs();
}
|
|
|
|
|
|
template <typename func_t>
|
|
|
void cpu_serial_kernel(TensorIteratorBase& iter, func_t&& op) {
|
|
|
cpu_serial_kernel(iter, std::forward<func_t>(op), {0, iter.numel()});
|
|
|
}
|
|
|
|
|
|
template <typename func_t, typename vec_func_t>
|
|
|
void cpu_serial_kernel_vec(TensorIteratorBase& iter, func_t&& op, vec_func_t&& vop, const Range& range) {
|
|
|
using traits = function_traits<func_t>;
|
|
|
|
|
|
TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity);
|
|
|
TORCH_INTERNAL_ASSERT(iter.noutputs() == 1);
|
|
|
|
|
|
TORCH_INTERNAL_ASSERT(!needs_dynamic_casting<func_t>::check(iter));
|
|
|
|
|
|
iter.serial_for_each(make_vectorized_loop2d(std::forward<func_t>(op), std::forward<vec_func_t>(vop)), range);
|
|
|
iter.cast_outputs();
|
|
|
}
|
|
|
|
|
|
template <typename func_t, typename vec_func_t>
|
|
|
void cpu_serial_kernel_vec(TensorIteratorBase& iter, func_t&& op, vec_func_t&& vop) {
|
|
|
cpu_serial_kernel_vec(iter, std::forward<func_t>(op), std::forward<vec_func_t>(vop), {0, iter.numel()});
|
|
|
}
|
|
|
|
|
|
}}
|
|
|
|