|
|
|
|
|
#pragma once |
|
|
|
|
|
#include <ATen/core/Tensor.h> |
|
|
|
|
|
#include <ATen/MemoryOverlap.h> |
|
|
#include <ATen/Parallel.h> |
|
|
#include <ATen/TensorIterator.h> |
|
|
#include <ATen/cpu/vec/functional.h> |
|
|
#include <ATen/cpu/vec/vec.h> |
|
|
#include <c10/util/irange.h> |
|
|
|
|
|
namespace at { namespace native { namespace detail { |
|
|
|
|
|
struct InputMeta { |
|
|
void* data_ptr; |
|
|
int64_t inner_size; |
|
|
|
|
|
InputMeta(const Tensor& t, int64_t dim, int64_t inner) |
|
|
: data_ptr(t.data_ptr()), inner_size(t.sizes()[dim] * inner) {} |
|
|
}; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <typename scalar_t, typename TensorListType> |
|
|
void stack_serial_kernel_impl(Tensor& result, TensorListType tensors, int64_t dim) { |
|
|
TORCH_INTERNAL_ASSERT_DEBUG_ONLY( |
|
|
dim >= 0 && dim <= result.dim(), |
|
|
"dim out of range in stack_serial_kernel_impl"); |
|
|
int64_t outer = |
|
|
result.numel() / (result.sizes()[dim] * result.strides()[dim]); |
|
|
scalar_t* result_data = result.data_ptr<scalar_t>(); |
|
|
int64_t ninputs = tensors.size(); |
|
|
std::vector<InputMeta> inputs; |
|
|
inputs.reserve(ninputs); |
|
|
for (const auto& tensor : tensors) { |
|
|
inputs.emplace_back(tensor, dim, tensor.strides()[dim]); |
|
|
} |
|
|
|
|
|
using Vec = vec::Vectorized<scalar_t>; |
|
|
scalar_t* result_ptr = result_data; |
|
|
for (const auto i : c10::irange(outer)) { |
|
|
for (const auto j : c10::irange(ninputs)) { |
|
|
int64_t local_inner = inputs[j].inner_size; |
|
|
scalar_t* input_ptr = (scalar_t*)(inputs[j].data_ptr) + i * local_inner; |
|
|
|
|
|
if (local_inner < Vec::size()) { |
|
|
for (const auto k : c10::irange(local_inner)) { |
|
|
result_ptr[k] = input_ptr[k]; |
|
|
} |
|
|
} else { |
|
|
vec::map( |
|
|
[](Vec x) { return x; }, result_ptr, input_ptr, local_inner); |
|
|
} |
|
|
result_ptr += local_inner; |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <typename TensorListType> |
|
|
bool can_use_native_serial_stack_impl(Tensor& result, TensorListType tensors, int64_t dim) { |
|
|
TORCH_CHECK(tensors.size() > 0, "expected a non-empty list of Tensors"); |
|
|
const Tensor& first_tensor = tensors[0]; |
|
|
|
|
|
|
|
|
|
|
|
if (dim >= first_tensor.dim()) return false; |
|
|
|
|
|
if (first_tensor.numel() == 0 && first_tensor.dim() == 1) return false; |
|
|
|
|
|
if (result.dtype() != first_tensor.dtype()) return false; |
|
|
|
|
|
auto first_tensor_mem_format = first_tensor.suggest_memory_format(); |
|
|
ScalarType dtype = first_tensor.scalar_type(); |
|
|
|
|
|
if (!result.is_contiguous(first_tensor_mem_format)) { |
|
|
return false; |
|
|
} |
|
|
|
|
|
|
|
|
if (dtype != ScalarType::Double && dtype != ScalarType::Float) { |
|
|
return false; |
|
|
} |
|
|
|
|
|
|
|
|
auto const &first_tensor_shape = first_tensor.sizes(); |
|
|
for (const auto i : c10::irange(1, tensors.size())) { |
|
|
auto const &tensor = tensors[i]; |
|
|
TORCH_CHECK(tensors[i].sizes() == first_tensor.sizes(), |
|
|
"stack expects each tensor to be equal size, but got ", first_tensor_shape, |
|
|
" at entry 0 and ", tensor.sizes(), " at entry ", i); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (!tensor.is_contiguous(first_tensor_mem_format) || |
|
|
tensor.strides() != first_tensor.strides() || |
|
|
tensor.dtype() != dtype) { |
|
|
return false; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int64_t numel_in_stack = first_tensor.numel() * tensors.size(); |
|
|
return numel_in_stack < at::internal::GRAIN_SIZE || at::get_num_threads() == 1; |
|
|
} |
|
|
|
|
|
template <typename TensorListType, bool should_skip_overlap_check> |
|
|
struct CanUseNativeSerialStack; |
|
|
|
|
|
template <typename TensorListType> |
|
|
struct CanUseNativeSerialStack<TensorListType, false> { |
|
|
static bool call(Tensor& result, TensorListType tensors, int64_t dim) { |
|
|
|
|
|
for (const auto i : c10::irange(tensors.size())) { |
|
|
auto lap = at::get_overlap_status(result, tensors[i]); |
|
|
TORCH_CHECK(lap != at::MemOverlapStatus::Partial && |
|
|
lap != at::MemOverlapStatus::Full, 0, |
|
|
"unsupported operation: the input tensors cannot refer to any of the " |
|
|
"output memory locations. Found overlap in input tensor ", i); |
|
|
} |
|
|
|
|
|
return can_use_native_serial_stack_impl(result, tensors, dim); |
|
|
} |
|
|
}; |
|
|
|
|
|
template <typename TensorListType> |
|
|
struct CanUseNativeSerialStack<TensorListType, true> { |
|
|
static bool call(Tensor& result, TensorListType tensors, int64_t dim) { |
|
|
return can_use_native_serial_stack_impl(result, tensors, dim); |
|
|
} |
|
|
}; |
|
|
|
|
|
}}} |
|
|
|