#pragma once

#include <ATen/Device.h>
#include <ATen/Dispatch.h>
#include <ATen/ScalarType.h>
#include <ATen/core/Tensor.h>
#include <ATen/native/utils/ParamsHash.h>
#include <c10/util/Exception.h>
#include <c10/util/irange.h>

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/result_type_native.h>
#endif

#include <algorithm>
#include <unordered_map>
#include <vector>

namespace at::native {
namespace {

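// Returns true if any tensor in `tensors` has an integral dtype;
// `includeBool` controls whether Bool counts as integral.
// `has_bool_tensor` is the Bool-only counterpart.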
inline bool has_integral_tensor(TensorList tensors, const bool includeBool) {
  return std::any_of(
      tensors.begin(), tensors.end(), [&includeBool](const auto& t) {
        return at::isIntegralType(t.scalar_type(), includeBool);
      });
}

inline bool has_bool_tensor(TensorList tensors) {
  return std::any_of(tensors.begin(), tensors.end(), [](const auto& t) -> bool {
    return t.scalar_type() == ScalarType::Bool;
  });
}

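// Foreach API restrictions, enforced by the overloads below:
// - every tensor list must be non-empty;
// - all tensor lists passed to one call must have the same length;
// - a scalar list, if present, must have one scalar per tensor.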
inline void check_foreach_api_restrictions(TensorList tensors) {
  TORCH_CHECK(!tensors.empty(), "Tensor list must have at least one tensor.");
}

inline void check_foreach_api_restrictions(
    TensorList tensors,
    ArrayRef<Scalar> scalars) {
  check_foreach_api_restrictions(tensors);
  TORCH_CHECK(
      tensors.size() == scalars.size(),
      "Tensor list must have same number of elements as scalar list.");
}

inline void check_foreach_api_restrictions(
    TensorList tensors1,
    TensorList tensors2) {
  TORCH_CHECK(!tensors1.empty(), "Tensor list must have at least one tensor.");
  TORCH_CHECK(!tensors2.empty(), "Tensor list must have at least one tensor.");
  TORCH_CHECK(
      tensors1.size() == tensors2.size(),
      "Tensor lists must have the same number of tensors, got ",
      tensors1.size(),
      " and ",
      tensors2.size());
}

inline void check_foreach_api_restrictions(
    TensorList tensors1,
    TensorList tensors2,
    TensorList tensors3) {
  TORCH_CHECK(!tensors1.empty(), "Tensor list must have at least one tensor.");
  TORCH_CHECK(!tensors2.empty(), "Tensor list must have at least one tensor.");
  TORCH_CHECK(!tensors3.empty(), "Tensor list must have at least one tensor.");
  TORCH_CHECK(
      tensors1.size() == tensors2.size(),
      "Tensor lists must have the same number of tensors, got ",
      tensors1.size(),
      " and ",
      tensors2.size());
  TORCH_CHECK(
      tensors1.size() == tensors3.size(),
      "Tensor lists must have the same number of tensors, got ",
      tensors1.size(),
      " and ",
      tensors3.size());
}

inline void check_foreach_api_restrictions(
    TensorList tensors1,
    TensorList tensors2,
    TensorList tensors3,
    ArrayRef<Scalar> scalars) {
  check_foreach_api_restrictions(tensors1, tensors2, tensors3);
  TORCH_CHECK(
      tensors1.size() == scalars.size(),
      "Tensor list must have same number of elements as scalar list, got ",
      tensors1.size(),
      " and ",
      scalars.size());
}

inline void check_foreach_api_restrictions(
    TensorList tensors1,
    TensorList tensors2,
    ArrayRef<Scalar> scalars) {
  check_foreach_api_restrictions(tensors1, tensors2);
  TORCH_CHECK(
      tensors1.size() == scalars.size(),
      "Tensor list must have same number of elements as scalar list, got ",
      tensors1.size(),
      " and ",
      scalars.size());
}

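// Checks that every tensor in every list lives on the same device as the
// first tensor, has the same dtype (unless `skip_dtype_check` is set), uses
// the strided layout, and is non-overlapping and dense.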
inline bool _check_tensors_share_device_and_dtype(
    ArrayRef<TensorList> tensorLists,
    const bool skip_dtype_check = false) {
  const auto expected_dtype = tensorLists[0][0].dtype();
  const auto expected_device = tensorLists[0][0].device();

  auto is_tensor_okay = [&](const Tensor& tensor) {
    return (skip_dtype_check || tensor.dtype() == expected_dtype) &&
        tensor.device() == expected_device && tensor.layout() == at::kStrided &&
        tensor.is_non_overlapping_and_dense();
  };

  for (const auto& tensorList : tensorLists) {
    for (const auto& tensor : tensorList) {
      if (!is_tensor_okay(tensor)) {
        return false;
      }
    }
  }

  return true;
}

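// Checks that corresponding tensors across the lists share sizes and,
// ignoring dimensions of size 1, strides.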
inline bool _check_tensors_share_sizes_and_strides(
    ArrayRef<TensorList> tensorLists) {
  auto is_diff_stride = [](const IntArrayRef& size,
                           const IntArrayRef& left_stride,
                           const IntArrayRef& right_stride) -> bool {
    const size_t size_size = size.size();
    for (const auto dim : c10::irange(size_size)) {
      if (size[dim] == 1)
        continue;
      if (left_stride[dim] != right_stride[dim]) {
        return true;
      }
    }
    return false;
  };
  for (const auto i : c10::irange(1, tensorLists.size())) {
    for (const auto j : c10::irange(tensorLists[0].size())) {
      if (tensorLists[0][j].sizes() != tensorLists[i][j].sizes() ||
          is_diff_stride(
              tensorLists[0][j].sizes(),
              tensorLists[0][j].strides(),
              tensorLists[i][j].strides())) {
        return false;
      }
    }
  }

  return true;
}

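// Checks that applying the op would not change each tensor's dtype: ops that
// promote integer inputs to float disqualify integral (and Bool) tensors, and
// scalar operands must not promote the tensor's dtype either.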
inline bool _check_tensors_do_type_promotion_with_scalars(
    TensorList tensorList,
    ArrayRef<Scalar> scalarList = {},
    bool does_op_promote_integer_inputs_to_float = false) {
  for (const auto i : c10::irange(tensorList.size())) {
    // If the op promotes integer inputs to float, the fast path cannot be
    // taken when any input tensor is integral (including Bool).
    if (does_op_promote_integer_inputs_to_float) {
      if (at::isIntegralType(
              tensorList[i].scalar_type(), /*includeBool=*/true)) {
        return false;
      }
    }
    if (!scalarList.empty()) {
      const auto& scalar =
          scalarList.size() == 1 ? scalarList[0] : scalarList[i];
      const auto& tensor = tensorList[i];
      // Binary ops with a scalar follow type promotion; the fast path is only
      // valid if promotion would not change the tensor's dtype.
      if (tensor.scalar_type() != at::native::result_type(scalar, tensor)) {
        return false;
      }
    }
  }

  return true;
}

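// A foreach op can take the fused "fast path" only when, per the helpers
// above, all tensors share a device and dtype, corresponding tensors share
// sizes and strides, and no type promotion against the scalars would occur.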
inline bool check_fast_path_restrictions(
    ArrayRef<TensorList> tensorLists,
    ArrayRef<Scalar> scalarList = {},
    bool does_op_promote_integer_inputs_to_float = false) {
  return _check_tensors_share_device_and_dtype(tensorLists) &&
      _check_tensors_share_sizes_and_strides(tensorLists) &&
      _check_tensors_do_type_promotion_with_scalars(
          tensorLists[0],
          scalarList,
          does_op_promote_integer_inputs_to_float);
}

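// Unpacks a 1-D CPU tensor of `expect_length` elements into a
// std::vector<Scalar>, validating device, contiguity, dimensionality, and
// length along the way.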
inline std::vector<c10::Scalar> convert_tensor_to_scalar_list(
    const Tensor& scalarList_,
    int64_t expect_length) {
  std::vector<c10::Scalar> scalarList;
  TORCH_CHECK(
      scalarList_.device() == c10::kCPU,
      "Expected scalars to be on CPU, got ",
      scalarList_.device(),
      " instead.");
  TORCH_CHECK(
      scalarList_.is_contiguous(), "Expected scalars to be contiguous.");
  TORCH_CHECK(
      scalarList_.dim() == 1,
      "Expected packed scalar Tensor to be of dimension 1. Got ",
      scalarList_.dim(),
      " instead.");
  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(
      kComplexHalf,
      kHalf,
      kBool,
      kBFloat16,
      scalarList_.scalar_type(),
      "convert_tensor_to_scalar_list",
      [&]() {
        const scalar_t* scalar_data = scalarList_.const_data_ptr<scalar_t>();
        TORCH_CHECK(
            (expect_length == scalarList_.size(0)),
            "Expected length of scalars to match input of length ",
            expect_length,
            " but got ",
            scalarList_.size(0),
            " instead.");
        for (int64_t i = 0; i < scalarList_.size(0); i++) {
          scalarList.emplace_back(scalar_data[i]);
        }
      });
  return scalarList;
}

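// `can_use_fast_route` is the entry point callers use to decide between the
// fused multi-tensor kernel and a per-tensor fallback. A minimal sketch of
// the typical call pattern is shown below; the kernel names are illustrative
// placeholders, not functions declared in this header:
//
//   std::vector<Tensor> foreach_add_example(TensorList self, TensorList other) {
//     check_foreach_api_restrictions(self, other);
//     if (can_use_fast_route(self, other)) {
//       return foreach_add_fast(self, other); // hypothetical fused kernel
//     }
//     return foreach_add_slow(self, other); // hypothetical per-tensor loop
//   }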
inline bool can_use_fast_route(
    ArrayRef<TensorList> tensorLists,
    ArrayRef<Scalar> scalarList = {},
    bool does_op_promote_integer_inputs_to_float = false) {
  return check_fast_path_restrictions(
      tensorLists, scalarList, does_op_promote_integer_inputs_to_float);
}

inline bool can_use_fast_route(
    TensorList tensors1,
    TensorList tensors2,
    bool does_op_promote_integer_inputs_to_float = false) {
  return can_use_fast_route(
      {tensors1, tensors2}, {}, does_op_promote_integer_inputs_to_float);
}

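// Types used to bucket tensors by (device, dtype): each FlatMap entry maps a
// DeviceDtypeKey to the grouped nested tensor lists plus, optionally, the
// original indices of the grouped tensors.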
using DeviceDtypeKey = std::pair<at::Device, at::ScalarType>;
using IndicesT = std::vector<size_t>;
using nested_optional_tensorvec_t =
    std::vector<std::vector<std::optional<at::Tensor>>>;
using TensorsAndIndicesT = std::pair<nested_optional_tensorvec_t, IndicesT>;
using FlatMap = std::unordered_map<
    DeviceDtypeKey,
    TensorsAndIndicesT,
    ParamsHash<DeviceDtypeKey>>;

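// Groups the tensors of `nested_tensorlist` by the device and dtype of the
// corresponding tensor in the first (reference) list. When `with_indices` is
// true, the original position of each grouped tensor is recorded so callers
// can map results back. This grouping is a likely fit for fused optimizers
// that issue one kernel launch per (device, dtype) bucket, though that usage
// is not shown in this header.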
inline FlatMap _group_tensors_by_first_tensors_device_and_dtype(
    const nested_optional_tensorvec_t& nested_tensorlist,
    const bool with_indices) {
  FlatMap grouped_tensors_with_indices;

  TORCH_CHECK(!nested_tensorlist.empty());
  TORCH_CHECK(!nested_tensorlist[0].empty());
  const auto num_lists = nested_tensorlist.size();
  const auto num_tensors = nested_tensorlist[0].size();

  // Every nested list must either be empty or have the same length as the
  // first one.
  TORCH_CHECK(std::all_of(
      nested_tensorlist.cbegin(),
      nested_tensorlist.cend(),
      [&](const auto& tensorlist) -> bool {
        return tensorlist.size() == num_tensors || tensorlist.size() == 0;
      }));

  for (const auto& tensor_index : c10::irange(num_tensors)) {
    const auto key = [&]() -> DeviceDtypeKey {
      const auto t = nested_tensorlist[0][tensor_index];
      TORCH_CHECK(
          t.has_value(),
          "Tensors of the first list of nested Tensor lists are supposed to be defined but ",
          "the ",
          tensor_index,
          "-th Tensor is not.");
      return {t->device(), t->scalar_type()};
    }();
    TORCH_CHECK(
        std::all_of(
            nested_tensorlist.cbegin(),
            nested_tensorlist.cend(),
            [&](const auto& tensorlist) -> bool {
              if (tensorlist.size() == 0) {
                return true;
              }
              const auto& tensor = tensorlist[tensor_index];
              // Undefined (nullopt) entries are allowed; they do not
              // constrain the group key.
              if (!tensor.has_value()) {
                return true;
              } else {
                const auto s = tensor->scalar_type();
                const auto d = tensor->device();
                if (key.first == d) {
                  return key.second == s || s == at::ScalarType::Float ||
                      s == at::ScalarType::Double;
                } else if (d.is_cpu()) {
                  // CPU float32/float64 tensors (e.g. `step` counters) are
                  // tolerated even when the group lives on another device.
                  return s == at::ScalarType::Float ||
                      s == at::ScalarType::Double;
                } else {
                  return false;
                }
              }
            }),
        "Tensors of the same index must be on the same device and have the same dtype, ",
        "except for `step` tensors, which may be on the CPU with dtype float32 or float64.");
    if (!grouped_tensors_with_indices.count(key)) {
      grouped_tensors_with_indices.insert(
          {key,
           TensorsAndIndicesT{
               [&]() -> nested_optional_tensorvec_t {
                 nested_optional_tensorvec_t nested_tensorvec;
                 nested_tensorvec.reserve(num_lists);
                 for (const auto& i : c10::irange(num_lists)) {
                   std::vector<std::optional<at::Tensor>> tensors;
                   if (!nested_tensorlist[i].empty()) {
                     // Reserve only for non-empty lists; empty lists stay
                     // empty in the grouped result as well.
                     tensors.reserve(num_tensors);
                   }
                   nested_tensorvec.emplace_back(tensors);
                 }
                 return nested_tensorvec;
               }(),
               [&]() -> IndicesT {
                 if (!with_indices) {
                   return {};
                 } else {
                   IndicesT indices;
                   indices.reserve(num_tensors);
                   return indices;
                 }
               }()}});
    }
    for (const auto& list_index : c10::irange(num_lists)) {
      if (!nested_tensorlist[list_index].empty()) {
        grouped_tensors_with_indices[key].first[list_index].emplace_back(
            nested_tensorlist[list_index][tensor_index]);
      }
    }
    if (with_indices) {
      grouped_tensors_with_indices[key].second.emplace_back(tensor_index);
    }
  }

  return grouped_tensors_with_indices;
}

} // namespace
} // namespace at::native