diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/AmpKernels.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/AmpKernels.h new file mode 100644 index 0000000000000000000000000000000000000000..c463c80e1c6dcfff66bde315d59bf7fcb73e9860 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/AmpKernels.h @@ -0,0 +1,28 @@ +#pragma once + +#include +#include + +namespace at { +class Tensor; + +namespace native { + +using _amp_foreach_non_finite_check_and_unscale_cpu__fn = void (*)( + TensorList, + Tensor&, + const Tensor&); + +using _amp_update_scale_cpu__fn = Tensor& (*)( + Tensor&, + Tensor&, + const Tensor&, + double, + double, + int64_t); + +DECLARE_DISPATCH(_amp_foreach_non_finite_check_and_unscale_cpu__fn, _amp_foreach_non_finite_check_and_unscale_cpu_stub); +DECLARE_DISPATCH(_amp_update_scale_cpu__fn, _amp_update_scale_cpu_stub); + +} // namespace native +} // namespace at diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/BucketizationUtils.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/BucketizationUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..90747c264b1564d072d8f57fdd38b2ef743ea62c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/BucketizationUtils.h @@ -0,0 +1,173 @@ +#pragma once + +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + +namespace at::native { + +// original values given by raw_*. If an original value is not contiguous, will make a contiguous copy to +// the corresponding trimmed_* value. Additionally, if the dtypes of the boundary and input tensor do not +// match, will change them to be a common super type so comparisons are done between the same types. +// For any trimmed_* tensor, if its outgoing value matches what it was incoming (typically null), then the +// corresponding raw_* version should be used since it was already contiguous of the right type. +inline void searchsorted_maybe_trim_input_tensors( + Tensor& trimmed_input, + Tensor& trimmed_boundaries, + Tensor& trimmed_sorter, + const Tensor& raw_input, + const Tensor& raw_boundaries, + const Tensor& raw_sorter) { + bool in_is_contiguous = raw_input.is_contiguous(); + bool bd_is_contiguous = raw_boundaries.is_contiguous(); + bool sort_is_contiguous = raw_sorter.is_contiguous(); + + if (!in_is_contiguous) { + TORCH_WARN_ONCE("torch.searchsorted(): input value tensor is non-contiguous, this will lower the performance due " + "to extra data copy when converting non-contiguous tensor to contiguous, please use contiguous input value " + "tensor if possible. This message will only appear once per program."); + trimmed_input = raw_input.contiguous(); + } + if (!bd_is_contiguous) { + TORCH_WARN_ONCE("torch.searchsorted(): boundary tensor is non-contiguous, this will lower the performance due " + "to extra data copy when converting non-contiguous tensor to contiguous, please use contiguous boundary " + "tensor if possible. This message will only appear once per program."); + trimmed_boundaries = raw_boundaries.contiguous(); + } + if (!sort_is_contiguous) { + TORCH_WARN_ONCE("torch.searchsorted(): sorter tensor is non-contiguous, this will lower the performance due " + "to extra data copy when converting non-contiguous tensor to contiguous, please use contiguous sorter " + "tensor if possible. This message will only appear once per program."); + trimmed_sorter = raw_sorter.contiguous(); + } + if (raw_input.dtype() != raw_boundaries.dtype()) { + at::native::ResultTypeState state = {}; + state = at::native::update_result_type_state(raw_boundaries, state); + state = at::native::update_result_type_state(raw_input, state); + ScalarType common_stype = at::native::result_type(state); + + TORCH_INTERNAL_ASSERT(common_stype != ScalarType::Undefined); + if (common_stype != raw_input.scalar_type()) { + trimmed_input = in_is_contiguous ? raw_input.to(common_stype) : trimmed_input.to(common_stype); + } + if (common_stype != raw_boundaries.scalar_type()) { + trimmed_boundaries = bd_is_contiguous ? raw_boundaries.to(common_stype) : trimmed_boundaries.to(common_stype); + } + } +} + +/* unused but needed for internal jagged tensor class */ +inline void searchsorted_maybe_trim_input_tensors( + Tensor& trimmed_input, + Tensor& trimmed_boundaries, + const Tensor& raw_input, + const Tensor& raw_boundaries) { + Tensor trimmed_sorter; + Tensor raw_sorter; + return searchsorted_maybe_trim_input_tensors( + trimmed_input, + trimmed_boundaries, + trimmed_sorter, + raw_input, + raw_boundaries, + raw_sorter); +} + +inline bool searchsorted_dims_matched_before_last_dim(const Tensor& boundaries, const Tensor& input) { + if (boundaries.dim() != input.dim()) { + return false; + } + const auto& dims_bd = boundaries.sizes(); + const auto& dims_in = input.sizes(); + for (int64_t dim = 0; dim + 1 < boundaries.dim(); ++dim) { + if (dims_bd[dim] != dims_in[dim]) { + return false; + } + } + return true; +} + +inline Tensor searchsorted_scalar_tensor(const Scalar& scalar, const c10::Device& device) { + auto tensor = c10::scalar_to_tensor(scalar, device); + // This is to adopt the scalar promotion rules defined in native/TypeProperties.h + // So we have the same type promotion rules as binary operations. + tensor.unsafeGetTensorImpl()->set_wrapped_number(true); + return tensor; +} + +inline void searchsorted_pre_check( + const Tensor& boundaries, + const Tensor& input, + const Tensor& output, + const bool out_int32, + const bool right, + const std::optional side_opt, + const Tensor& sorter) { + if (side_opt) { + const c10::string_view side = *side_opt; + TORCH_CHECK(side == "left" || side == "right", "torch.searchsorted(): side can only be 'left' or 'right' but ", + "got ", side); + + // assume the user has not explicitly set (right=False, side="right") + TORCH_CHECK(!right || side == "right", "torch.searchsorted(): side and right can't be set to opposites, got side " + "of ", side, " while right was True"); + } + + TORCH_CHECK(boundaries.device() == input.device(), "torch.searchsorted(): boundaries and input value tensors ", + "should have same device type, but got boundaries tensor device type ", boundaries.device(), " and input value ", + "tensor device type ", input.device()); + + if (sorter.defined()) { + TORCH_CHECK(sorter.device() == boundaries.device(), "torch.searchsorted(): sorter and boundary tensors should ", + "have same device type, but got sorter tensor device type ", sorter.device(), " and input value tensor ", + "device type ", boundaries.device()); + + TORCH_CHECK(sorter.sizes() == boundaries.sizes(), "torch.searchsorted(): boundary and sorter must have the same " + "size, but got boundary tensor ", boundaries.sizes(), "and got sorter tensor ", sorter.sizes()); + + TORCH_CHECK(sorter.scalar_type() == ScalarType::Long, "torch.searchsorted(): sorter must be a tensor of long ", + "dtype but got dtype ", sorter.scalar_type()); + + if (sorter.numel() > 0) { + auto minmax = sorter.aminmax(); + int64_t vmin = std::get<0>(minmax).item().toLong(); + int64_t vmax = std::get<1>(minmax).item().toLong(); + TORCH_CHECK(vmin >= 0 && vmax < sorter.sizes().back(), "torch.searchsorted(): sorter index out of range"); + } + } + + TORCH_CHECK(input.dim() > 0 || (input.dim() == 0 && input.numel() == 1 && boundaries.dim() == 1), + "torch.searchsorted(): input value can be a scalar only when boundaries tensor dimension is 1, but we got ", + "boundaries tensor dim(", boundaries.dim(), ") and input value's dim(", input.dim(), ") numel(", + input.numel(), ")"); + + TORCH_CHECK(boundaries.dim() != 0, "torch.searchsorted(): boundaries tensor should have positive dimension, but ", + "got 0 dimension"); + + TORCH_CHECK(boundaries.dim() == 1 || searchsorted_dims_matched_before_last_dim(boundaries, input), + "torch.searchsorted(): boundaries tensor should be 1 dimension or the first N-1 dimensions of boundaries tensor ", + "and input value tensor must match, but we got boundaries tensor ", boundaries.sizes(), " and input value tensor ", + input.sizes()); + + ScalarType output_dtype = output.scalar_type(); + TORCH_CHECK( + (output_dtype == ScalarType::Long && !out_int32) || + (output_dtype == ScalarType::Int && out_int32), + "torch.searchsorted(): output tensor's dtype is wrong, it can only be Int(int32) or Long(int64) depending on ", + "whether out_int32 flag is True, but we got output tensor's dtype ", output_dtype, + " and out_int32 flag is ", (out_int32 ? "True" : "False")); + + if (out_int32) { + TORCH_CHECK(boundaries.sizes().back() < INT_MAX, + "torch.searchsorted(): the size of boundaries' last dimension should be less than ", INT_MAX, ", but we got ", + boundaries.sizes().back()); + } +} + +} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/CPUBlas.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/CPUBlas.h new file mode 100644 index 0000000000000000000000000000000000000000..ad209329c95eccf1a5203eae53132ffc7178322e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/CPUBlas.h @@ -0,0 +1,226 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + + +namespace at::native::cpublas { + +namespace internal { +void normalize_last_dims( + TransposeType transa, TransposeType transb, + int64_t m, int64_t n, int64_t k, + int64_t *lda, int64_t *ldb, int64_t *ldc); +} // namespace internal + +using gemm_fn = void(*)( + at::ScalarType type, + TransposeType transa, TransposeType transb, + int64_t m, int64_t n, int64_t k, + const Scalar& alpha, + const void *a, int64_t lda, + const void *b, int64_t ldb, + const Scalar& beta, + void *c, int64_t ldc); + +DECLARE_DISPATCH(gemm_fn, gemm_stub); + +template +void gemm( + TransposeType transa, TransposeType transb, + int64_t m, int64_t n, int64_t k, + at::opmath_type alpha, + const scalar_t *a, int64_t lda, + const scalar_t *b, int64_t ldb, + at::opmath_type beta, + scalar_t *c, int64_t ldc) { + internal::normalize_last_dims(transa, transb, m, n, k, &lda, &ldb, &ldc); + gemm_stub( + kCPU, c10::CppTypeToScalarType::value, + transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); +} + +void gemm( + TransposeType transa, TransposeType transb, + int64_t m, int64_t n, int64_t k, + double alpha, + const double *a, int64_t lda, + const double *b, int64_t ldb, + double beta, + double *c, int64_t ldc); + +void gemm( + TransposeType transa, TransposeType transb, + int64_t m, int64_t n, int64_t k, + float alpha, + const float *a, int64_t lda, + const float *b, int64_t ldb, + float beta, + float *c, int64_t ldc); + +void gemm( + TransposeType transa, TransposeType transb, + int64_t m, int64_t n, int64_t k, + float alpha, + const at::BFloat16 *a, int64_t lda, + const at::BFloat16 *b, int64_t ldb, + float beta, + at::BFloat16 *c, int64_t ldc); + +void gemm( + TransposeType transa, TransposeType transb, + int64_t m, int64_t n, int64_t k, + const float alpha, + const at::BFloat16 *a, int64_t lda, + const at::BFloat16 *b, int64_t ldb, + const float beta, + float *c, int64_t ldc); + +void gemm( + TransposeType transa, TransposeType transb, + int64_t m, int64_t n, int64_t k, + float alpha, + const at::Half *a, int64_t lda, + const at::Half *b, int64_t ldb, + float beta, + at::Half *c, int64_t ldc); + +void gemm( + TransposeType transa, TransposeType transb, + int64_t m, int64_t n, int64_t k, + const float alpha, + const at::Half *a, int64_t lda, + const at::Half *b, int64_t ldb, + const float beta, + float *c, int64_t ldc); + +void gemm( + TransposeType transa, TransposeType transb, + int64_t m, int64_t n, int64_t k, + c10::complex alpha, + const c10::complex *a, int64_t lda, + const c10::complex *b, int64_t ldb, + c10::complex beta, + c10::complex *c, int64_t ldc); + +void gemm( + TransposeType transa, TransposeType transb, + int64_t m, int64_t n, int64_t k, + c10::complex alpha, + const c10::complex *a, int64_t lda, + const c10::complex *b, int64_t ldb, + c10::complex beta, + c10::complex *c, int64_t ldc); + +void gemm( + TransposeType transa, TransposeType transb, + int64_t m, int64_t n, int64_t k, + int64_t alpha, + const int64_t *a, int64_t lda, + const int64_t *b, int64_t ldb, + int64_t beta, + int64_t *c, int64_t ldc); + +template +void gemm_batched( + TransposeType transa, TransposeType transb, + int64_t batch_size, int64_t m, int64_t n, int64_t k, + scalar_t alpha, + const scalar_t * const *a, int64_t lda, + const scalar_t * const *b, int64_t ldb, + const scalar_t beta, + scalar_t * const *c, int64_t ldc); + +template +void gemm_batched_with_stride( + TransposeType transa, TransposeType transb, + int64_t batch_size, int64_t m, int64_t n, int64_t k, + scalar_t alpha, + const scalar_t *a, int64_t lda, int64_t batch_stride_a, + const scalar_t *b, int64_t ldb, int64_t batch_stride_b, + scalar_t beta, + scalar_t *c, int64_t ldc, int64_t batch_stride_c); + +using axpy_fn = void(*)(at::ScalarType type, int64_t n, const Scalar& a, const void *x, int64_t incx, void *y, int64_t incy); + +DECLARE_DISPATCH(axpy_fn, axpy_stub); + +template +void axpy(int64_t n, scalar_t a, const scalar_t *x, int64_t incx, scalar_t *y, int64_t incy){ + if(n == 1) + { + incx = 1; + incy = 1; + } + axpy_stub( + kCPU, c10::CppTypeToScalarType::value, + n, a, x, incx, y, incy); +} + +void axpy(int64_t n, double a, const double *x, int64_t incx, double *y, int64_t incy); +void axpy(int64_t n, float a, const float *x, int64_t incx, float *y, int64_t incy); +void axpy(int64_t n, c10::complex a, const c10::complex *x, int64_t incx, c10::complex *y, int64_t incy); +void axpy(int64_t n, c10::complex a, const c10::complex *x, int64_t incx, c10::complex *y, int64_t incy); + +using copy_fn = void(*)(at::ScalarType type, int64_t n, const void *x, int64_t incx, void *y, int64_t incy); + +DECLARE_DISPATCH(copy_fn, copy_stub); + +template +void copy(int64_t n, const scalar_t *x, int64_t incx, scalar_t *y, int64_t incy) { + if(n == 1) + { + incx = 1; + incy = 1; + } + copy_stub( + kCPU, c10::CppTypeToScalarType::value, + n, x, incx, y, incy); +} + +void copy(int64_t n, const double *x, int64_t incx, double *y, int64_t incy); +void copy(int64_t n, const float *x, int64_t incx, float *y, int64_t incy); +void copy(int64_t n, const c10::complex *x, int64_t incx, c10::complex *y, int64_t incy); +void copy(int64_t n, const c10::complex *x, int64_t incx, c10::complex *y, int64_t incy); + +// Batch-reduce GEMM +// Operates by the following formula: +// C = alpha * SUM(A[i] x B[i]) + beta * C, i = 0 to batch size +// A Base pointer to a tensor A. +// B Base pointer to a tensor B. +// C Pointer to a tensor C (accumulation buffer). +TORCH_API void brgemm( + int64_t M, + int64_t N, + int64_t K, + int64_t ld_a, + int64_t ld_b, + int64_t ld_c, + const float alpha, + const float beta, + const at::Half* A, + const at::Half* B, + float* C); + +// Release brgemm hardware context +void brgemm_release(); + +// Pack B matrix to get better performance if needed +void pack( + int64_t K, + int64_t N, + int64_t ld_in, + int64_t ld_out, + ScalarType dt_in, + ScalarType dt_out, + const void* in, + void* out); + +// Whether pack is needed in the platform. +bool need_pack(ScalarType dt_in); + +} // namespace at::native::cpublas diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/CPUFallback.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/CPUFallback.h new file mode 100644 index 0000000000000000000000000000000000000000..44cb534b8db2c9b063bef6b6e2c24f16db0ce454 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/CPUFallback.h @@ -0,0 +1,46 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace at::native { + +// This function implements a boxed fallback to CPU. +// External backends can add their own custom logging on top if it to customize their own CPU fallbacks. +TORCH_API void cpu_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack, bool error_on_views = false, + c10::DispatchKey cpu_dispatch_key = c10::DispatchKey::CPU); + +// This is a helper function that backends can use to directly call their boxed CPU fallback +// TODO: update and add a usage example after https://github.com/pytorch/pytorch/pull/58092 lands. +template +struct _call_fallback_fn final {}; + +template +struct _call_fallback_fn final { + static ReturnType call(typename c10::maybe_keep_symint::type... args) { + auto op = c10::Dispatcher::singleton() + // TODO: figure out how to make compiler happy without dynamic casts + .findSchemaOrThrow((const char*) Op::name, (const char*) Op::overload_name) + //.findSchemaOrThrow("a", "b") + .typed::type...)>(); + return c10::impl::BoxedKernelWrapper::type...)>::call( + c10::BoxedKernel::makeFromFunction(), + op, + c10::DispatchKeySet(), // we know that the cpu_fallback doesn't use the dispatch keyset. + // TODO: get std::forward<> to work + args... + ); + } +}; + +template +using call_fallback_fn_symint = _call_fallback_fn; + +template +using call_fallback_fn = _call_fallback_fn; + +} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/CanUse32BitIndexMath.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/CanUse32BitIndexMath.h new file mode 100644 index 0000000000000000000000000000000000000000..db9742e04021e6fa6942c540c28f4ca6ff90d5df --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/CanUse32BitIndexMath.h @@ -0,0 +1,13 @@ +#pragma once +#include +#include + +namespace at { +class TensorBase; +} + +namespace at::native { + +TORCH_API bool canUse32BitIndexMath(const at::TensorBase &t, int64_t max_elem=std::numeric_limits::max()); + +} diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/CompositeRandomAccessor.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/CompositeRandomAccessor.h new file mode 100644 index 0000000000000000000000000000000000000000..970b7da5cb70931ccb450a6ec24d511f975248c6 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/CompositeRandomAccessor.h @@ -0,0 +1,34 @@ +#pragma once + +#include + +namespace at::native { + +struct TupleInfoCPU { + template + using tuple = std::tuple; + + template + static constexpr auto tie(Types&... args) noexcept { + return std::tie(args...); + } +}; + +template +using CompositeRandomAccessorCPU = + CompositeRandomAccessor; + +template +void swap( + references_holder rh1, + references_holder rh2 +) { + return std::swap(rh1.data(), rh2.data()); +} + +template +auto get(references_holder rh) -> decltype(std::get(rh.data())) { + return std::get(rh.data()); +} + +} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/CompositeRandomAccessorCommon.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/CompositeRandomAccessorCommon.h new file mode 100644 index 0000000000000000000000000000000000000000..9111c3515afcefec2d81a261737ec28bcae00cdc --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/CompositeRandomAccessorCommon.h @@ -0,0 +1,263 @@ +#include + +#pragma once + +namespace at::native { + +namespace { + +// operator_brackets_proxy is used in +// CompositeRandomAccessor in place of operator[]. +// For some iterators, references returned by operator[] +// could become invalid, operator_brackets_proxy tries to +// resolve that by making accessor[n] to be equivalent to +// *(accessor + n). +template +class operator_brackets_proxy { + using reference = typename std::iterator_traits::reference; + using value_type = typename std::iterator_traits::value_type; + +public: + C10_HOST_DEVICE + operator_brackets_proxy(Accessor const& accessor) + : accessor(accessor) + {} + + C10_HOST_DEVICE + operator reference() { + return *accessor; + } + + C10_HOST_DEVICE + reference operator*() { + return *accessor; + } + + C10_HOST_DEVICE + operator_brackets_proxy& operator=(value_type const& val) { + *accessor = val; + return *this; + } + +private: + Accessor accessor; +}; + +} + +// references_holder is used as a surrogate for the +// references type from std::iterator_traits in CompositeRandomAccessor. +// It is assumed in CompositeRandomAccessor that +// References = tuple, +// Values = tuple by default, +// but they could be anything as long as References could be +// cast to Values. +// If you plan to use it with STL, for example, you will need to +// define 'swap` and `get`(aka std::get) methods. +template +class references_holder { +public: + using values = Values; + using references = References; + + C10_HOST_DEVICE + references_holder(references refs) + : refs{std::move(refs)} + {} + + C10_HOST_DEVICE + operator references() { + return refs; + } + + C10_HOST_DEVICE + operator values() { + return refs; + } + + C10_HOST_DEVICE + references_holder& operator=(values vals) { + refs = vals; + return *this; + } + + C10_HOST_DEVICE + references& data() { + return refs; + } + +protected: + references refs; +}; + +// CompositeRandomAccessor is essentially a simplified version of +// a random access iterator over two random access iterators. +// TupleInfo should contain a variadic type `tuple`, and a method `tie`, +// which constructs a tuple of references from a variadic list of arguments. +template +class CompositeRandomAccessor { + using self_type = CompositeRandomAccessor; + + using key_accessor_value_type = + typename std::iterator_traits::value_type; + using value_accessor_value_type = + typename std::iterator_traits::value_type; + using key_accessor_reference_type = + typename std::iterator_traits::reference; + using value_accessor_reference_type = + typename std::iterator_traits::reference; + + using composite_value_type = typename TupleInfo::template tuple< + key_accessor_value_type, + value_accessor_value_type>; + using composite_reference = typename TupleInfo::template tuple< + key_accessor_reference_type, + value_accessor_reference_type>; + +public: + using value_type = composite_value_type; + using reference = references_holder; + // Note that CompositeRandomAccessor does not hold key and values + // in a specific datastructure, which means that a pointer to a (key, value) + // is not defined. Hence we just use a pointer type of the KeyAccessor. + using pointer = typename std::iterator_traits::pointer; + using difference_type = typename std::iterator_traits::difference_type; + using iterator_category = std::random_access_iterator_tag; + + C10_HOST_DEVICE + CompositeRandomAccessor() = default; + + C10_HOST_DEVICE + CompositeRandomAccessor(KeyAccessor keys, ValueAccessor values) + : keys(keys), values(values) + {} + + // Pointer-like operations { + C10_HOST_DEVICE + reference operator*() const { + return TupleInfo::tie(*keys, *values); + } + + // operator->() is supposed to return a pointer type. + // Since CompositeRandomAccessor does not hold pointers to pairs, + // we just return a pointer to a key. + C10_HOST_DEVICE + auto* operator->() const { + return keys.operator->(); + } + + C10_HOST_DEVICE + reference operator[](difference_type idx) { + return operator_brackets_proxy( + CompositeRandomAccessor(keys + idx, values + idx) + ); + } + // } + + // Prefix/postfix increment/decrement { + C10_HOST_DEVICE + CompositeRandomAccessor& operator++() { + ++keys; + ++values; + return *this; + } + + C10_HOST_DEVICE + CompositeRandomAccessor operator++(int) { + CompositeRandomAccessor copy(*this); + ++*this; + return copy; + } + + C10_HOST_DEVICE + CompositeRandomAccessor& operator--() { + --keys; + --values; + return *this; + } + + C10_HOST_DEVICE + CompositeRandomAccessor operator--(int) { + CompositeRandomAccessor copy(*this); + --*this; + return copy; + } + // } + + // Arithmetic operations { + C10_HOST_DEVICE + CompositeRandomAccessor& operator+=(difference_type offset) { + keys += offset; + values += offset; + return *this; + } + + C10_HOST_DEVICE + CompositeRandomAccessor operator+(difference_type offset) const { + return CompositeRandomAccessor(keys + offset, values + offset); + } + + C10_HOST_DEVICE + friend CompositeRandomAccessor operator+( + difference_type offset, + const CompositeRandomAccessor& accessor + ) { + return accessor + offset; + } + + C10_HOST_DEVICE + CompositeRandomAccessor& operator-=(difference_type offset) { + keys -= offset; + values -= offset; + return *this; + } + + C10_HOST_DEVICE + CompositeRandomAccessor operator-(difference_type offset) const { + return CompositeRandomAccessor(keys - offset, values - offset); + } + + C10_HOST_DEVICE + difference_type operator-(const CompositeRandomAccessor& other) const { + return keys - other.keys; + } + // } + + // Comparison operators { + C10_HOST_DEVICE + bool operator==(const CompositeRandomAccessor& other) const { + return keys == other.keys; + } + + C10_HOST_DEVICE + bool operator!=(const CompositeRandomAccessor& other) const { + return keys != other.keys; + } + + C10_HOST_DEVICE + bool operator<(const CompositeRandomAccessor& other) const { + return keys < other.keys; + } + + C10_HOST_DEVICE + bool operator<=(const CompositeRandomAccessor& other) const { + return keys <= other.keys; + } + + C10_HOST_DEVICE + bool operator>(const CompositeRandomAccessor& other) const { + return keys > other.keys; + } + + C10_HOST_DEVICE + bool operator>=(const CompositeRandomAccessor& other) const { + return keys >= other.keys; + } + // } + +protected: + KeyAccessor keys; + ValueAccessor values; +}; + +} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ConvUtils.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ConvUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..ad05a820f7b275096a3fe933e60bfd7538ef84bd --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ConvUtils.h @@ -0,0 +1,449 @@ +#pragma once +#include +#include +#include +#include +#include +#include + +#include + +namespace at::native { + +using conv_depthwise2d_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, at::IntArrayRef, std::array); +DECLARE_DISPATCH(conv_depthwise2d_backward_fn, conv_depthwise2d_backward_stub); +using conv_depthwise3d_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, at::IntArrayRef, std::array); +DECLARE_DISPATCH(conv_depthwise3d_backward_fn, conv_depthwise3d_backward_stub); +using cudnn_convolution_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, int64_t, bool, bool, bool, std::array); +DECLARE_DISPATCH(cudnn_convolution_backward_fn, cudnn_convolution_backward_stub); +using mps_convolution_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, int64_t, std::array); +DECLARE_DISPATCH(mps_convolution_backward_fn, mps_convolution_backward_stub); +using cudnn_convolution_transpose_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, at::IntArrayRef, int64_t, bool, bool, bool, std::array); +DECLARE_DISPATCH(cudnn_convolution_transpose_backward_fn, cudnn_convolution_transpose_backward_stub); +using miopen_convolution_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, int64_t, bool, bool, std::array); +DECLARE_DISPATCH(miopen_convolution_backward_fn, miopen_convolution_backward_stub); +using miopen_convolution_transpose_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, at::IntArrayRef, int64_t, bool, bool, std::array); +DECLARE_DISPATCH(miopen_convolution_transpose_backward_fn, miopen_convolution_transpose_backward_stub); +using miopen_depthwise_convolution_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, int64_t, bool, bool, std::array); +DECLARE_DISPATCH(miopen_depthwise_convolution_backward_fn, miopen_depthwise_convolution_backward_stub); +using mkldnn_convolution_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, int64_t, std::array); +DECLARE_DISPATCH(mkldnn_convolution_backward_fn, mkldnn_convolution_backward_stub); +using mkldnn_convolution_transpose_fn = Tensor(*)(const Tensor&, const Tensor&, const std::optional&, + IntArrayRef, IntArrayRef, IntArrayRef, IntArrayRef, int64_t); +DECLARE_DISPATCH(mkldnn_convolution_transpose_fn, mkldnn_convolution_transpose_stub); +using mkldnn_convolution_transpose_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, at::IntArrayRef, int64_t, std::array); +DECLARE_DISPATCH(mkldnn_convolution_transpose_backward_fn, mkldnn_convolution_transpose_backward_stub); +using slow_conv_dilated2d_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, at::IntArrayRef, std::array); +DECLARE_DISPATCH(slow_conv_dilated2d_backward_fn, slow_conv_dilated2d_backward_stub); +using slow_conv_dilated3d_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, at::IntArrayRef, std::array); +DECLARE_DISPATCH(slow_conv_dilated3d_backward_fn, slow_conv_dilated3d_backward_stub); +using slow_conv_transpose2d_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, std::array); +DECLARE_DISPATCH(slow_conv_transpose2d_backward_fn, slow_conv_transpose2d_backward_stub); +using slow_conv_transpose3d_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, std::array); +DECLARE_DISPATCH(slow_conv_transpose3d_backward_fn, slow_conv_transpose3d_backward_stub); + +namespace { + bool is_cudnnv8_heuristic_mode_b() { + static const bool is_cudnnv8_heuristic_mode_b = c10::utils::check_env("TORCH_CUDNN_USE_HEURISTIC_MODE_B") == true; + return is_cudnnv8_heuristic_mode_b; + } +} + +inline bool cudnnv8_enabled_check_debug() { + static bool cudnnv8_flag = c10::utils::check_env("TORCH_CUDNN_V8_API_DISABLED") != true; + static bool cudnnv8_debug = c10::utils::check_env("TORCH_CUDNN_V8_API_DEBUG") == true; + static uint8_t cudnnv8_debugcount = 0; + if (cudnnv8_debug == 1 && cudnnv8_debugcount < 10) { + TORCH_WARN("TORCH_CUDNN_V8_DEBUG ON, V8 ON: ", cudnnv8_flag, " TORCH_CUDNN_USE_HEURISTIC_MODE B: ", is_cudnnv8_heuristic_mode_b()); + cudnnv8_debugcount++; + } + return cudnnv8_flag == 1; +} + +inline bool cudnnv8_use_heur_mode_b() { + return is_cudnnv8_heuristic_mode_b(); +} + +// Keep in sync with py::enum_ in Module.cpp +enum class ConvBackend { + CudaDepthwise2d, + CudaDepthwise3d, + Cudnn, + CudnnTranspose, + Empty, + Miopen, + MiopenDepthwise, + MiopenTranspose, + Mkldnn, + MkldnnTranspose, + MkldnnEmpty, + NnpackSpatial, + Overrideable, + Slow2d, + Slow3d, + SlowDilated2d, + SlowDilated3d, + SlowTranspose2d, + SlowTranspose3d, + Winograd3x3Depthwise, + Xnnpack2d, + Mps, + MpsTranspose, +}; + +// Overload for selecting the convolution backend from the full set of convolution inputs. +// This overload is exposed to python for testing, etc. +TORCH_API ConvBackend select_conv_backend( + const Tensor& input, const Tensor& weight, const std::optional& bias_opt, + SymIntArrayRef stride, SymIntArrayRef padding, SymIntArrayRef dilation, + bool transposed, SymIntArrayRef output_padding, c10::SymInt groups, const at::OptionalSymIntArrayRef bias_sizes_opt); + +TORCH_API at::MemoryFormat _determine_backend_memory_format(const Tensor& input, + const Tensor& weight, + const ConvBackend backend); + +// --------------------------------------------------------------------- +// +// Math +// +// --------------------------------------------------------------------- + +constexpr int input_batch_size_dim = 0; // also grad_input +constexpr int input_channels_dim = 1; +constexpr int output_batch_size_dim = 0; // also grad_output +constexpr int output_channels_dim = 1; +constexpr int weight_output_channels_dim = 0; +constexpr int weight_input_channels_dim = 1; + +// Often written as 2 + max_dim (extra dims for batch size and channels) +constexpr int max_dim = 3; + +// --------------------------------------------------------------------- +// +// Checking +// +// --------------------------------------------------------------------- + +// Used on pad, stride and dilation +static void check_args(CheckedFrom c, IntArrayRef args, size_t expected_size, const char* arg_name) +{ + TORCH_CHECK(args.size() <= expected_size, + "Too many ", arg_name, " values (", args.size(), ") supplied, expecting ", + expected_size, " (while checking arguments for ", c, ")"); + TORCH_CHECK(args.size() >= expected_size, + "Not enough ", arg_name, " values (", args.size(), ") supplied, expecting ", + expected_size, " (while checking arguments for ", c, ")"); + + auto num_negative_values = std::count_if(args.begin(), args.end(), [](int x){return x < 0;}); + if (num_negative_values > 0){ + std::stringstream ss; + ss << arg_name << " should be greater than zero but got ("; + std::copy(args.begin(), args.end() - 1, std::ostream_iterator(ss,", ")); + ss << args.back() << ")" << " (while checking arguments for " << c << ")"; + AT_ERROR(ss.str()); + } +} + + +// NOTE [ Convolution checks ] +// +// NB: For many call sites, it is not strictly necessary to check all of +// these relationships (for example, for forward convolution, we compute +// the size of output ourselves, so we don't actually need to check +// output. However, writing a single function that does everything +// means we get to reuse it for both forwards and all backwards +// variants, even when the set of "real" inputs varies. The magic of +// relational computing! +// +// (There is one downside, which is that it is slightly harder to write +// error messages which are able to distinguish between real inputs +// (which the user can change) and computed inputs (which the user can +// only indirectly affect). It would be an interesting exercise to +// come up with a general framework to handle such situations.) +inline void convolution_shape_check( + CheckedFrom c, + const TensorGeometryArg& input, const TensorGeometryArg& weight, const TensorGeometryArg& output, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups) +{ + check_args(c, padding, input->dim() - 2, "padding"); + check_args(c, stride, padding.size(), "stride"); + check_args(c, dilation, padding.size(), "dilation"); + + // Input + checkDimRange(c, input, 3, 6 /* exclusive */); + checkSize_symint(c, input, input_channels_dim, weight->size(1) * groups); + + // Weight + checkSameDim(c, input, weight); + + // TODO: check that output->size() matches output_sizes + // TODO: check that weight matches output->sizes() + checkSameDim(c, input, output); +} + +// NB: conv_output_size and conv_input_size are not bijections, +// as conv_output_size loses information; this is why conv_input_size +// takes an extra output_padding argument to resolve the ambiguity. + +template +inline std::vector _conv_output_size( + ArrayRef input_size, ArrayRef weight_size, + ArrayRef padding, ArrayRef stride, ArrayRef dilation = ArrayRef() +) { + // ASSERT(input_size.size() > 2) + // ASSERT(input_size.size() == weight_size.size()) + bool has_dilation = !dilation.empty(); + auto dim = input_size.size(); + std::vector output_size(dim); + output_size[0] = input_size[input_batch_size_dim]; + output_size[1] = weight_size[weight_output_channels_dim]; + for (const auto d : c10::irange(2, dim)) { + auto dilation_ = has_dilation ? dilation[d - 2] : 1; + auto kernel = dilation_ * (weight_size[d] - 1) + 1; + output_size[d] = (input_size[d] + (2 * padding[d - 2]) - kernel) / stride[d - 2] + 1; + } + return output_size; +} + +inline std::vector conv_output_size( + IntArrayRef input_size, IntArrayRef weight_size, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation = IntArrayRef() +) { + return _conv_output_size(input_size, weight_size, padding, stride, dilation); +} + +inline std::vector conv_output_size( + SymIntArrayRef input_size, SymIntArrayRef weight_size, + SymIntArrayRef padding, SymIntArrayRef stride, SymIntArrayRef dilation = SymIntArrayRef() +) { + return _conv_output_size(input_size, weight_size, padding, stride, dilation); +} + +template +std::vector _conv_input_size( + ArrayRef output_size, ArrayRef weight_size, + ArrayRef padding, ArrayRef output_padding, ArrayRef stride, ArrayRef dilation, T groups +) { + // ASSERT(output_size.size() > 2) + // ASSERT(output_size.size() == weight_size.size()) + auto dim = output_size.size(); + std::vector input_size(dim); + input_size[0] = output_size[output_batch_size_dim]; + input_size[1] = weight_size[weight_input_channels_dim] * groups; + for (const auto d : c10::irange(2, dim)) { + auto kernel = (weight_size[d] - 1) * dilation[d - 2] + 1; + input_size[d] = (output_size[d] - 1) * stride[d - 2] - (padding[d - 2] * 2) + + kernel + output_padding[d - 2]; + } + return input_size; +} + +inline std::vector conv_input_size( + SymIntArrayRef output_size, SymIntArrayRef weight_size, + SymIntArrayRef padding, SymIntArrayRef output_padding, SymIntArrayRef stride, SymIntArrayRef dilation, c10::SymInt groups +) { + return _conv_input_size(output_size, weight_size, padding, output_padding, stride, dilation, std::move(groups)); +} + +inline std::vector conv_input_size( + IntArrayRef output_size, IntArrayRef weight_size, + IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups +) { + return _conv_input_size(output_size, weight_size, padding, output_padding, stride, dilation, groups); +} + +template +std::vector _conv_weight_size( + ArrayRef input_size, ArrayRef output_size, + ArrayRef padding, ArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups +) { + auto dim = input_size.size(); + std::vector weight_size(dim); + weight_size[0] = output_size[1]; + weight_size[1] = input_size[1] / groups; + for (const auto d : c10::irange(2, dim)) { + auto kernel = input_size[d] - (output_size[d] - 1) * stride[d - 2] + + padding[d - 2] * 2 - output_padding[d - 2]; + weight_size[d] = (kernel - 1) / dilation[d - 2] + 1; + } + return weight_size; +} + +inline std::vector conv_weight_size( + SymIntArrayRef input_size, SymIntArrayRef output_size, + SymIntArrayRef padding, SymIntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups +) { + return _conv_weight_size(input_size, output_size, padding, output_padding, stride, dilation, groups); +} + +inline std::vector conv_weight_size( + IntArrayRef input_size, IntArrayRef output_size, + IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups +) { + return _conv_weight_size(input_size, output_size, padding, output_padding, stride, dilation, groups); +} + +inline Tensor reshape_bias(int64_t dim, const Tensor& bias) { + std::vector shape(dim, 1); + shape[1] = -1; + return bias.reshape(shape); +} + +inline at::MemoryFormat cudnn_conv_suggest_memory_format(const at::Tensor& input, const at::Tensor& weight) { + // disable NHWC for float64 input. + if (!at::detail::getCUDAHooks().compiledWithCuDNN() || + input.scalar_type() == at::kDouble || + weight.scalar_type() == at::kDouble) { + return at::MemoryFormat::Contiguous; + } + long cudnn_version = at::detail::getCUDAHooks().versionCuDNN(); + auto input_memory_format = input.suggest_memory_format(); + auto weight_memory_format = weight.suggest_memory_format(); + auto weight_ndim = weight.ndimension(); + + bool can_use_cudnn_channels_last_2d = (cudnn_version >= 7603) && (weight_ndim == 4) && ( + (input_memory_format == at::MemoryFormat::ChannelsLast) || + (weight_memory_format == at::MemoryFormat::ChannelsLast) + ); + if (can_use_cudnn_channels_last_2d) { + return at::MemoryFormat::ChannelsLast; + } + + bool can_use_cudnn_channels_last_3d = (cudnn_version >= 8005) && (weight_ndim == 5) && ( + (input_memory_format == at::MemoryFormat::ChannelsLast3d) || + (weight_memory_format == at::MemoryFormat::ChannelsLast3d) + ); + if (can_use_cudnn_channels_last_3d) { + return at::MemoryFormat::ChannelsLast3d; + } + + return at::MemoryFormat::Contiguous; +} + +// controls whether emptyCache will be called following cudnn conv benchmarking +TORCH_API void _cudnn_set_conv_benchmark_empty_cache(bool enable); +TORCH_API bool _cudnn_get_conv_benchmark_empty_cache(); + + +inline bool miopen_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) { + + // disable NHWC for float64 input. + if (!at::detail::getCUDAHooks().compiledWithMIOpen() || + input.scalar_type() == at::kDouble || + weight.scalar_type() == at::kDouble) { + return false; + } + + bool can_use_miopen_channels_last_2d = false; + // TODO: Remove PYTORCH_MIOPEN_SUGGEST_NHWC once ROCm officially supports NHWC in MIOpen + // See #64427 + static std::optional PYTORCH_MIOPEN_SUGGEST_NHWC = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC"); + + auto input_memory_format = input.suggest_memory_format(); + auto weight_memory_format = weight.suggest_memory_format(); + + can_use_miopen_channels_last_2d = PYTORCH_MIOPEN_SUGGEST_NHWC && *PYTORCH_MIOPEN_SUGGEST_NHWC && ( + ( (input_memory_format == at::MemoryFormat::ChannelsLast) || + (weight_memory_format == at::MemoryFormat::ChannelsLast) ) + ); + + bool can_use_miopen_channels_last_3d = false; + + return can_use_miopen_channels_last_2d || can_use_miopen_channels_last_3d; +} + +inline bool mkldnn_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) { + + // disable NHWC for float64 input. + if (input.scalar_type() == at::kDouble || + weight.scalar_type() == at::kDouble) { + return false; + } + + // disable NHWC for MkldnnCPU tensor. + if (input.is_mkldnn() || weight.is_mkldnn()) { + return false; + } + + auto input_memory_format = input.suggest_memory_format(); + auto weight_memory_format = weight.suggest_memory_format(); + + bool can_use_mkldnn_channels_last_2d = + (input_memory_format == at::MemoryFormat::ChannelsLast) || + (weight_memory_format == at::MemoryFormat::ChannelsLast); + + bool can_use_mkldnn_channels_last_3d = + (input_memory_format == at::MemoryFormat::ChannelsLast3d) || + (weight_memory_format == at::MemoryFormat::ChannelsLast3d); + + return can_use_mkldnn_channels_last_2d || can_use_mkldnn_channels_last_3d; +} + +inline bool thnn_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) { + + auto input_memory_format = input.suggest_memory_format(); + auto weight_memory_format = weight.suggest_memory_format(); + + bool can_use_thnn_channels_last_2d = input.device().is_cpu() && ( + (input_memory_format == at::MemoryFormat::ChannelsLast) || ( + weight_memory_format == at::MemoryFormat::ChannelsLast)); + + return can_use_thnn_channels_last_2d; +} + +inline bool xpu_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) { + + // check layout only for xpu tensor. + if (!input.is_xpu() || !weight.is_xpu()) { + return false; + } + + // disable NHWC for float64 input. + if (input.scalar_type() == at::kDouble || + weight.scalar_type() == at::kDouble) { + return false; + } + + auto input_memory_format = input.suggest_memory_format(); + auto weight_memory_format = weight.suggest_memory_format(); + + bool can_use_xpu_channels_last_2d = + (input_memory_format == at::MemoryFormat::ChannelsLast) || + (weight_memory_format == at::MemoryFormat::ChannelsLast); + + bool can_use_xpu_channels_last_3d = + (input_memory_format == at::MemoryFormat::ChannelsLast3d) || + (weight_memory_format == at::MemoryFormat::ChannelsLast3d); + + return can_use_xpu_channels_last_2d || can_use_xpu_channels_last_3d; +} + +} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ConvolutionMM3d.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ConvolutionMM3d.h new file mode 100644 index 0000000000000000000000000000000000000000..3de6763015c6616599a604ee169dacc55985a385 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ConvolutionMM3d.h @@ -0,0 +1,14 @@ +#include + +namespace at::native { + +std::tuple slow_conv3d_backward_cpu( + const Tensor& grad_output, + const Tensor& self, + const Tensor& weight, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + std::array output_mask); + +} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Copy.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Copy.h new file mode 100644 index 0000000000000000000000000000000000000000..14abb32fa5ad4ba3cd8c78084569b313a4a692cd --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Copy.h @@ -0,0 +1,20 @@ +#pragma once + +#include + +namespace at { + +class Tensor; +struct TensorIterator; +class TensorBase; + +namespace native { + +using copy_fn = void (*)(TensorIterator&, bool non_blocking); + +DECLARE_DISPATCH(copy_fn, copy_stub); + +TORCH_API void copy_ignoring_overlaps(const TensorBase &dst, const TensorBase &src); + +} // namespace native +} // namespace at diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Cross.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Cross.h new file mode 100644 index 0000000000000000000000000000000000000000..9daee7f2d6c43586630ad38ac731b22ce2416ff3 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Cross.h @@ -0,0 +1,14 @@ +#pragma once + +#include + +namespace at { +class Tensor; + +namespace native { + +using cross_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, const int64_t d); + +DECLARE_DISPATCH(cross_fn, cross_stub); + +}} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/DilatedConvolutionUtils.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/DilatedConvolutionUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..cd580020374a66aa058938e1186fbfd577a76980 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/DilatedConvolutionUtils.h @@ -0,0 +1,229 @@ +#pragma once + +#include +#include + +#include +#include +#include + +#define TORCH_CHECK_DIM_SIZE(T, DIM, DIM_SIZE, SIZE) \ + TORCH_CHECK( \ + T.dim() == DIM && T.size(DIM_SIZE) == SIZE, \ + "Need " #T " of dimension ", \ + DIM, \ + " and " #T ".size[", \ + DIM_SIZE, \ + "] == ", \ + SIZE, \ + " but got input to be of shape ", \ + T.sizes()) + +namespace at::native::internal { +namespace { +inline bool all_positive(IntArrayRef& arr) { + return std::all_of( + arr.begin(), arr.end(), [](int64_t item) { return item > 0; }); +} + +inline bool all_nonnegative(std::vector& arr) { + return std::all_of( + arr.begin(), arr.end(), [](int64_t item) { return item >= 0; }); +} + +} // namespace + +// calculate the rear part of output tensor sizes +template +std::vector get_output_size( + const Tensor& input, + IntArrayRef kernel_size, + IntArrayRef stride_size, + IntArrayRef pad_size, + IntArrayRef dilation_size) { + std::vector sizes; + for (const auto index : c10::irange(dim)) { + sizes.push_back( + div_rtn( + input.size(index + input.dim() - dim) + 2 * pad_size[index] - + (dilation_size[index] * (kernel_size[index] - 1) + 1), + stride_size[index]) + + 1); + } + return sizes; +} + +// calculate the sizes of output tensor +template +std::vector get_output_size( + const Tensor& input, + const Tensor& weight, + IntArrayRef kernel_size, + IntArrayRef stride_size, + IntArrayRef pad_size, + IntArrayRef dilation_size) { + auto output_size = get_output_size( + input, kernel_size, stride_size, pad_size, dilation_size); + output_size.insert(output_size.begin(), weight.size(0)); + if (input.dim() == dim + 2) { + output_size.insert(output_size.begin(), input.size(0)); + } + return output_size; +} +/* + slow_conv_dilated_shape_check - check user-input to dilated convolution + forward and backward functions. +*/ +template +void slow_conv_dilated_shape_check( + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + const Tensor& grad_output, + IntArrayRef kernel_size, + IntArrayRef stride_size, + IntArrayRef pad_size, + IntArrayRef dilation_size) { + /* + When the following tensors are defined: + + bias, grad_weight, grad_output + + then these are assumed to be contiguous without checking + because of these tensors are made contiguous by calling + .contiguous() method or by resizing of zero-sized tensors in + forward/backward functions. + + When grad_weight is defined then it is assumed without + checking to have the same shape as weight, see backward + functions. + */ + // Check size arguments + TORCH_CHECK( + kernel_size.size() == dim, + "kernel sizes length should be ", + dim, + ", but got ", + kernel_size.size()); + TORCH_CHECK( + stride_size.size() == dim, + "strides length should be ", + dim, + ", but got ", + stride_size.size()); + TORCH_CHECK( + dilation_size.size() == dim, + "dilations length should be ", + dim, + ", but got ", + dilation_size.size()); + TORCH_CHECK( + pad_size.size() == dim, + "pads length should be ", + dim, + ", but got ", + pad_size.size()); + + TORCH_CHECK( + all_positive(kernel_size), + "kernel size should be greater than zero, but got ", + kernel_size); + TORCH_CHECK( + all_positive(stride_size), + "stride should be greater than zero, but got ", + stride_size); + TORCH_CHECK( + all_positive(dilation_size), + "dilation should be greater than zero, but got ", + dilation_size); + + // check input + TORCH_CHECK(input.defined(), "input must be defined"); + bool is_batch = input.dim() == dim + 2; + int64_t n = (is_batch ? 2 : 1); + int64_t ndim = n + dim; + if (!is_batch) { + // input dim has to be dim + 1 if not batched + TORCH_CHECK( + input.dim() == dim + 1, + "input must be 4D or 5D tensor but got ", + input.dim(), + "D tensor"); + } + + // check output sizes + auto output_size = get_output_size( + input, kernel_size, stride_size, pad_size, dilation_size); + + TORCH_CHECK( + all_nonnegative(output_size), + "calculated output size ", + output_size, + " is too small (all sizes must be non-negative)"); + + // check weight + TORCH_CHECK(weight.defined(), "weight must be defined"); + TORCH_CHECK( + weight.dim() == dim + 2, + "weight must be ", + dim + 2, + "D tensor but got ", + weight.dim(), + "D tensor dim=", + dim); + TORCH_CHECK( + weight.sizes().slice(2) == kernel_size, + "weight[2:] shape ", + weight.sizes().slice(2), + " must be equal to kernel_size ", + kernel_size); + + TORCH_CHECK_DIM_SIZE(input, input.dim(), (is_batch ? 1 : 0), weight.size(1)); + + // check bias when present + if (bias.defined()) { + TORCH_CHECK( + bias.dim() == 1, + "bias must be 1D tensor but got ", + bias.dim(), + "D tensor"); + TORCH_CHECK_DIM_SIZE(bias, 1, 0, weight.size(0)); + } + + // check grad_output when present + if (grad_output.defined()) { + TORCH_CHECK( + grad_output.dim() == ndim, + "grad_output must be ", + ndim, + "D tensor but got ", + grad_output.dim(), + "D tensor"); + if (is_batch) { + TORCH_CHECK( + grad_output.size(0) == input.size(0), + "grad_output.size(0)=", + grad_output.size(0), + " must be input.size(0)=", + input.size(0)); + } + TORCH_CHECK( + grad_output.size(n - 1) == weight.size(0), + "grad_output.size(", + n - 1, + ")=", + grad_output.size(n - 1), + " must be weight.size(0)=", + weight.size(0)); + TORCH_CHECK( + grad_output.sizes().slice(n) == output_size, + "grad_output[", + n, + ":] shape", + grad_output.sizes().slice(n), + " must be equal to output size ", + output_size); + } +} + +} // namespace at::native::internal diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/DispatchStub.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/DispatchStub.h new file mode 100644 index 0000000000000000000000000000000000000000..641b779701d1e207bd709faa77191816fa8bc1b8 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/DispatchStub.h @@ -0,0 +1,444 @@ +#pragma once + +#include +#include +#include + +#include +#include +#include + +// Implements instruction set specific function dispatch. +// +// Kernels that may make use of specialized instruction sets (e.g. AVX2) are +// compiled multiple times with different compiler flags (e.g. -mavx2). A +// DispatchStub contains a table of function pointers for a kernel. At runtime, +// the fastest available kernel is chosen based on the features reported by +// cpuinfo. +// +// Example: +// +// In native/MyKernel.h: +// using fn_type = void(*)(const Tensor& x); +// DECLARE_DISPATCH(fn_type, stub); +// +// In native/MyKernel.cpp +// DEFINE_DISPATCH(stub); +// +// In native/cpu/MyKernel.cpp: +// namespace { +// // use anonymous namespace so that different cpu versions won't conflict +// void kernel(const Tensor& x) { ... } +// } +// REGISTER_DISPATCH(stub, &kernel); +// +// To call: +// stub(kCPU, tensor); +// +// TODO: CPU instruction set selection should be folded into whatever +// the main dispatch mechanism is. +// +// Supported device types for registration: +// - CPU: Central Processing Unit +// - CUDA: NVIDIA GPUs +// - HIP: AMD GPUs +// - MPS: Apple Silicon GPUs (Metal Performance Shaders) +// - MTIA: Meta Training and Inference Devices +// - XPU: Intel GPUs +// - PrivateUse1: Reserved for private/custom device types +// +// If you want to update the list of supported devices, add a new dispatch_ptr +// member in DispatchStubImpl.h and update the get_call_ptr switch. +// As well you will need to update the inlined list in 'is_device_supported` +// +// +// ignore warnings about DispatchStub::DEFAULT, AVX, AVX2 defined elsewhere +C10_CLANG_DIAGNOSTIC_PUSH() +C10_CLANG_DIAGNOSTIC_IGNORE("-Wundefined-var-template") + +namespace at::native { + +enum class CPUCapability { + DEFAULT = 0, +#if defined(HAVE_VSX_CPU_DEFINITION) + VSX = 1, +#elif defined(HAVE_ZVECTOR_CPU_DEFINITION) + ZVECTOR = 1, +#else + AVX2 = 1, + AVX512 = 2, +#endif + NUM_OPTIONS +}; + +// Enum for error types +enum class ErrorType { + MissingDeviceKernel, + DeviceNotSupported +}; + +// Alias for the return type using std::variant +using DispatchResult = std::variant; + +CPUCapability get_cpu_capability(); + +template +struct DispatchStub; + +/** + * The sole purpose of this class is to outline methods that don't need to be + * specialized or otherwise inlined and duplicated (by the compiler due to + * template expansion), since it causes size bloat if there are a significant + * number of specialization of the DispatchStub<> class. + */ +struct TORCH_API DispatchStubImpl { + + // The DispatchStubImpl::try_get_call_ptr() method is used to get the call + // pointer for a given device type. If the call pointer is not found, + // DispatchStubImpl::try_get_call_ptr() returns an ErrorType. + // The main difference between try_get_call_ptr() and get_call_ptr() is that + // try_get_call_ptr() will return the ErrorType and not raise an exception. + DispatchResult try_get_call_ptr( + c10::DeviceType device_type + , void *DEFAULT +#ifdef HAVE_AVX512_CPU_DEFINITION + , void *AVX512 +#endif +#ifdef HAVE_AVX2_CPU_DEFINITION + , void *AVX2 +#endif +#ifdef HAVE_VSX_CPU_DEFINITION + , void *VSX +#endif +#ifdef HAVE_ZVECTOR_CPU_DEFINITION + , void *ZVECTOR +#endif + ); + + // Analogous to try_get_call_ptr(), but it will return the ErrorType and not + // raise an exception. + DispatchResult try_choose_cpu_impl( + void *DEFAULT +#ifdef HAVE_AVX512_CPU_DEFINITION + , void *AVX512 +#endif +#ifdef HAVE_AVX2_CPU_DEFINITION + , void *AVX2 +#endif +#ifdef HAVE_VSX_CPU_DEFINITION + , void *VSX +#endif +#ifdef HAVE_ZVECTOR_CPU_DEFINITION + , void *ZVECTOR +#endif + ); + + + void* get_call_ptr( + c10::DeviceType device_type + , void *DEFAULT +#ifdef HAVE_AVX512_CPU_DEFINITION + , void *AVX512 +#endif +#ifdef HAVE_AVX2_CPU_DEFINITION + , void *AVX2 +#endif +#ifdef HAVE_VSX_CPU_DEFINITION + , void *VSX +#endif +#ifdef HAVE_ZVECTOR_CPU_DEFINITION + , void *ZVECTOR +#endif + ); + + /** + * The CPU Dispatch actual method is chosen in decreasing order of preference by + * DispatchStubImpl::choose_cpu_impl() in case none is found by + * DispatchStubImpl::get_call_ptr() in cpu_dispatch_ptr. + */ + void* choose_cpu_impl( + void *DEFAULT +#ifdef HAVE_AVX512_CPU_DEFINITION + , void *AVX512 +#endif +#ifdef HAVE_AVX2_CPU_DEFINITION + , void *AVX2 +#endif +#ifdef HAVE_VSX_CPU_DEFINITION + , void *VSX +#endif +#ifdef HAVE_ZVECTOR_CPU_DEFINITION + , void *ZVECTOR +#endif + ); + + // Fixing dispatch error in Windows debug builds. + // See https://github.com/pytorch/pytorch/issues/22681 for more details. + #if defined(_MSC_VER) && defined(_DEBUG) + std::atomic cpu_dispatch_ptr; + void* cuda_dispatch_ptr; + void* hip_dispatch_ptr; + void* mps_dispatch_ptr; + void* mtia_dispatch_ptr; + #if defined(USE_XPU) + void* xpu_dispatch_ptr; + #endif + void* privateuse1_dispatch_ptr; + #else + std::atomic cpu_dispatch_ptr{nullptr}; + void* cuda_dispatch_ptr = nullptr; + void* hip_dispatch_ptr = nullptr; + void* mps_dispatch_ptr = nullptr; + void* mtia_dispatch_ptr = nullptr; + #if defined(USE_XPU) + void* xpu_dispatch_ptr = nullptr; + #endif + void* privateuse1_dispatch_ptr = nullptr; + #endif +}; + +template +struct DispatchStub { + using FnPtr = rT (*) (Args...); + + DispatchStub() = default; + DispatchStub(const DispatchStub&) = delete; + DispatchStub& operator=(const DispatchStub&) = delete; + +private: + FnPtr get_call_ptr(const c10::DeviceType device_type) { + return reinterpret_cast( + impl.get_call_ptr(device_type + , reinterpret_cast(DEFAULT) +#ifdef HAVE_AVX512_CPU_DEFINITION + , reinterpret_cast(AVX512) +#endif +#ifdef HAVE_AVX2_CPU_DEFINITION + , reinterpret_cast(AVX2) +#endif +#ifdef HAVE_VSX_CPU_DEFINITION + , reinterpret_cast(VSX) +#endif +#ifdef HAVE_ZVECTOR_CPU_DEFINITION + , reinterpret_cast(ZVECTOR) +#endif + ) + ); + } + +public: + template + rT operator()(c10::DeviceType device_type, ArgTypes&&... args) { + FnPtr call_ptr = get_call_ptr(device_type); + return (*call_ptr)(std::forward(args)...); + } + + void set_cuda_dispatch_ptr(FnPtr fn_ptr) { + impl.cuda_dispatch_ptr = reinterpret_cast(fn_ptr); + } + + #if defined(USE_XPU) + void set_xpu_dispatch_ptr(FnPtr fn_ptr){ + impl.xpu_dispatch_ptr = reinterpret_cast(fn_ptr); + } + #endif + + void set_hip_dispatch_ptr(FnPtr fn_ptr) { + impl.hip_dispatch_ptr = reinterpret_cast(fn_ptr); + } + + void set_mps_dispatch_ptr(FnPtr fn_ptr) { + impl.mps_dispatch_ptr = reinterpret_cast(fn_ptr); + } + + void set_mtia_dispatch_ptr(FnPtr fn_ptr) { + impl.mtia_dispatch_ptr = reinterpret_cast(fn_ptr); + } + + void set_privateuse1_dispatch_ptr(FnPtr fn_ptr) { + impl.privateuse1_dispatch_ptr = reinterpret_cast(fn_ptr); + } + + // Returns true if the dispatcher has a kernel registered for this device + // type. + bool is_device_supported(const c10::DeviceType device_type) { + auto result = impl.try_get_call_ptr(device_type + , reinterpret_cast(DEFAULT) +#ifdef HAVE_AVX512_CPU_DEFINITION + , reinterpret_cast(AVX512) +#endif +#ifdef HAVE_AVX2_CPU_DEFINITION + , reinterpret_cast(AVX2) +#endif +#ifdef HAVE_VSX_CPU_DEFINITION + , reinterpret_cast(VSX) +#endif +#ifdef HAVE_ZVECTOR_CPU_DEFINITION + , reinterpret_cast(ZVECTOR) +#endif + ); + if (std::holds_alternative(result)){ + return false; + } + return true; + }; + + static TORCH_API FnPtr DEFAULT; +#ifdef HAVE_AVX512_CPU_DEFINITION + static TORCH_API FnPtr AVX512; +#endif +#ifdef HAVE_AVX2_CPU_DEFINITION + static TORCH_API FnPtr AVX2; +#endif +#ifdef HAVE_VSX_CPU_DEFINITION + static TORCH_API FnPtr VSX; +#endif +#ifdef HAVE_ZVECTOR_CPU_DEFINITION + static TORCH_API FnPtr ZVECTOR; +#endif +private: + DispatchStubImpl impl; +}; + +namespace { +template +struct RegisterCUDADispatch { + RegisterCUDADispatch(DispatchStub &stub, typename DispatchStub::FnPtr value) { + stub.set_cuda_dispatch_ptr(value); + } +}; + +template +struct RegisterXPUDispatch { + RegisterXPUDispatch(DispatchStub &stub, typename DispatchStub::FnPtr value){ + stub.set_xpu_dispatch_ptr(value); + } +}; + +template +struct RegisterMPSDispatch { + RegisterMPSDispatch(DispatchStub &stub, typename DispatchStub::FnPtr value) { + stub.set_mps_dispatch_ptr(value); + } +}; + +template +struct RegisterHIPDispatch { + RegisterHIPDispatch(DispatchStub &stub, typename DispatchStub::FnPtr value) { + // TODO: make this point at hip_dispatch_ptr + stub.set_cuda_dispatch_ptr(value); + } +}; + +template +struct RegisterMTIADispatch { + RegisterMTIADispatch(DispatchStub &stub, typename DispatchStub::FnPtr value) { + stub.set_mtia_dispatch_ptr(value); + } +}; + +template +struct RegisterPRIVATEUSE1Dispatch { + RegisterPRIVATEUSE1Dispatch(DispatchStub &stub, typename DispatchStub::FnPtr value) { + stub.set_privateuse1_dispatch_ptr(value); + } +}; + +} // anonymous namespace +// Compiler will complain if you put things like std::tuple in +// the `fn` argument of DECLARE_DISPATCH. Some possible workarounds, e.g., +// adding parentheses and using helper struct to get rid of the parentheses, do +// not work with MSVC. So do a `using`-declaration if you need to pass in such +// `fn`, e.g., grid_sampler_2d_backward_cpu_kernel in GridSampleKernel.h. +#define DECLARE_DISPATCH(fn, name) \ + struct name##_DECLARE_DISPATCH_type : DispatchStub { \ + name##_DECLARE_DISPATCH_type() = default; \ + name##_DECLARE_DISPATCH_type(const name##_DECLARE_DISPATCH_type&) = delete; \ + name##_DECLARE_DISPATCH_type& operator=(const name##_DECLARE_DISPATCH_type&) = delete; \ + }; \ + extern TORCH_API struct name##_DECLARE_DISPATCH_type name; + +#define DEFINE_DISPATCH(name) struct name##_DECLARE_DISPATCH_type name + +#define REGISTER_ARCH_DISPATCH(name, arch, fn) \ + template <> name##_DECLARE_DISPATCH_type::FnPtr TORCH_API DispatchStub::arch = fn; + +#ifdef HAVE_AVX512_CPU_DEFINITION +#define REGISTER_AVX512_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, AVX512, fn) +#else +#define REGISTER_AVX512_DISPATCH(name, fn) +#endif + +#ifdef HAVE_AVX2_CPU_DEFINITION +#define REGISTER_AVX2_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, AVX2, fn) +#else +#define REGISTER_AVX2_DISPATCH(name, fn) +#endif + +#ifdef HAVE_VSX_CPU_DEFINITION +#define REGISTER_VSX_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, VSX, fn) +#else +#define REGISTER_VSX_DISPATCH(name, fn) +#endif + +#ifdef HAVE_ZVECTOR_CPU_DEFINITION +#define REGISTER_ZVECTOR_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, ZVECTOR, fn) +#else +#define REGISTER_ZVECTOR_DISPATCH(name, fn) +#endif + +// Macro to register the same kernel for all CPU arch types. This is useful +// if a kernel does not benefit from being recompiled across different arch types. +#define REGISTER_ALL_CPU_DISPATCH(name, fn) \ + REGISTER_ARCH_DISPATCH(name, DEFAULT, fn) \ + REGISTER_AVX512_DISPATCH(name, fn) \ + REGISTER_AVX2_DISPATCH(name, fn) \ + REGISTER_VSX_DISPATCH(name, fn) \ + REGISTER_ZVECTOR_DISPATCH(name, fn) + +#define REGISTER_NO_CPU_DISPATCH(name) \ + REGISTER_ALL_CPU_DISPATCH(name, nullptr) + +#define REGISTER_CUDA_DISPATCH(name, fn) \ + static RegisterCUDADispatch name ## __register(name, fn); + +#define REGISTER_XPU_DISPATCH(name, fn) \ + static RegisterXPUDispatch name ## __register(name, fn); + +#define REGISTER_HIP_DISPATCH(name, fn) \ + static RegisterHIPDispatch name ## __register(name, fn); + +#define REGISTER_MPS_DISPATCH(name, fn) \ + static RegisterMPSDispatch name ## __register(name, fn); + +#define REGISTER_MTIA_DISPATCH(name, fn) \ + static RegisterMTIADispatch name ## __register(name, fn); + +#define REGISTER_PRIVATEUSE1_DISPATCH(name, fn) \ + static RegisterPRIVATEUSE1Dispatch name ## __register(name, fn); + +// NB: This macro must be used in an actual 'cu' file; if you try using +// it from a 'cpp' file it will not work! +#if defined(__CUDACC__) +#define REGISTER_DISPATCH(name, fn) REGISTER_CUDA_DISPATCH(name, fn) +#elif defined(__HIPCC__) +// TODO: cut this over to HIP dispatch once we stop pretending that CUDA +// is HIP in the PyTorch HIPify build. +#define REGISTER_DISPATCH(name, fn) REGISTER_CUDA_DISPATCH(name, fn) +// #define REGISTER_DISPATCH(name, fn) REGISTER_HIP_DISPATCH(name, fn) +#elif defined(__OBJC__) && defined(USE_MPS) +// NB: this macro must be used from a 'mm' file in order to dispatch a MPS kernel +#define REGISTER_DISPATCH(name, fn) REGISTER_MPS_DISPATCH(name, fn) +#elif defined(CPU_CAPABILITY) +// REGISTER_DISPATCH now dispatches an AVX512 kernel to nullptr but registers other dispatches. +// ALSO_REGISTER_AVX512_DISPATCH should be used for ensuring AVX512 dispatch, among others. +#ifdef CPU_CAPABILITY_AVX512 +#define REGISTER_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, ((void*)(fn) ? nullptr : nullptr)) +#else +#define REGISTER_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, fn) +#endif +#define ALSO_REGISTER_AVX512_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, fn) +#endif +} // namespace at::native + +C10_CLANG_DIAGNOSTIC_POP() diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/DistributionTemplates.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/DistributionTemplates.h new file mode 100644 index 0000000000000000000000000000000000000000..38c171e56dfae97f5162a5b70e6ceedd0a91db7d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/DistributionTemplates.h @@ -0,0 +1,394 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#include +#include +#endif + +namespace at::native::templates { + +// ==================================================== Random ======================================================== + +// The purpose of `update_from` and `update_to` is to find the closest valid int64_t number that can be used as actual `from`. +// The current implementation of `random_` uses uint64_t arithmetics and casts the result to the target dtype(scalar_t). +// This casting can result in generating numbers that happen to be greater or equal to `to` value. For instance: +// +// auto actual = torch::empty({3, 3}, torch::half); +// actual.random_(0, 65504); +// +// If random's uint64_t arithmetics produces 65503 as a random value after casting to torch::half it becomes 65504 +// and violates the requirement that random value must be less than `to`. To resolve this issue `update_from` and `update_to` +// moves `from` to the right and `to` to the left to the next closest value that won't go outside [from, to) after casting to +// the target dtype. For `to` = 65504 it moves left for (1 << (log2(to) - 11 + 1)) = 32 and becomes 65472, which is previous +// available number for torch::half dtype. +template +int64_t update_from(int64_t from) { + static_assert( + std::is_floating_point::value || + std::is_same::value || + std::is_same::value, "scalar_t must be floating-point type"); + const auto from_plus_1 = static_cast(static_cast(from + 1)); + if (from_plus_1 < from) { + int64_t from_ = std::abs(from + 1); + int n = 0; + while (from_ >>= 1) ++n; + // NOLINTNEXTLINE(clang-analyzer-core.UndefinedBinaryOperatorResult) + from = from_plus_1 + (1LL << (n - std::numeric_limits::digits + 1)); + } + return from; +} + +template +int64_t update_to(int64_t to) { + static_assert( + std::is_floating_point::value || + std::is_same::value || + std::is_same::value, "scalar_t must be floating-point type"); + const auto to_minus_1 = static_cast(static_cast(to - 1)); + if (to_minus_1 >= to) { + int64_t to_ = std::abs(to - 1); + int n = 0; + while (to_ >>= 1) ++n; + // NOLINTNEXTLINE(clang-analyzer-core.UndefinedBinaryOperatorResult) + to = to_minus_1 - (1LL << (n - std::numeric_limits::digits + 1)); + } + return to; +} + +// Return earlier for not invoking kernel. +// See https://github.com/pytorch/pytorch/issues/103418 for more details +#define CHECK_EMPTY_AND_RETURN(tensor) \ + if (tensor.numel() == 0) { \ + return tensor; \ + } + +template class random_kernel, typename RNG> +at::Tensor& random_impl(at::Tensor& self, std::optional generator) { + CHECK_EMPTY_AND_RETURN(self); + auto iter = at::TensorIterator::borrowing_nullary_op(self); + random_kernel()(iter, generator); + return self; +} + +#define CHECK_OUT_OF_BOUNDS(var, name, min, max, dtype) \ + TORCH_CHECK(var >= min && var <= max, name , " is out of bounds for ", dtype); \ + +#define WARN_OUT_OF_BOUNDS(var, name, digits, dtype) \ + if (var < -(1LL << digits) || var > (1LL << digits)) { \ + TORCH_WARN(name , " is out of bounds [-(2^", digits, "), 2^", digits, "]. ", \ + "Due to precision limitations ", dtype, " can support discrete uniform distribution only within this range. ", \ + "This warning will become an error in version 1.7 release, please fix the code in advance"); \ + } + +inline void check_from_to_in_range(int64_t from, int64_t to_inc, caffe2::TypeMeta dtype) { + const auto scalar_type = typeMetaToScalarType(dtype); + if (isFloatingType(scalar_type)) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, scalar_type, "check_random_fp_bounds", [&] { + const auto min = static_cast(std::numeric_limits::lowest()); + const auto max = static_cast(std::numeric_limits::max()); + CHECK_OUT_OF_BOUNDS(from, "from", min, max, dtype); + CHECK_OUT_OF_BOUNDS(to_inc, "to - 1", min, max, dtype); + + constexpr auto digits = std::numeric_limits::digits; + WARN_OUT_OF_BOUNDS(from, "from", digits, dtype); + WARN_OUT_OF_BOUNDS(to_inc, "to - 1", digits, dtype); + }); + } else if (scalar_type == kUInt64) { + // When you do a comparison between int64_t and uint64_t, the usual + // arithmetic conversions say that the int64_t value is promoted to + // unsigned. But this conversion wraps around: if I had -1 as my int64_t, + // then it will promote to 0xFFFFFFFFFFFFFFFF in uint64_t. This is never + // the right thing to do. + CHECK_OUT_OF_BOUNDS(from, "from", 0, INT64_MAX, dtype); + CHECK_OUT_OF_BOUNDS(to_inc, "to - 1", 0, INT64_MAX, dtype); + } else if (isIntegralType(scalar_type, /*includeBool=*/true)) { + AT_DISPATCH_V2(scalar_type, "check_random_integral_bounds", AT_WRAP([&]() { + const auto min = static_cast(std::numeric_limits::lowest()); + const auto max = static_cast(std::numeric_limits::max()); + CHECK_OUT_OF_BOUNDS(from, "from", min, max, dtype); + CHECK_OUT_OF_BOUNDS(to_inc, "to - 1", min, max, dtype); + }), AT_EXPAND(AT_INTEGRAL_TYPES), kUInt16, kUInt32, kBool); + } else { + TORCH_CHECK(false, "check_random_bounds handles only integral, floating-point and boolean types"); + } +} + +template class random_from_to_kernel, typename RNG> +at::Tensor& random_from_to_impl(at::Tensor& self, int64_t from, std::optional to_opt, std::optional generator) { + uint64_t range = 0; + auto iter = at::TensorIterator::borrowing_nullary_op(self); + if (to_opt.has_value()) { + // [from, to) + int64_t to = *to_opt; + TORCH_CHECK(from < to, "random_ expects 'from' to be less than 'to', but got from=", from, " >= to=", to); + if (isFloatingType(iter.dtype())) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "random_update_from_to", [&] { + from = update_from(from); + to = update_to(to); + TORCH_CHECK(from < to, "random_ expects 'from' casted to dtype to be less than 'to' casted to dtype, but got from=", from, " >= to=", to); + }); + } + check_from_to_in_range(from, to - 1, self.dtype()); + CHECK_EMPTY_AND_RETURN(self); + range = static_cast(to) - static_cast(from); + random_from_to_kernel()(iter, range, from, generator); + } else if (from != std::numeric_limits::lowest()) { + // [from, std::numeric_limits::max()] + int64_t to_inc = 0; + if (isFloatingType(iter.dtype())) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "random_from_to_range_calc", [&] { + constexpr int64_t scalar_t_max = static_cast(1) << std::numeric_limits::digits; + to_inc = scalar_t_max > std::numeric_limits::max() ? std::numeric_limits::max() : static_cast(scalar_t_max); + from = update_from(from); + TORCH_CHECK(from < to_inc, "random_ expects 'from' casted to dtype to be less than or equal to 'to_inc' casted to dtype, but got from=", from, " > to_inc=", to_inc); + }); + } else if (isIntegralType(iter.dtype(), /*includeBool=*/true)) { + AT_DISPATCH_V2(self.scalar_type(), "random_from_to_range_calc", AT_WRAP([&] { + if constexpr (std::is_same_v) { + to_inc = static_cast(true); + } else { + to_inc = static_cast(std::numeric_limits::max()); + } + }), AT_EXPAND(AT_INTEGRAL_TYPES_V2), kBool); + } else { + TORCH_CHECK(false, "random_from_to_impl handles only integral, floating-point and boolean types"); + } + check_from_to_in_range(from, to_inc, self.dtype()); + CHECK_EMPTY_AND_RETURN(self); + range = static_cast(to_inc) - static_cast(from) + 1; + random_from_to_kernel()(iter, range, from, generator); + } else { + // [std::numeric_limits::lowest(), std::numeric_limits::max()] + // range = 2^64 + CHECK_EMPTY_AND_RETURN(self); + random_from_to_kernel()(iter, generator); + } + return self; +} + +// ==================================================== Normal ======================================================== + +#define CHECK_NORMAL_TENSOR_STD(std) \ + do { \ + TORCH_CHECK( \ + !std.is_complex(), \ + "normal expects standard deviation to be non-complex"); \ + TORCH_CHECK( \ + std.numel() == 0 || std.is_meta() || std.min().ge(0).item(), \ + "normal expects all elements of std >= 0.0"); \ + } while (0) + +#define CHECK_NORMAL_STD(std) \ + TORCH_CHECK(std >= 0.0, "normal expects std >= 0.0, but found std ", std); + +template class normal_kernel, typename RNG> +Tensor& normal_impl_(Tensor& self, double mean, double std, std::optional gen) { + CHECK_NORMAL_STD(std); + CHECK_EMPTY_AND_RETURN(self); + + if (self.is_complex()) { + auto float_tensor = at::view_as_real(self); + // variance for normal distribution of the real and imaginary values + // is half of the input variance + normal_kernel()(float_tensor, mean, std/(std::sqrt(2)), gen); + } else { + normal_kernel()(self, mean, std, gen); + } + return self; +} + +template class normal_kernel, typename RNG> +Tensor& normal_out_impl(Tensor& output, const Tensor& mean, double std, std::optional gen) { + CHECK_NORMAL_STD(std); + auto std_tensor = at::empty_like(output, MemoryFormat::Contiguous); + auto shape = at::infer_size(mean.sizes(), std_tensor.sizes()); + at::native::resize_output(output, shape); + normal_impl_(output, 0, std, gen); + output.add_(mean); + return output; +} + +template class normal_kernel, typename RNG> +Tensor& normal_out_impl(Tensor& output, double mean, const Tensor& std, std::optional gen) { + CHECK_NORMAL_TENSOR_STD(std); + auto mean_tensor = at::full({}, mean, output.options()); + auto shape = at::infer_size(mean_tensor.sizes(), std.sizes()); + at::native::resize_output(output, shape); + normal_impl_(output, 0, 1, gen); + // CUDA NB: addcmul_out copies the tensor to be added into the output. + // The previous function here was addcmul_out(output, mean_tensor, output, std, 1); + // The third argument is not a constant reference and hence the samples in output are overwritten. + // Consequently, the computation performed is mean_tensor + mean_tensor * std instead of mean_tensor + output * std + output.mul_(std).add_(mean_tensor); + return output; +} + +template class normal_kernel, typename RNG> +Tensor& normal_out_impl(Tensor& output, const Tensor& mean, const Tensor& std, std::optional gen) { + CHECK_NORMAL_TENSOR_STD(std); + auto shape = at::infer_size(mean.sizes(), std.sizes()); + at::native::resize_output(output, shape); + normal_impl_(output, 0, 1, gen); + // CUDA NB: addcmul_out copies the tensor to be added into the output. + // The previous function here was addcmul_out(output, mean, output, std, 1); + // The third argument is not a constant reference and hence the samples in output are overwritten. + // Consequently, the computation performed is mean + mean * std instead of mean + output * std + output.mul_(std).add_(mean); + return output; +} + +template class normal_kernel, typename RNG> +Tensor normal_impl(const Tensor& mean, double std, std::optional gen) { + CHECK_NORMAL_STD(std); + Tensor ret = at::empty_like(mean, MemoryFormat::Contiguous); + normal_out_impl(ret, mean, std, gen); + return ret; +} + +template class normal_kernel, typename RNG> +Tensor normal_impl(double mean, const Tensor& std, std::optional gen) { + CHECK_NORMAL_TENSOR_STD(std); + Tensor ret = at::empty_like(std, MemoryFormat::Contiguous); + normal_out_impl(ret, mean, std, gen); + return ret; +} + +template class normal_kernel, typename RNG> +Tensor normal_impl(const Tensor& mean, const Tensor& std, std::optional gen) { + CHECK_NORMAL_TENSOR_STD(std); + auto shape = at::infer_size(mean.sizes(), std.sizes()); + Tensor ret = at::empty(shape, mean.options(), MemoryFormat::Contiguous); + normal_out_impl(ret, mean, std, gen); + return ret; +} + +// ==================================================== Uniform ======================================================= + +template class uniform_kernel, typename RNG> +at::Tensor& uniform_impl_(at::Tensor& self, double from, double to, std::optional generator) { + if (self.is_complex()) { + CHECK_EMPTY_AND_RETURN(self); + auto float_tensor = at::view_as_real(self); + uniform_impl_(float_tensor, from, to, generator); + } else { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "check_uniform_bounds", [&] { + [[maybe_unused]] const auto dtype = self.dtype(); + const auto min = static_cast(std::numeric_limits::lowest()); + const auto max = static_cast(std::numeric_limits::max()); + CHECK_OUT_OF_BOUNDS(from, "from", min, max, dtype); + CHECK_OUT_OF_BOUNDS(to, "to", min, max, dtype); + TORCH_CHECK(from <= to, "uniform_ expects to return a [from, to) range, but found from=", from, " > to=", to); + TORCH_CHECK((to - from) <= std::numeric_limits::max(), + "uniform_ expects to-from <= std::numeric_limits<", toString(self.scalar_type()), + ">::max(), but found to=", to, " and from=", from, + " which result in to-from to exceed the limit"); + from = std::min(std::max(from, min), max); + to = std::max(std::min(to, max), min); + }); + CHECK_EMPTY_AND_RETURN(self); + auto iter = at::TensorIterator::borrowing_nullary_op(self); + uniform_kernel()(iter, from, to, generator); + } + return self; +} + +// ================================================== LogNormal ======================================================= + +template class log_normal_kernel, typename RNG> +at::Tensor& log_normal_impl_(at::Tensor& self, double mean, double std, std::optional gen) { + TORCH_CHECK(std > 0.0, "log_normal_ expects std > 0.0, but found std=", std); + CHECK_EMPTY_AND_RETURN(self); + auto iter = TensorIterator::borrowing_nullary_op(self); + log_normal_kernel()(iter, mean, std, gen); + return self; +} + +// =================================================== Geometric ====================================================== + +template class geometric_kernel, typename RNG> +Tensor& geometric_impl_(Tensor& self, double p, std::optional gen) { + TORCH_CHECK(0 < p && p < 1, "geometric_ expects p to be in (0, 1), but got p=", p); + CHECK_EMPTY_AND_RETURN(self); + auto iter = TensorIterator::borrowing_nullary_op(self); + geometric_kernel()(iter, p, gen); + return self; +} + +// ================================================== Exponential ===================================================== + +template class exponential_kernel, typename RNG> +Tensor& exponential_impl_(Tensor& self, double lambda, std::optional gen) { + TORCH_CHECK(lambda > 0.0, "exponential_ expects lambda > 0.0, but found lambda=", lambda); + CHECK_EMPTY_AND_RETURN(self); + auto iter = TensorIterator::borrowing_nullary_op(self); + exponential_kernel()(iter, lambda, gen); + return self; +} + +// ==================================================== Cauchy ======================================================== + +template class cauchy_kernel, typename RNG> +Tensor& cauchy_impl_(Tensor& self, double median, double sigma, std::optional gen) { + // TODO: instead of variable name 'sigma', use 'gamma' or 'scale' + // the variance, squared sigma, is undefined for cauchy distribution + TORCH_CHECK(sigma > 0.0, "cauchy_ expects sigma > 0.0, but found sigma=", sigma); + TORCH_CHECK(at::isFloatingType(self.scalar_type()), "Cauchy distribution is a continuous probability distribution. dtype must be a floating point but you specified ", self.dtype()); + CHECK_EMPTY_AND_RETURN(self); + auto iter = TensorIterator::borrowing_nullary_op(self); + cauchy_kernel()(iter, median, sigma, gen); + return self; +} + +// ==================================================== Bernoulli ===================================================== + +template class bernoulli_tensor_kernel, typename RNG> +Tensor& bernoulli_impl_(Tensor& self, const Tensor& p_, std::optional gen) { + CHECK_EMPTY_AND_RETURN(self); + NoNamesGuard guard; + at::assert_no_internal_overlap(self); + bernoulli_tensor_kernel()(self, p_, gen); + return self; +} + +template class bernoulli_scalar_kernel, typename RNG> +Tensor& bernoulli_impl_(Tensor& self, double p, std::optional gen) { + TORCH_CHECK(0 <= p && p <= 1, "bernoulli_ expects p to be in [0, 1], but got p=", p); + CHECK_EMPTY_AND_RETURN(self); + at::assert_no_internal_overlap(self); + bernoulli_scalar_kernel()(self, p, gen); + return self; +} + +template class bernoulli_tensor_kernel, typename RNG> +Tensor& bernoulli_out_impl(Tensor& result, const Tensor& self, std::optional gen) { + // result.resize_as_(self) requires self to have same dtype as result, so we + // use resize_ instead. + // TODO: Fix resize_as_. See pytorch/pytorch#11665. + result.resize_(self.sizes()); + bernoulli_impl_(result, self, gen); + namedinference::propagate_names(result, self); + return result; +} + +#undef CHECK_OUT_OF_BOUNDS +#undef WARN_OUT_OF_BOUNDS + +} // namespace at::native::templates diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Distributions.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Distributions.h new file mode 100644 index 0000000000000000000000000000000000000000..664e2db3b2dc5b2370929729649da6f4f4086c97 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Distributions.h @@ -0,0 +1,518 @@ +#pragma once + +#include +#include +#include + +// ROCM hcc doesn't work well with using std:: in kernel functions +#if defined(__CUDA_ARCH__) +#include +#define compat_exp c10::cuda::compat::exp +#define compat_ceil c10::cuda::compat::ceil +#define compat_floor c10::cuda::compat::floor +#define compat_log c10::cuda::compat::log +#define compat_pow c10::cuda::compat::pow +#define compat_sqrt c10::cuda::compat::sqrt +#define compat_tan c10::cuda::compat::tan +#define compat_abs c10::cuda::compat::abs +#define compat_log1p c10::cuda::compat::log1p +#elif defined(__HIPCC__) +#include +#define compat_exp c10::hip::compat::exp +#define compat_ceil c10::hip::compat::ceil +#define compat_floor c10::hip::compat::floor +#define compat_log c10::hip::compat::log +#define compat_pow c10::hip::compat::pow +#define compat_sqrt c10::hip::compat::sqrt +#define compat_tan c10::hip::compat::tan +#define compat_abs c10::hip::compat::abs +#define compat_log1p c10::hip::compat::log1p +#else +#define compat_exp std::exp +#define compat_ceil std::ceil +#define compat_floor std::floor +#define compat_log std::log +#define compat_pow std::pow +#define compat_sqrt std::sqrt +#define compat_tan std::tan +#define compat_abs std::abs +#define compat_log1p std::log1p +#endif + +namespace { + +#if !defined(__CUDA_ARCH__) && !defined(__HIPCC__) +// we cannot use std::isnan directly due to some incompatibility of +// gcc constexpr'ing and nvcc +using std::isnan; +#endif + +// Here sampler_t should be function type scalar_t(void). For gpu +// "sampler" is a device function, but since ROCM doesn't have +// equivalent to nvstd::function, we use a template type parameter to +// capture it. +template +struct BaseSampler { + sampler_t sampler; + C10_DEVICE BaseSampler(const sampler_t& sampler): sampler(sampler) {} + C10_DEVICE scalar_t sample() { + return sampler(); + } +}; + +// The function `sample_gamma` is +// is adapted from Numpy's distributions.c implementation. +// It is MIT licensed, so here is the copyright: + +/* Copyright 2005 Robert Kern (robert.kern@gmail.com) + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +template +C10_DEVICE scalar_t sample_gamma(scalar_t alpha, BaseSampler& standard_uniform, BaseSampler& standard_normal) { + accscalar_t scale = 1.0f; + + // Boost alpha for higher acceptance probability. + if (alpha < 1.0f) { + if (alpha == 0.f) return 0.f; + scale *= compat_pow(1 - standard_uniform.sample(), 1.0f / alpha); + alpha += 1.0f; + } + + // This implements the acceptance-rejection method of Marsaglia and Tsang (2000) + // doi:10.1145/358407.358414 + const accscalar_t d = alpha - 1.0f / 3.0f; + const accscalar_t c = 1.0f / compat_sqrt(9.0f * d); + for (;;) { + accscalar_t x, y; + do { + x = standard_normal.sample(); + y = 1.0f + c * x; + } while (y <= 0); + const accscalar_t v = y * y * y; + const accscalar_t u = 1 - standard_uniform.sample(); + const accscalar_t xx = x * x; + if (u < 1.0f - 0.0331f * xx * xx) + return static_cast(scale * d * v); + if (compat_log(u) < 0.5f * xx + d * (1.0f - v + compat_log(v))) + return static_cast(scale * d * v); + } +} + +/* the functions stirling_approx_tail, binomial_inversion, and btrs are adapted + * from TensorFlow's random_binomial_op.cc implementation. That code is under + * copyright: 2019 The TensorFlow Authors. + * + * It was released under the Apache License, Version 2.0 (the "License"), available at: + * http://www.apache.org/licenses/LICENSE-2.0 + */ + +template +C10_DEVICE scalar_t stirling_approx_tail(scalar_t k) { + const static scalar_t kTailValues[] = { + 0.0810614667953272, + 0.0413406959554092, + 0.0276779256849983, + 0.02079067210376509, + 0.0166446911898211, + 0.0138761288230707, + 0.0118967099458917, + 0.0104112652619720, + 0.00925546218271273, + 0.00833056343336287 + }; + if (k <= 9) { + return kTailValues[static_cast(k)]; + } + scalar_t kp1sq = (k + 1) * (k + 1); + return (1.0 / 12 - (1.0 / 360 - 1.0 / 1260 / kp1sq) / kp1sq) / (k + 1); +} + + +template +C10_DEVICE scalar_t binomial_inversion(scalar_t count, scalar_t prob, BaseSampler& standard_uniform) { + accscalar_t U; + accscalar_t geom_sum = 0; + scalar_t num_geom = 0; + + accscalar_t logprob = compat_log1p(-prob); + + while (1) { + U = standard_uniform.sample(); + accscalar_t geom = compat_ceil(compat_log(U) / logprob); + geom_sum += geom; + if (geom_sum > count) { + break; + } + num_geom = num_geom + 1; + } + return num_geom; +} + +template +C10_DEVICE scalar_t btrs(scalar_t count, scalar_t prob, BaseSampler& standard_uniform) { + scalar_t k; + accscalar_t U, V, us; + + // This is spq in the paper. + const accscalar_t stddev = compat_sqrt(count * prob * (1 - prob)); + + // Other coefficients for Transformed Rejection sampling. + const accscalar_t b = 1.15 + 2.53 * stddev; + const accscalar_t a = -0.0873 + 0.0248 * b + 0.01 * prob; + const accscalar_t c = count * prob + 0.5; + const accscalar_t v_r = 0.92 - 4.2 / b; + const accscalar_t r = prob / (1 - prob); + + const accscalar_t alpha = (2.83 + 5.1 / b) * stddev; + const accscalar_t m = compat_floor((count + 1) * prob); + + while (1) { + U = standard_uniform.sample() - 0.5; + V = standard_uniform.sample(); + + us = 0.5 - compat_abs(U); + k = static_cast(compat_floor((2 * a / us + b) * U + c)); + + // Reject non-sensical answers. + if (k < 0 || k > count) { + continue; + } + // Region for which the box is tight, and we can return our calculated value. + // This should happen 0.86 * v_r times. In the limit as n * p is large, + // the acceptance rate converges to ~79% (and in the lower regime it is ~24%). + if (us >= 0.07 && V <= v_r) { + return k; + } + + // This deviates from Hormann's BTRS algorithm, as there is a log missing. + // For all (u, v) pairs outside of the bounding box, this calculates the + // transformed-reject ratio. + V = compat_log(V * alpha / (a / (us * us) + b)); + accscalar_t upperbound = + ((m + 0.5) * compat_log((m + 1) / (r * (count - m + 1))) + + (count + 1) * compat_log((count - m + 1) / (count - k + 1)) + + (k + 0.5) * compat_log(r * (count - k + 1) / (k + 1)) + + stirling_approx_tail(m) + stirling_approx_tail(count - m) - + stirling_approx_tail(k) - stirling_approx_tail(count - k)); + + if (V <= upperbound) { + return k; + } + } +} + +template +C10_DEVICE scalar_t sample_binomial(scalar_t count, scalar_t prob, BaseSampler& standard_uniform) { + if (count <= 0.0 || prob <= 0.0) { + return 0; + } else if (prob >= 1.0) { + return count; + } else if (prob <= 0.5) { + if (count * prob >= 10.0) { + // btrs + return btrs(count, prob, standard_uniform); + } else { + // binomial inversion + return binomial_inversion(count, prob, standard_uniform); + } + } else if (prob > 0.5) { + scalar_t qprob = 1.0 - prob; + if (count * qprob >= 10.0) { + // btrs + return count - btrs(count, qprob, standard_uniform); + } else { + // count - binomial inversion + return count - binomial_inversion(count, qprob, standard_uniform); + } + } else { + // prob is nan? + return static_cast(NAN); + } +} + +/* + * This function is derived from the implementation of the digamma function in the Cephes Math Library. + * See note [3-Clause BSD License for the Cephes Math Library] in ATen/native/Math.h. + */ +template +C10_DEVICE inline scalar_t digamma_one(scalar_t x) { + constexpr accscalar_t PSI_10 = 2.25175258906672110764; + if (x == 0) { + return INFINITY; + } + accscalar_t additional_summand = 0; + int x_is_integer = x == compat_floor(x); + if (x < 0) { + if (x_is_integer) { + return INFINITY; + } + // it is more standard to write this as recursion, but + // nvcc does not like that + additional_summand = -c10::pi / + compat_tan(c10::pi * x); + x = 1 - x; + } + + // Push x to be >= 10 + accscalar_t result = 0; + while (x < 10) { + result -= 1 / x; + x += 1; + } + if (x == 10) { + return result + PSI_10 + additional_summand; + } + + // Compute asymptotic digamma + static const accscalar_t A[] = { + 8.33333333333333333333E-2, + -2.10927960927960927961E-2, + 7.57575757575757575758E-3, + -4.16666666666666666667E-3, + 3.96825396825396825397E-3, + -8.33333333333333333333E-3, + 8.33333333333333333333E-2, + }; + + accscalar_t y = 0; + if (x < 1.0e17f) { + accscalar_t z = 1.0 / (x * x); + y = z * polevl(z, A, 6); + } + return static_cast( + result + compat_log(x) - (0.5f / x) - y + additional_summand); +} + +// Computes the reparameterized gradient -(d/dalpha cdf(x;alpha)) / pdf(x;alpha) +// for random number x drawn from a standard Gamma distribution Gamma(alpha). +template +C10_HOST_DEVICE scalar_t standard_gamma_grad_one(scalar_t alpha_, scalar_t x_) { + // Use a Taylor series expansion for small x. + accscalar_t x = static_cast(x_); + accscalar_t alpha = static_cast(alpha_); + if (x < 0.8f) { + accscalar_t numer = 1; + accscalar_t denom = alpha; + auto series1 = numer / denom; + auto series2 = numer / (denom * denom); + for (int i = 1; i <= 5; ++i) { + numer *= -x / static_cast(i); + denom += 1; + series1 += numer / denom; + series2 += numer / (denom * denom); + } + const auto pow_x_alpha = compat_pow(x, alpha); + const auto gamma_pdf = compat_pow(x, alpha - 1) * compat_exp(-x); + const auto gamma_cdf = pow_x_alpha * series1; + const auto gamma_cdf_alpha = + (compat_log(x) - digamma_one(alpha)) * + gamma_cdf - + pow_x_alpha * series2; + const auto result = -gamma_cdf_alpha / gamma_pdf; + return isnan(result) ? static_cast( 0.f ) : static_cast(result); + } + + // Use a Rice saddle point expansion for large alpha. + if (alpha > 8.0f) { + if (0.9f * alpha <= x && x <= 1.1f * alpha) { + const auto numer_1 = 1 + 24 * alpha * (1 + 12 * alpha); + const auto numer_2 = 1440 * (alpha * alpha) + 6 * x * (53 - 120 * x) + - 65 * x * x / alpha + alpha * (107 + 3600 * x); + const auto denom = 1244160 * (alpha * alpha) * (alpha * alpha); + return static_cast(numer_1 * numer_2 / denom); + } + const auto denom = compat_sqrt(8 * alpha); + const auto term2 = denom / (alpha - x); + const auto term3 = compat_pow( + x - alpha - alpha * compat_log(x / alpha), + static_cast(-1.5)); + const auto term23 = (x < alpha) ? term2 - term3 : term2 + term3; + const auto term1 = compat_log(x / alpha) * term23 - + compat_sqrt(2 / alpha) * (alpha + x) / ((alpha - x) * (alpha - x)); + const auto stirling = 1 + 1 / (12 * alpha) * (1 + 1 / (24 * alpha)); + const auto numer = x * term1; + return static_cast(-stirling * numer / denom); + } + + // Use a bivariate rational approximation to the reparameterized gradient. + const auto u = compat_log(x / alpha); + const auto v = compat_log(alpha); + static const accscalar_t coef_uv[3][8] = { + {0.16009398, -0.094634809, 0.025146376, -0.0030648343, + 1, 0.32668115, 0.10406089, 0.0014179084}, + {0.53487893, 0.1298071, 0.065735949, -0.0015649758, + 0.16639465, 0.020070113, -0.0035938915, -0.00058392623}, + {0.040121004, -0.0065914022, -0.0026286047, -0.0013441777, + 0.017050642, -0.0021309326, 0.00085092367, -1.5247877e-07}, + }; + accscalar_t coef_v[8]; + for (int i = 0; i < 8; ++ i) { + coef_v[i] = coef_uv[0][i] + u * (coef_uv[1][i] + u * coef_uv[2][i]); + } + const auto p = coef_v[0] + v * (coef_v[1] + v * (coef_v[2] + v * coef_v[3])); + const auto q = coef_v[4] + v * (coef_v[5] + v * (coef_v[6] + v * coef_v[7])); + return static_cast(compat_exp(p / q)); +} + +// Approximate reparameterized gradient of Beta(x,alpha,beta) wrt alpha. +// Assumes x is close to zero and uses a Taylor expansion. +template +C10_DEVICE inline scalar_t _beta_grad_alpha_small(scalar_t x, scalar_t alpha, scalar_t beta) { + const scalar_t factor = digamma_one(alpha) + - digamma_one(alpha + beta) - compat_log(x); + scalar_t numer = 1; + scalar_t series = numer / alpha * (factor + 1 / alpha); + for (int i = 1; i <= 10; ++i) { + scalar_t casted_i = static_cast(i); + numer *= (casted_i - beta) * x / casted_i; + const scalar_t denom = alpha + casted_i; + series += numer / denom * (factor + 1 / denom); + } + const scalar_t result = x * compat_pow(1 - x, -beta) * series; + return isnan(result) ? static_cast( 0.f ) : result; +} + +// Approximate reparameterized gradient of Beta(x,alpha,beta) wrt beta. +// Assumes x is close to zero and uses a Taylor expansion. +template +C10_DEVICE inline scalar_t _beta_grad_beta_small(scalar_t x, scalar_t alpha, scalar_t beta) { + const scalar_t factor = digamma_one(alpha + beta) - digamma_one(beta); + scalar_t numer = 1, betas = 1, dbetas = 0, series = factor / alpha; + for (int i = 1; i <= 8; ++i) { + scalar_t casted_i = static_cast(i); + numer *= -x / casted_i; + dbetas = dbetas * (beta - casted_i) + betas; + betas = betas * (beta - casted_i); + series += numer / (alpha + casted_i) * (dbetas + factor * betas); + } + const scalar_t result = -compat_pow(1 - x, 1 - beta) * series; + return isnan(result) ? static_cast( 0.f ) : result; +} + +// Approximate reparameterized gradient of Beta(x,alpha,beta) wrt alpha. +// Assumes alpha and beta are both large and uses a Rice saddle point expansion. +// To ensure numerical stability, this computation is performed at higher precision. +template +C10_DEVICE inline scalar_t _beta_grad_alpha_mid(accscalar_t x, accscalar_t alpha, accscalar_t beta) { + const accscalar_t total = alpha + beta; + const accscalar_t mean = alpha / total; + const accscalar_t std = compat_sqrt(alpha * beta / (total + 1)) / total; + if (mean - 0.1 * std <= x && x <= mean + 0.1 * std) { + // Avoid the singularity at x = mean. + const accscalar_t poly = 47 * x * (beta * beta) * (beta * beta) + alpha * ( + (43 + 20 * (16 + 27 * beta) * x) * (beta * beta) * beta + alpha * ( + 3 * (59 + 180 * beta - 90 * x) * (beta * beta) + alpha * ( + (453 + 1620 * beta * (1 - x) - 455 * x) * beta + alpha * ( + 8 * (1 - x) * (135 * beta - 11))))); + const accscalar_t prefactor_num = (1 + 12 * alpha) * (1 + 12 * beta) / (total * total); + const accscalar_t prefactor_den = 12960 * alpha * alpha * alpha * beta * beta * (1 + 12 * total); + return prefactor_num / (1 - x) * poly / prefactor_den; + } + const accscalar_t prefactor = -x / compat_sqrt(2 * alpha * beta / total); + const accscalar_t stirling = (1 + 1 / (12 * alpha) + 1 / (288 * alpha * alpha)) + * (1 + 1 / (12 * beta) + 1 / (288 * beta * beta)) + / (1 + 1 / (12 * total) + 1 / (288 * total * total)); + const accscalar_t term1_num = 2 * (alpha * alpha) * (x - 1) + alpha * beta * (x - 1) - x * (beta * beta); + const accscalar_t axbx = alpha * (x - 1) + beta * x; + const accscalar_t term1_den = compat_sqrt(2 * alpha / beta) * compat_pow(total, static_cast(1.5f)) * axbx * axbx; + const accscalar_t term1 = term1_num / term1_den; + const accscalar_t term2 = 0.5f * compat_log(alpha / (total * x)); + const accscalar_t term3_num = compat_sqrt(8 * alpha * beta / total); + const accscalar_t term3_den = beta * x + alpha * (x - 1); + const accscalar_t term3 = term3_num / term3_den; + const accscalar_t term4_base = beta * compat_log(beta / (total * (1 - x))) + + alpha * compat_log(alpha / (total * x)); + const accscalar_t term4 = compat_pow(term4_base, static_cast(-1.5f)); + const accscalar_t term1234 = term1 + term2 * (term3 + (x < mean ? term4 : -term4)); + return static_cast(stirling * prefactor * term1234); +} + +// Computes a scaled reparameterized gradient +// -(d/dalpha cdf(x;alpha,beta)) / pdf(x;alpha,beta) / (1-x) +// for random number x drawn from a Beta distribution Beta(alpha,beta). +// This function inputs total=alpha+beta to make it easy to implement +// Dirichlet reparameterized gradients in terms of Betas. +template +C10_HOST_DEVICE inline scalar_t dirichlet_grad_one(scalar_t x, scalar_t alpha, scalar_t total) { + accscalar_t x_ = static_cast(x); + accscalar_t alpha_ = static_cast(alpha); + accscalar_t total_ = static_cast(total); + + const scalar_t beta = total - alpha; + const accscalar_t beta_ = total_ - alpha_; + const scalar_t boundary = total * x * (1 - x); + + // Use an asymptotic approximation for x close to 0. + if (x <= 0.5f && boundary < 2.5f) { + return _beta_grad_alpha_small(x, alpha, beta); + } + + // Use an asymptotic approximation for x close to 1. + if (x >= 0.5f && boundary < 0.75f) { + return -_beta_grad_beta_small(1 - x, beta, alpha); + } + + // Use an asymptotic approximation when alpha and (total - alpha) are both large. + if (alpha > 6 && beta > 6) { + return _beta_grad_alpha_mid(x_, alpha_, beta_); + } + + // Use a rational correction to an analytic approximation. + static const accscalar_t c[2][3][3][4] = { + {{{1.003668233, -0.01061107488, -0.0657888334, 0.01201642863}, + {0.6336835991, -0.3557432599, 0.05486251648, -0.001465281033}, + {-0.03276231906, 0.004474107445, 0.002429354597, -0.0001557569013}}, + {{0.221950385, -0.3187676331, 0.01799915743, 0.01074823814}, + {-0.2951249643, 0.06219954479, 0.01535556598, 0.001550077057}, + {0.02155310298, 0.004170831599, 0.001292462449, 6.976601077e-05}}, + {{-0.05980841433, 0.008441916499, 0.01085618172, 0.002319392565}, + {0.02911413504, 0.01400243777, -0.002721828457, 0.000751041181}, + {0.005900514878, -0.001936558688, -9.495446725e-06, 5.385558597e-05}}}, + {{{1, -0.02924021934, -0.04438342661, 0.007285809825}, + {0.6357567472, -0.3473456711, 0.05454656494, -0.002407477521}, + {-0.03301322327, 0.004845219414, 0.00231480583, -0.0002307248149}}, + {{0.5925320577, -0.1757678135, 0.01505928619, 0.000564515273}, + {0.1014815858, -0.06589186703, 0.01272886114, -0.0007316646956}, + {-0.007258481865, 0.001096195486, 0.0003934994223, -4.12701925e-05}}, + {{0.06469649321, -0.0236701437, 0.002902096474, -5.896963079e-05}, + {0.001925008108, -0.002869809258, 0.0008000589141, -6.063713228e-05}, + {-0.0003477407336, 6.959756487e-05, 1.097287507e-05, -1.650964693e-06}}}, + }; + const accscalar_t u = compat_log(x_); + const accscalar_t a = compat_log(alpha_) - u; + const accscalar_t b = compat_log(total_) - a; + const accscalar_t pow_u[3] = {1, u, u * u}; + const accscalar_t pow_a[3] = {1, a, a * a}; + accscalar_t p = 0.0; + accscalar_t q = 0.0; + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 3; ++j) { + const accscalar_t ua = pow_u[i] * pow_a[j]; + p += ua * (c[0][i][j][0] + b * (c[0][i][j][1] + b * (c[0][i][j][2] + b * c[0][i][j][3]))); + q += ua * (c[1][i][j][0] + b * (c[1][i][j][1] + b * (c[1][i][j][2] + b * c[1][i][j][3]))); + } + } + const accscalar_t approx = x_ * (digamma_one(total_) - digamma_one(alpha_)) / beta_; + return static_cast(p / q * approx); +} + +} // namespace diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/EmbeddingBag.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/EmbeddingBag.h new file mode 100644 index 0000000000000000000000000000000000000000..eb29e1171dcd60a5d9fbc5cb35b488603c1b0886 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/EmbeddingBag.h @@ -0,0 +1,153 @@ +#include +#include +#include + +#ifdef USE_FBGEMM +#include +#endif + +namespace at::native { + +enum class EmbeddingBagMode { + SUM = 0, + MEAN = 1, + MAX = 2, +}; + +[[maybe_unused]] static bool operator==(int64_t op1, EmbeddingBagMode op2) { + return op1 == static_cast(op2); +} + +[[maybe_unused]] static bool operator!=(int64_t op1, EmbeddingBagMode op2) { + return !(op1 == op2); +} + +void check_arguments( + const Tensor& weight, + const Tensor& indices, + const Tensor& offsets, + const int64_t mode, + const std::optional& per_sample_weights, + bool include_last_offset); + +void make_bag_size_out( + Tensor& bag_size_out, + const Tensor& offsets, + const Tensor& indices, + const int64_t mode, + const bool include_last_offset, + const bool requires_grad); + +void make_max_indices_out( + Tensor& max_indices_out, + const Tensor& weight, + const Tensor& indices, + const Tensor& offsets, + const Tensor& bag_size, + const int64_t mode, + bool include_last_offset); + +void make_offset2bag_out( + Tensor& offset2bag, + Tensor& output, + const Tensor& weight, + const Tensor& indices, + const Tensor& offsets, + const int64_t mode, + const std::optional& per_sample_weights, + const int64_t padding_idx = -1); + +#ifdef USE_FBGEMM + +template +struct _CallbackAndBlockSize { + using TCallback = typename fbgemm::EmbeddingSpMDMKernelSignature::Type; + + int64_t blockSize = -1; + TCallback callback = nullptr; + + static TCallback generateCallback(int64_t block_size) { + return fbgemm::GenerateEmbeddingSpMDM( + block_size, + has_weight, + /* normalize_by_lengths */false, + /* prefetch */16, + /* is_weight_positional */false, + /* use_offsets */true); + } + + _CallbackAndBlockSize() = default; + + explicit _CallbackAndBlockSize(std::optional maybe_block_size) + : blockSize(maybe_block_size.value_or(-1)) + , callback(maybe_block_size.has_value() ? generateCallback(maybe_block_size.value()) : nullptr) + {} +}; + +template +struct _EmbeddingBagKernelCacheImpl : private StorageMixins... { + + _EmbeddingBagKernelCacheImpl() = default; + // use each of the mixins to store corresponding kernel and block size + explicit _EmbeddingBagKernelCacheImpl(std::optional maybe_block_size) + : StorageMixins(maybe_block_size)... + {} + + // this method is thread safe (call sites may call from different threads) + template + typename _CallbackAndBlockSize::TCallback + getCallback(int64_t block_size) const { + // if the cache doesn't store the kernel for the incoming block size + // (so it is different from the one stored in corresponding mixin) + // regenerate the kernel (not writing it into the cache so we avoid locks) + if (block_size != _CallbackAndBlockSize::blockSize) { + return _CallbackAndBlockSize::generateCallback(block_size); + } + // else retrieve the cached kernel from the corresponding mixin + return _CallbackAndBlockSize::callback; + } +}; + +// instantiate the cache with the list of storage mixins +// for each of the 8 _EmbeddingBagKernelCache* usages in the EmbeddingBag.cpp impl file +using _EmbeddingBagKernelCache = _EmbeddingBagKernelCacheImpl< + _CallbackAndBlockSize, + _CallbackAndBlockSize, + _CallbackAndBlockSize, + _CallbackAndBlockSize, + _CallbackAndBlockSize, + _CallbackAndBlockSize, + _CallbackAndBlockSize, + _CallbackAndBlockSize>; +#else +struct _EmbeddingBagKernelCache { + explicit _EmbeddingBagKernelCache(std::optional /* maybe_block_size */) {} +}; +#endif + +void _embedding_bag_cpu_impl_out(Tensor& output, Tensor& offset2bag, + Tensor& bag_size, Tensor* max_indices, + const Tensor &weight, const Tensor &indices, + const Tensor &offsets, const int64_t mode = 0, + const std::optional& per_sample_weights = std::nullopt, + bool include_last_offset = false, + int64_t padding_idx = -1, + _EmbeddingBagKernelCache* fbgemm_kernel_cache = nullptr); + +void _embedding_bag_cpu_out( + at::Tensor& output, + at::Tensor& offset2bag, + at::Tensor& bag_size, + at::Tensor* p_max_indices, + const at::Tensor& weight, + const at::Tensor& indices, + const at::Tensor& offsets, + const bool scale_grad_by_freq, + const int64_t mode, + const bool sparse, + const std::optional& per_sample_weights, + const bool include_last_offset, + const std::optional& padding_idx, + _EmbeddingBagKernelCache* fbgemm_kernel_cache = nullptr); + +} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ForeachUtils.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ForeachUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..a8fbe13b8da0b9016e88c937ccbd2781eee0e160 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ForeachUtils.h @@ -0,0 +1,396 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + +#include +#include + +namespace at::native { +namespace { +// Check if tensor list has either a boolean tensor or a integer tensor +inline bool has_integral_tensor(TensorList tensors, const bool includeBool) { + return std::any_of( + tensors.begin(), tensors.end(), [&includeBool](const auto& t) { + return at::isIntegralType(t.scalar_type(), includeBool); + }); +} +// check if tensor list has bool tensors +inline bool has_bool_tensor(TensorList tensors) { + return std::any_of(tensors.begin(), tensors.end(), [](const auto& t) -> bool { + return t.scalar_type() == ScalarType::Bool; + }); +} + +// Check foreach API restrictions +// - Tensor lists must be non-empty. +// - All TensorLists and ScalarLists must have the same number of elements. +// - Corresponding tensors must have the same size. +inline void check_foreach_api_restrictions(TensorList tensors) { + TORCH_CHECK(!tensors.empty(), "Tensor list must have at least one tensor."); +} + +inline void check_foreach_api_restrictions( + TensorList tensors, + ArrayRef scalars) { + check_foreach_api_restrictions(tensors); + TORCH_CHECK( + tensors.size() == scalars.size(), + "Tensor list must have same number of elements as scalar list."); +} + +inline void check_foreach_api_restrictions( + TensorList tensors1, + TensorList tensors2) { + TORCH_CHECK(!tensors1.empty(), "Tensor list must have at least one tensor."); + TORCH_CHECK(!tensors2.empty(), "Tensor list must have at least one tensor."); + TORCH_CHECK( + tensors1.size() == tensors2.size(), + "Tensor lists must have the same number of tensors, got ", + tensors1.size(), + " and ", + tensors2.size()); +} + +inline void check_foreach_api_restrictions( + TensorList tensors1, + TensorList tensors2, + TensorList tensors3) { + TORCH_CHECK(!tensors1.empty(), "Tensor list must have at least one tensor."); + TORCH_CHECK(!tensors2.empty(), "Tensor list must have at least one tensor."); + TORCH_CHECK(!tensors3.empty(), "Tensor list must have at least one tensor."); + TORCH_CHECK( + tensors1.size() == tensors2.size(), + "Tensor lists must have the same number of tensors, got ", + tensors1.size(), + " and ", + tensors2.size()); + TORCH_CHECK( + tensors1.size() == tensors3.size(), + "Tensor lists must have the same number of tensors, got ", + tensors1.size(), + " and ", + tensors3.size()); +} + +inline void check_foreach_api_restrictions( + TensorList tensors1, + TensorList tensors2, + TensorList tensors3, + ArrayRef scalars) { + check_foreach_api_restrictions(tensors1, tensors2, tensors3); + TORCH_CHECK( + tensors1.size() == scalars.size(), + "Tensor list must have same number of elements as scalar list, got ", + tensors1.size(), + " and ", + scalars.size()); +} + +// Helper function called in check_fast_path_restrictions to check whether all +// corresponding tensors (aligning in index across the tensorLists) share the +// same device and dtype. +inline bool _check_tensors_share_device_and_dtype( + ArrayRef tensorLists, + const bool skip_dtype_check = false) { + const auto expected_dtype = tensorLists[0][0].dtype(); + const auto expected_device = tensorLists[0][0].device(); + + auto is_tensor_okay = [&](const Tensor& tensor) { + return (skip_dtype_check || tensor.dtype() == expected_dtype) && + tensor.device() == expected_device && tensor.layout() == at::kStrided && + tensor.is_non_overlapping_and_dense(); + }; + + for (const auto& tensorList : tensorLists) { + for (const auto& tensor : tensorList) { + if (!is_tensor_okay(tensor)) { + return false; + } + } + } + + return true; +} + +// Helper function called in check_fast_path_restrictions to check if +// corresponding tensors in tensor lists have the same sizes and strides. +inline bool _check_tensors_share_sizes_and_strides( + ArrayRef tensorLists) { + auto is_diff_stride = [](const IntArrayRef& size, + const IntArrayRef& left_stride, + const IntArrayRef& right_stride) -> bool { + const size_t size_size = size.size(); + for (const auto dim : c10::irange(size_size)) { + if (size[dim] == 1) + continue; + if (left_stride[dim] != right_stride[dim]) { + return true; + } + } + return false; + }; + for (const auto i : c10::irange(1, tensorLists.size())) { + for (const auto j : c10::irange(tensorLists[0].size())) { + if (tensorLists[0][j].sizes() != tensorLists[i][j].sizes() || + is_diff_stride( + tensorLists[0][j].sizes(), + tensorLists[0][j].strides(), + tensorLists[i][j].strides())) { + return false; + } + } + } + + return true; +} + +// Helper function called in check_fast_path_restrictions to check whether +// all tensors type promote properly with the scalars in scalarList. This +// function assumes that _check_tensors_share_device_and_dtype has already been +// called so that all corresponding tensors in tensorLists have the same dtype. +// Then, it is sufficient to check the type promotion with just one tensorList. +inline bool _check_tensors_do_type_promotion_with_scalars( + TensorList tensorList, + ArrayRef scalarList = {}, + bool does_op_promote_integer_inputs_to_float = false) { + for (const auto i : c10::irange(tensorList.size())) { + // For division, integer inputs will result in float. + if (does_op_promote_integer_inputs_to_float) { + if (at::isIntegralType( + tensorList[i].scalar_type(), /*includeBool*/ true)) { + return false; + } + } + if (!scalarList.empty()) { + const auto& scalar = + scalarList.size() == 1 ? scalarList[0] : scalarList[i]; + const auto& tensor = tensorList[i]; + // note(mkozuki): This check might be responsible for + // `_foreach_add(bool_tensors, bool_tensors)` being pushed to slow path. + if (tensor.scalar_type() != at::native::result_type(scalar, tensor)) { + return false; + } + } + } + + return true; +} + +// To go via 'fast' path, several conditions must be satisfied +// - All tensors in all lists must have the same dtype. +// - All tensors must be on the same device +// - All tensors must have strided layout +// - All tensors must be non-overlapping and dense +// - Resulting tensor must have the same dtype as the input one + +// [note: what's ``does_op_promote_integer_inputs_to_float=true``?] +// ``does_op_promote_integer_inputs_to_float=true`` means that the result of +// the op will be float even if inputs are integer or boolean, which +// currently fast path does not support. In short, this flag, when +// turned on, gatekeeps the op from going down the fastpath. + +// Please, make sure to call check_foreach_api_restrictions before calling this +// method. There is a set of preconditions that have to be satisfied. +inline bool check_fast_path_restrictions( + ArrayRef tensorLists, + ArrayRef scalarList = {}, + bool does_op_promote_integer_inputs_to_float = false) { + return _check_tensors_share_device_and_dtype(tensorLists) && + _check_tensors_share_sizes_and_strides(tensorLists) && + _check_tensors_do_type_promotion_with_scalars( + tensorLists[0], + scalarList, + does_op_promote_integer_inputs_to_float); +} + +inline std::vector convert_tensor_to_scalar_list( + const Tensor& scalarList_, + int64_t expect_length) { + std::vector scalarList; + TORCH_CHECK( + scalarList_.device() == c10::kCPU, + "Expected scalars to be on CPU, got ", + scalarList_.device(), + " instead."); + TORCH_CHECK( + scalarList_.is_contiguous(), "Expected scalars to be contiguous."); + TORCH_CHECK( + scalarList_.dim() == 1, + "Expected packed scalar Tensor to be of dimension 1. Got ", + scalarList_.dim(), + " instead."); + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( + kComplexHalf, + kHalf, + kBool, + kBFloat16, + scalarList_.scalar_type(), + "convert_tensor_to_scalar_list", + [&]() { + const scalar_t* scalar_data = scalarList_.const_data_ptr(); + TORCH_CHECK( + (expect_length == scalarList_.size(0)), + "Expected length of scalars to match input of length ", + expect_length, + " but got ", + scalarList_.size(0), + " instead."); + for (int64_t i = 0; i < scalarList_.size(0); i++) { + scalarList.emplace_back(scalar_data[i]); + } + }); + return scalarList; +} + +// see: [note: what's ``does_op_promote_integer_inputs_to_float=true``?] +inline bool can_use_fast_route( + ArrayRef tensorLists, + ArrayRef scalarList = {}, + bool does_op_promote_integer_inputs_to_float = false) { + return check_fast_path_restrictions( + tensorLists, scalarList, does_op_promote_integer_inputs_to_float); +} + +// see: [note: what's ``does_op_promote_integer_inputs_to_float=true``?] +inline bool can_use_fast_route( + TensorList tensors1, + TensorList tensors2, + bool does_op_promote_integer_inputs_to_float = false) { + return can_use_fast_route( + {tensors1, tensors2}, {}, does_op_promote_integer_inputs_to_float); +} + +using DeviceDtypeKey = std::pair; +using IndicesT = std::vector; +using nested_optional_tensorvec_t = + std::vector>>; +using TensorsAndIndicesT = std::pair; +using FlatMap = std::unordered_map< + DeviceDtypeKey, + TensorsAndIndicesT, + ParamsHash>; + +inline FlatMap _group_tensors_by_first_tensors_device_and_dtype( + const nested_optional_tensorvec_t& nested_tensorlist, + const bool with_indices) { + FlatMap grouped_tensors_with_indices; + + TORCH_CHECK(!nested_tensorlist.empty()); + TORCH_CHECK(!nested_tensorlist[0].empty()); + const auto num_lists = nested_tensorlist.size(); + const auto num_tensors = nested_tensorlist[0].size(); + + TORCH_CHECK(std::all_of( + nested_tensorlist.cbegin(), + nested_tensorlist.cend(), + [&](const auto& tensorlist) -> bool { + // note(crcrpar): Allow empty tensorlists following + // ref: + // https://github.com/pytorch/pytorch/blob/85885301fd3c6adb8b9dc3cf7afadf6945566684/torch/utils/_foreach_utils.py#L21-L24 + return tensorlist.size() == num_tensors || tensorlist.size() == 0; + })); + + for (const auto& tensor_index : c10::irange(num_tensors)) { + const auto key = [&]() -> DeviceDtypeKey { + const auto t = nested_tensorlist[0][tensor_index]; + TORCH_CHECK( + t.has_value(), + "Tensors of the first list of nested Tensor lists are supposed to be defined but ", + "the ", + tensor_index, + "-th Tensor is not."); + return {t->device(), t->scalar_type()}; + }(); + TORCH_CHECK( + std::all_of( + nested_tensorlist.cbegin(), + nested_tensorlist.cend(), + [&](const auto& tensorlist) -> bool { + if (tensorlist.size() == 0) { + return true; + } + const auto& tensor = tensorlist[tensor_index]; + // note(crcrpar): Currently the scope of this function is + // optimizers so there could be `state_steps` and other scalars + // whose elements are float tensors no matter what the parameter's + // dtype is. + if (!tensor.has_value()) { + return true; + } else { + const auto s = tensor->scalar_type(); + const auto d = tensor->device(); + // Note: `step` or `state_step` is float32 by default. + if (key.first == d) { + return key.second == s || s == at::ScalarType::Float || + s == at::ScalarType::Double; + } else if (d.is_cpu()) { + // note(crcrpar): There are some test cases (e.g. + // TestOptim::test_adam) where state_steps are on CPU and the + // others are on CUDA. Currently a state_step Tensor has the + // dtype of float. + return s == at::ScalarType::Float || + s == at::ScalarType::Double; + } else { + return false; + } + } + }), + "Tensors of the same index must be on the same device and the same dtype except `step` tensors that can be CPU and float32/64 notwithstanding"); + if (!grouped_tensors_with_indices.count(key)) { + grouped_tensors_with_indices.insert( + {key, + TensorsAndIndicesT{ + [&]() -> nested_optional_tensorvec_t { + nested_optional_tensorvec_t nested_tensorvec; + nested_tensorvec.reserve(num_lists); + for (const auto& i : c10::irange(num_lists)) { + std::vector> tensors; + if (!nested_tensorlist[i].empty()) { + // NB: num_tensors is the max possible length for any of + // the inner lists of tensor references. Reserving the max + // trades memory for perf. This should not have significant + // impact. + tensors.reserve(num_tensors); + } + nested_tensorvec.emplace_back(tensors); + } + return nested_tensorvec; + }(), + [&]() -> IndicesT { + if (!with_indices) { + return {}; + } else { + IndicesT indices; + indices.reserve(num_tensors); + return indices; + } + }()}}); + } + for (const auto& list_index : c10::irange(num_lists)) { + if (!nested_tensorlist[list_index].empty()) { + grouped_tensors_with_indices[key].first[list_index].emplace_back( + nested_tensorlist[list_index][tensor_index]); + } + } + if (with_indices) { + grouped_tensors_with_indices[key].second.emplace_back(tensor_index); + } + } + + return grouped_tensors_with_indices; +} + +} // namespace +} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/FusedAdagrad.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/FusedAdagrad.h new file mode 100644 index 0000000000000000000000000000000000000000..f1e415ba9a3dd826626223af39264c82ab4899ee --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/FusedAdagrad.h @@ -0,0 +1,20 @@ +#include +#include + +namespace at::native { + +using fused_adagrad_fn = void (*)( + const at::Tensor& param, + const at::Tensor& grad, + const at::Tensor& state_sum, + const at::Tensor& state_step, + const double lr, + const double lr_decay, + const double weight_decay, + const double eps, + const bool maximize, + const float* grad_scale_ptr); + +DECLARE_DISPATCH(fused_adagrad_fn, fused_adagrad_stub); + +} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/FusedAdam.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/FusedAdam.h new file mode 100644 index 0000000000000000000000000000000000000000..2e26cc2d84c6dab2bde4fd73f4a601c30faa0edd --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/FusedAdam.h @@ -0,0 +1,27 @@ +#include +#include + +namespace at::native { + +enum class ADAM_MODE : uint8_t { ORIGINAL = 0, ADAMW = 1 }; + +using fused_adam_fn = void (*)( + const at::Tensor& param, + const at::Tensor& grad, + const at::Tensor& exp_avg, + const at::Tensor& exp_avg_sq, + const at::Tensor& max_exp_avg_sq, + const at::Tensor& state_step, + const double lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool amsgrad, + const bool maximize, + const float* grad_scale_ptr, + const ADAM_MODE); + +DECLARE_DISPATCH(fused_adam_fn, fused_adam_stub); + +} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/FusedSGD.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/FusedSGD.h new file mode 100644 index 0000000000000000000000000000000000000000..c58d10b6a4a4e585cb6196f553c73aa667d5150a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/FusedSGD.h @@ -0,0 +1,21 @@ +#include +#include + +namespace at::native { + +using fused_sgd_fn = void (*)( + const at::Tensor& param, + const at::Tensor& grad, + const at::Tensor& momentum_buffer, + const double weight_decay, + const double momentum, + const double lr, + const double dampening, + const bool nesterov, + const bool maximize, + const bool is_first_step, + const float* grad_scale_ptr); + +DECLARE_DISPATCH(fused_sgd_fn, fused_sgd_stub); + +} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/GridSamplerUtils.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/GridSamplerUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..f783043c7961222302ed0ff2514ffbf138cafac2 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/GridSamplerUtils.h @@ -0,0 +1,105 @@ +#pragma once + +// See NOTE: [Tensor vs. TensorBase] +// https://github.com/pytorch/pytorch/pull/66979 +#include +#include +#include + +namespace at::native { + +namespace detail { + +enum class GridSamplerInterpolation {Bilinear, Nearest, Bicubic}; +enum class GridSamplerPadding {Zeros, Border, Reflection}; + +} // namespace detail + +using detail::GridSamplerInterpolation; +using detail::GridSamplerPadding; + +// See NOTE [ grid_sampler Native Functions ]. +inline void check_grid_sampler_common( + const TensorBase& input, + const TensorBase& grid +) { + auto input_opt = input.options(); + auto grid_opt = grid.options(); + + TORCH_CHECK( + input.defined(), + "grid_sampler(): expected input to not be undefined"); + TORCH_CHECK( + grid.defined(), + "grid_sampler(): expected grid to not be undefined"); + TORCH_CHECK( + input_opt.device() == grid_opt.device(), + "grid_sampler(): expected input and grid to be on same device, but input " + "is on ", input_opt.device(), " and grid is on ", grid_opt.device()); + TORCH_CHECK( + input_opt.layout() == kStrided && grid_opt.layout() == kStrided, + "grid_sampler(): expected input and grid to have torch.strided layout, but " + "input has ", input_opt.layout(), " and grid has ", grid_opt.layout()); + TORCH_CHECK( + input.size(0) == grid.size(0), + "grid_sampler(): expected grid and input to have same batch size, but got " + "input with sizes ", input.sizes(), " and grid with sizes ", grid.sizes()); + TORCH_CHECK( + grid.size(-1) == input.dim() - 2, + "grid_sampler(): expected grid to have size ", input.dim() - 2, " in last " + "dimension, but got grid with sizes ", grid.sizes()); + + for (const auto i : c10::irange(2, input.dim())) { + TORCH_CHECK(input.size(i) > 0, + "grid_sampler(): expected input to have non-empty spatial dimensions, " + "but input has sizes ", input.sizes(), " with dimension ", i, " being " + "empty"); + } +} + +// See NOTE [ grid_sampler Native Functions ]. +inline void check_grid_sampler_2d( + const TensorBase& input, + const TensorBase& grid +) { + TORCH_CHECK( + input.dim() == 4 && input.dim() == grid.dim(), + "grid_sampler(): expected 4D input and grid with same number of " + "dimensions, but got input with sizes ", input.sizes(), + " and grid with sizes ", grid.sizes()); +} + +// See NOTE [ grid_sampler Native Functions ]. +inline void check_grid_sampler_3d( + const TensorBase& input, + const TensorBase& grid, + int64_t interpolation_mode +) { + TORCH_CHECK( + input.dim() == 5 && input.dim() == grid.dim(), + "grid_sampler(): expected 5D input and grid with same number of " + "dimensions, but got input with sizes ", input.sizes(), + " and grid with sizes ", grid.sizes()); + TORCH_CHECK( + !(input.dim() == 5 && + static_cast(interpolation_mode) == + GridSamplerInterpolation::Bicubic), + "grid_sampler(): bicubic interpolation only supports 4D input"); +} + +// See NOTE [ grid_sampler Native Functions ]. +// cudnn does not support inputs larger than 1024. +inline bool cond_cudnn_grid_sampler( + const TensorBase& input, + const TensorBase& grid +) { + return ( + at::native::cudnn_is_acceptable(input) && + at::native::cudnn_is_acceptable(grid) && + at::native::canUse32BitIndexMath(input) && + at::native::canUse32BitIndexMath(grid) && + input.dim() == 4 && + input.sym_size(1) <= 1024); +} + +} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Histogram.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Histogram.h new file mode 100644 index 0000000000000000000000000000000000000000..fee7e06b87258138bcafb360cd48955640b25eeb --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Histogram.h @@ -0,0 +1,16 @@ +#pragma once + +#include +#include + +namespace at::native { + +using histogramdd_fn = void(*)(const Tensor&, const std::optional&, bool, Tensor&, const TensorList&); +using histogramdd_linear_fn = void(*)(const Tensor&, const std::optional&, bool, Tensor&, const TensorList&, bool); +using histogram_select_outer_bin_edges_fn = void(*)(const Tensor& input, const int64_t N, std::vector &leftmost_edges, std::vector &rightmost_edges); + +DECLARE_DISPATCH(histogramdd_fn, histogramdd_stub); +DECLARE_DISPATCH(histogramdd_linear_fn, histogramdd_linear_stub); +DECLARE_DISPATCH(histogram_select_outer_bin_edges_fn, histogram_select_outer_bin_edges_stub); + +} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/IndexKernel.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/IndexKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..8b0a787f0e87a64bced49d9c920ebd66262bd26a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/IndexKernel.h @@ -0,0 +1,41 @@ +#pragma once +#include +#include + +namespace at { +class Tensor; +class TensorBase; +struct TensorIterator; +struct TensorIteratorBase; +} + +namespace c10 { +class Scalar; +} + +namespace at::native { + +using index_fn = void(*)(TensorIteratorBase &, IntArrayRef indexed_sizes, IntArrayRef indexed_strides); +using index_fill_fn = void(*)(TensorIterator & iter, int64_t dim, int64_t self_dim_size, int64_t self_dim_stride, const Scalar& source); +using index_copy_fn = void(*)(TensorIterator & iter, int64_t dim, int64_t self_dim_size, int64_t self_dim_stride); +using index_put_fn = void(*)(TensorIterator &, IntArrayRef indexed_sizes, IntArrayRef indexed_strides, bool accumulate); +using put_fn = void(*)(TensorIterator & iter, const TensorBase& self, const bool accumulate); +using take_fn = void(*)(TensorIterator & iter, const TensorBase& input); +using flip_fn = void(*)(TensorIterator &, const bool); +using masked_fill_fn = void(*)(TensorIterator &, const Scalar& scalar); +using masked_select_fn = void(*)(TensorIterator &, int64_t orig_stride); +using masked_scatter_fn = void(*)(TensorIterator &, const TensorBase &); + +DECLARE_DISPATCH(index_fn, index_stub); +DECLARE_DISPATCH(index_fill_fn, index_fill_stub); +DECLARE_DISPATCH(index_copy_fn, index_copy_stub); +DECLARE_DISPATCH(index_put_fn, index_put_stub); +DECLARE_DISPATCH(put_fn, put_stub); +DECLARE_DISPATCH(take_fn, take_stub); +DECLARE_DISPATCH(flip_fn, flip_stub); +DECLARE_DISPATCH(masked_fill_fn, masked_fill_stub); +DECLARE_DISPATCH(masked_select_fn, masked_select_serial_stub); +DECLARE_DISPATCH(masked_select_fn, masked_select_stub); +DECLARE_DISPATCH(masked_scatter_fn, masked_scatter_stub); + +} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/IndexingUtils.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/IndexingUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..cef21c3fd80d5d82ebd16a43d58510ed793871c6 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/IndexingUtils.h @@ -0,0 +1,160 @@ +#pragma once +#include +#include +#include +#include +#include + +namespace at::native { + +[[noreturn]] +static void invalid_mask(const Tensor & self, int64_t idx, const Tensor & mask, int64_t maskIdx) { + TORCH_CHECK_INDEX(false, "The shape of the mask ", mask.sizes(), " at index ", maskIdx, + " does not match the shape of the indexed tensor ", self.sizes(), " at index ", idx); +} + + +static C10_UNUSED std::vector expandTensors(const Tensor & self, IOptTensorListRef indices) { + // If indices come in as ByteTensor or BoolTensor (masks), expand them into the equivalent indexing by LongTensors + std::vector result; + for (const auto& index_opt : indices) { + if (!index_opt.has_value()) { + result.emplace_back(); + } else { + const auto& index = *index_opt; + if (index.scalar_type() == kByte || index.scalar_type() == kBool) { + if (index.scalar_type() == kByte) { + TORCH_WARN("indexing with dtype torch.uint8 is now deprecated," \ + " please use a dtype torch.bool instead."); + } + // The sizes of the ByteTensor mask or bool tensor must match the sizes of the + // corresponding dimensions in self + for (const auto j : c10::irange(index.dim())) { + int64_t srcIdx = static_cast(result.size() + j); + if (index.size(j) != self.size(srcIdx)) { + invalid_mask(self, srcIdx, index, j); + } + } + // Replace with nonzeros + auto nonzero = index.nonzero(); + for (const auto j : c10::irange(index.dim())) { + result.emplace_back(nonzero.select(1, j)); + } + } else { + result.emplace_back(index); + } + } + } + return result; +} + +static C10_UNUSED void checkIndexTensorTypes(IOptTensorListRef indices, bool allow_int=false) { + for (const auto& tensor : indices) { + if (tensor.has_value() && tensor->defined()) { + auto scalarType = tensor->scalar_type(); + if (allow_int) { + if (scalarType != kLong && scalarType != kByte && scalarType != kBool && scalarType != kInt) { + TORCH_CHECK_INDEX(false, "tensors used as indices must be long, int, byte or bool tensors"); + } + } else { + if (scalarType != kLong && scalarType != kByte && scalarType != kBool) { + TORCH_CHECK_INDEX(false, "tensors used as indices must be long, byte or bool tensors"); + } + } + } + } +} + +inline torch::List> toListOfOptionalTensors(ArrayRef list) { + torch::List> result; + result.reserve(list.size()); + for (const Tensor& a : list) { + result.push_back(a); + } + return result; +} + +inline torch::List> toListOfOptionalTensors(ArrayRef list) { + torch::List> result; + result.reserve(list.size()); + for (const IValue& a : list) { + result.push_back(a.isTensor() ? std::optional(a.toTensor()) : std::optional()); + } + return result; +} + +static C10_UNUSED bool hasContiguousSubspace(TensorList tl) { + // true if all the non-null tensors are adjacent + auto isDefined = [](const Tensor & tensor){ return tensor.defined(); }; + auto isNull = [](const Tensor & tensor){ return !tensor.defined(); }; + auto start = std::find_if(tl.begin(), tl.end(), isDefined); + auto stop = std::find_if(tl.rbegin(), tl.rend(), isDefined); + auto it = std::find_if(start, stop.base(), isNull); + return it == stop.base(); +} + + +// Transposes the tensor and indices together so that all the non-null indices +// index the first k dimensions of the tensor. Returns the transposed tensor +// and the reordered indices. For example: +// transposeToFront(tensor, {nullptr, a, nullptr, b}) +// returns +// tensor.permute([1, 3, 0, 2]), {a, b, nullptr, nullptr} +static C10_UNUSED std::tuple> +transposeToFront(const Tensor& self, TensorList indices) { + std::vector dims; + std::vector transposedIndices; + dims.reserve(self.dim()); + for (const auto i : c10::irange(self.dim())) { + if (indices[i].defined()) { + dims.push_back(i); + transposedIndices.emplace_back(indices[i]); + } + } + for (const auto i : c10::irange(self.dim())) { + if (!indices[i].defined()) { + dims.push_back(i); + transposedIndices.emplace_back(); + } + } + return std::make_tuple(self.permute(dims), std::move(transposedIndices)); +} + +inline std::tuple, std::vector> +transposeToFrontAndInvPerm(const Tensor& self, TensorList indices) { + std::vector dims; + std::vector invPerm; + std::vector transposedIndices; + dims.reserve(self.dim()); + invPerm.resize(self.dim()); + for (const auto i : c10::irange(self.dim())) { + if (indices[i].defined()) { + dims.push_back(i); + transposedIndices.emplace_back(indices[i]); + } + } + for (const auto i : c10::irange(self.dim())) { + if (!indices[i].defined()) { + dims.push_back(i); + transposedIndices.emplace_back(); + } + } + for (const auto i : c10::irange(self.dim())) { + invPerm[dims[i]] = i; + } + return std::make_tuple(self.permute(dims), std::move(transposedIndices), std::move(invPerm)); +} + +struct AdvancedIndex { + AdvancedIndex(const Tensor& src, TensorList indices); + + Tensor src; + std::vector indices; + DimVector indexed_sizes; + DimVector indexed_strides; + int64_t dims_before; + int64_t dims_after; +}; + + +} //namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Lerp.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Lerp.h new file mode 100644 index 0000000000000000000000000000000000000000..6db4f60b88ea1e19a1cc744f9f8c1e7b31c66a82 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Lerp.h @@ -0,0 +1,46 @@ +#pragma once + +#include +#include +#include +#include + +namespace at::native { + +template +C10_HOST_DEVICE C10_ALWAYS_INLINE bool is_lerp_weight_small(scalar_t weight) { + return std::abs(weight) < scalar_t(0.5); +} +template +C10_HOST_DEVICE C10_ALWAYS_INLINE bool is_lerp_weight_small(c10::complex weight) { + // Avoid the sqrt in abs(weight) + return (weight.real() * weight.real() + weight.imag() * weight.imag()) < scalar_t(0.25); +} + +template +C10_HOST_DEVICE C10_ALWAYS_INLINE scalar_t lerp(scalar_t self_, scalar_t end_, weight_t weight_) { + using opmath_t = at::opmath_type; + using opmath_weight_t = at::opmath_type; + + opmath_t self = self_; + opmath_t end = end_; + opmath_weight_t weight = weight_; + + // Conditional for better numeric. This has been discussed in + // https://github.com/pytorch/pytorch/pull/18871 + return is_lerp_weight_small(weight) + ? self + weight * (end - self) + : end - (end - self) * (opmath_t(1) - weight); +} + +using lerp_fn_scalar = void (*)( + at::TensorIteratorBase& iter, + const Scalar& weight); + +using lerp_fn_tensor = void (*)( + at::TensorIteratorBase& iter); + +DECLARE_DISPATCH(lerp_fn_scalar, lerp_kernel_scalar_weight); +DECLARE_DISPATCH(lerp_fn_tensor, lerp_kernel_tensor_weight); + +} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/LinearAlgebra.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/LinearAlgebra.h new file mode 100644 index 0000000000000000000000000000000000000000..c613020599c2e03aacb15f80119a42aaf9fa877b --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/LinearAlgebra.h @@ -0,0 +1,17 @@ +#pragma once + +#include + +namespace c10 { +class Scalar; +} + +namespace at { +struct TensorIterator; +} + +namespace at::native { + +using addr_fn = void (*)(TensorIterator &, const Scalar& beta, const Scalar& alpha); +DECLARE_DISPATCH(addr_fn, addr_stub); +} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/LinearAlgebraUtils.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/LinearAlgebraUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..48d9c31129654815c0542f63bf9d1d9cb9ab1dd3 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/LinearAlgebraUtils.h @@ -0,0 +1,623 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#include +#include +#include +#endif + +namespace at::native { + +inline c10::MaybeOwned expect_resolved_conj(const Tensor& tensor) { + if (tensor.is_conj()) { + return c10::MaybeOwned::owned(tensor.resolve_conj()); + } else { + return c10::MaybeOwned::borrowed(tensor); + } +} + +inline DimVector batched_matrix_contiguous_strides( + const IntArrayRef sizes, + const bool f_contig = false) { + // f_contig chooses between the strides of a batch of Fortran (F-contiguous) + // and C-contiguous matrices + auto strides = c10::contiguous_strides(sizes); + auto dim = strides.size(); + + if (f_contig && dim >= 2) { + // Fix the strides of the last two dimensions, so that we return + // C-contiguous batches of F-contiguous matrices. + strides[dim - 1] = std::max(sizes[dim - 2], static_cast(1)); + strides[dim - 2] = 1; + } + return strides; +} + +/* + * Clones a Tensor so that the following conditions hold: + * If we think of a Tensor of having size (B, M, N), where B is any number + * of batch dimensions, then: + * - Each (M, N) matrix is in column major form + * - Let Tensor P have size (B, M, N) and Q have size (B, M', N'). + * Then when laid out in memory, the M by N matrix starting at + * P.data_ptr()[B * M * N] is of the same corresponding batch as the M' by N' + * matrix starting at Q.data_ptr()[B * M' * N']. + */ +inline Tensor cloneBatchedColumnMajor(const Tensor& src) { + // If src is already in batched column major format, then + // this will be efficient (no reordering of the data will occur) + // because the first transpose will make the tensor contiguous, + // and cloning a contiguous tensor is fast. + auto result = src.mT().clone(at::MemoryFormat::Contiguous); + result.transpose_(-2, -1); + return result; +} + +/* + * contig chooses between C-contig (true) and F-contig (false) + */ +inline c10::MaybeOwned borrow_else_clone(const bool cond, const Tensor& borrow, const Tensor& clone, const bool contig) { + return cond ? c10::MaybeOwned::borrowed(borrow) + : c10::MaybeOwned::owned(contig ? clone.clone(MemoryFormat::Contiguous) + : cloneBatchedColumnMajor(clone)); +} + +/* + * This method is designed to be a faster alternative to + * `cloneBatchedColumnMajor` with some additional features, + * namely: + * 1. It uses `copy` instead of `clone` which could be much faster. + * 2. `nrows` parameter used to create inputs with the number of rows larger + * than the original input, which is required for some LAPACK/MAGMA methods. + * 3. `desired_batch_size` is used to create copies with the batch size + * which is either the original batch size of the input, or its larger + * broadcasted shape. + */ +inline Tensor copyBatchedColumnMajor(const Tensor& src, int64_t nrows = -1, + at::OptionalIntArrayRef desired_batch_sizes = std::nullopt) { + nrows = (nrows == -1) ? src.size(-2) : nrows; + auto copy_sizes = desired_batch_sizes.has_value() + ? desired_batch_sizes.value().vec() + : IntArrayRef(src.sizes().data(), src.dim() - 2).vec(); + copy_sizes.insert(copy_sizes.end(), {nrows, src.size(-1)}); + const auto copy_strides = batched_matrix_contiguous_strides(copy_sizes, /*f-contig*/true); + auto copy = at::empty_strided(copy_sizes, copy_strides, src.options()); + copy.narrow(-2, 0, src.size(-2)).copy_(src); + return copy; +} + +/* + * Given batches of matrices with arbitrary batch dim, + * computes the number of batches. + */ +inline int64_t batchCount(const Tensor& batched_matrices) { + int64_t result = 1; + for (int64_t i = 0; i < batched_matrices.ndimension() - 2; i++) { + result *= batched_matrices.size(i); + } + return result; +} + +// Computes the number of elements of a matrix in a batched matrix tensor +inline int64_t matrixStride(const Tensor& batched_matrices) { + return batched_matrices.size(-1) * batched_matrices.size(-2); +} + +// Validates input shapes for operations on batches of square matrices (inverse, cholesky, symeig, eig) +inline void checkIsMatrix(const Tensor& A, const char* const f_name, const char* const arg_name = "A") { + TORCH_CHECK(A.dim() >= 2, f_name, ": The input tensor ", arg_name, " must have at least 2 dimensions."); +} +inline void squareCheckInputs(const Tensor& self, const char* const f_name, const char* const arg_name = "A") { + checkIsMatrix(self, f_name, arg_name); + TORCH_CHECK(self.sym_size(-1) == self.sym_size(-2), + f_name, + ": ", arg_name, " must be batches of square matrices, " + "but they are ", self.sym_size(-2), " by ", self.sym_size(-1), " matrices"); +} + +inline void checkInputsSolver(const Tensor& A, + const Tensor& B, + const bool left, + const char* const f_name) { + squareCheckInputs(A, f_name, "A"); + checkIsMatrix(B, f_name, "B"); + TORCH_CHECK(left ? A.size(-2) == B.size(-2) : A.size(-1) == B.size(-1), + f_name, ": Incompatible shapes of A and B for the equation ", + left ? "AX = B" : "XA = B", + " (", A.size(-2), "x", A.size(-1), " and ", B.size(-2), "x", B.size(-1), ")"); +} + +inline bool is_row_or_column_contiguous(const Tensor& t) { + // This could be made more general, similar to how it's checked in matmul, which would allow to + // ellide the copy with strides such as (6, 12, 1, 3) or (3, 1, 9), but this is quite tricky. + // We choose to be conservative for simplicity + return t.is_contiguous() || t.transpose(-2, -1).is_contiguous(); +} + +inline TransposeType to_transpose_type(const bool contig, const bool conj) { + if (conj) { + if (contig) { TORCH_INTERNAL_ASSERT(false, "Invalid transpose type"); } + else { return TransposeType::ConjTranspose; } + } else { + if (contig) { return TransposeType::NoTranspose; } + else { return TransposeType::Transpose; } + } +} + + +// This function is designed to be used with linear algebra methods that minimize +// L(ax - b) = 0, where L is generally the identity map (`solve`, for example) +// or the L2 norm (`lstsq`). +// It is expected that `a` and `b` are contiguous tensors of column-major matrices +// (so that a.view({-1, a.size(-2), a.size(-1)}) succeeds, same for `b`), +// with the following additional properties: +// +// 1. a.dim() == b.dim() +// 2. a.shape[:-2] broadcasts over b.shape[:-2] +// 3. a.size(i) <= b.size(i) for i=0,..., a.dim() - 3 (only for batch dimensions) +// +// MAGMA/LAPACK modify tensor `a` in-place, and the main goal of this method +// is to be memory efficient, which means that if there exists an index i such that +// a.shape[i] < b.shape[i], 0 <= i <= a.dim() - 3, +// then instead of materializing copies of `a` in the broadcasted shape, we keep +// a buffer copy of `a` along with flags that check whether specific batch dimension +// indices for `a` were already accessed. If they were, we copy the data from the buffer +// into `a`. The number of copies does not exceed +// prod(max(a.shape[:-2], b.shape[:-2]) - a.shape[:-2] + 1) +// and this value is attained by tensors with non-empty batch dimensions. +// +// func_t `f` is a callable that is being supplied with +// scalar_t* a_working_ptr, scalar_t* b_working_ptr, int64_t a_linear_batch_idx. +// a_working_ptr and b_working_ptr can directly be passed to LAPACK/MAGMA routines, +// and a_linear_batch_idx is an index in the 3d representation which corresponds to +// the memory a_working_ptr points to, in other words: +// a_working_ptr == a.view({-1, a.size(-2), a.size(-1)}.select(0, a_linear_batch_idx).data_ptr(); +// a_linear_batch_idx is useful to store metadata related to `a`, such as, for example, +// its rank or singular values (see linalg_lstsq). +template +void batch_iterator_with_broadcasting(const Tensor& a, const Tensor& b, const func_t& f) { + IntArrayRef a_batch_sizes(a.sizes().data(), a.dim() - 2); + IntArrayRef b_batch_sizes(b.sizes().data(), b.dim() - 2); + + auto a_linear_batch_idx = at::arange(batchCount(a)).view(a_batch_sizes); + auto b_linear_batch_idx = at::arange(batchCount(b)).view(b_batch_sizes); + + TensorIterator iter = TensorIteratorConfig() + .set_check_mem_overlap(false) + .check_all_same_dtype(false) + .resize_outputs(false) + .add_output(b_linear_batch_idx) + .add_input(a_linear_batch_idx) + .build(); + + auto m = a.size(-2); + auto n = a.size(-1); + auto a_3d = a.view({batchCount(a), m, n}); + auto b_3d = b.view({batchCount(b), b.size(-2), b.size(-1)}); + + auto a_broadcasts_over_b = (a_batch_sizes != b_batch_sizes); + Tensor a_buffer, a_was_accessed, a_buffer_3d; + std::function check_if_copy_needed_for_a + = [](int64_t /*a_curr_linear_batch_idx*/){}; + if (a_broadcasts_over_b) { + a_buffer = at::empty_strided(a.sizes(), a.strides(), a.options()) + .copy_(a); + a_was_accessed = at::zeros(batchCount(a), at::kBool); + a_buffer_3d = a_buffer.view({batchCount(a), m, n}); + check_if_copy_needed_for_a = [&](int64_t a_curr_linear_batch_idx) { + auto* a_was_accessed_flag = a_was_accessed + .select(0, a_curr_linear_batch_idx) + .data_ptr(); + if (!(*a_was_accessed_flag)) { + *a_was_accessed_flag = true; + } + else { + a_3d.select(0, a_curr_linear_batch_idx) + .copy_(a_buffer_3d.select(0, a_curr_linear_batch_idx)); + } + }; + } + + auto loop = [&](char** data, const int64_t* strides, int64_t nelems) { + auto* b_batch_idx_ptr = data[0]; + auto* a_batch_idx_ptr = data[1]; + + for (const auto elem C10_UNUSED : c10::irange(nelems)) { + auto b_curr_linear_batch_idx = *reinterpret_cast(b_batch_idx_ptr); + auto a_curr_linear_batch_idx = *reinterpret_cast(a_batch_idx_ptr); + + check_if_copy_needed_for_a(a_curr_linear_batch_idx); + + auto* a_working_ptr = a_3d.select(0, a_curr_linear_batch_idx) + .data_ptr(); + auto* b_working_ptr = b_3d.select(0, b_curr_linear_batch_idx) + .data_ptr(); + f(a_working_ptr, b_working_ptr, a_curr_linear_batch_idx); + + b_batch_idx_ptr += strides[0]; + a_batch_idx_ptr += strides[1]; + } + }; + iter.serial_for_each(loop, {0, batchCount(b)}); +} + +// Returns the epsilon value for floating types except half +inline double _get_epsilon(const ScalarType& sc_type) { + switch (sc_type) { + case at::ScalarType::Float: + return static_cast(std::numeric_limits::epsilon()); + case at::ScalarType::Double: + return std::numeric_limits::epsilon(); + default: + AT_ERROR("This function doesn't handle types other than float and double"); + } +} + +// Validates input shapes and devices +// for linear solve methods (solve, cholesky_solve, lu_solve, triangular_solve) +inline void linearSolveCheckInputs(const Tensor& self, const Tensor& A, const char* name) { + TORCH_CHECK(self.device() == A.device(), + "Expected b and A to be on the same device, but found b on ", + self.device(), " and A on ", A.device(), " instead."); + + TORCH_CHECK(self.scalar_type() == A.scalar_type(), + "Expected b and A to have the same dtype, but found b of type ", + self.scalar_type(), " and A of type ", A.scalar_type(), " instead."); + + TORCH_CHECK(A.size(-1) == A.size(-2), + "A must be batches of square matrices, " + "but they are ", A.size(-2), " by ", A.size(-1), " matrices"); + + TORCH_CHECK(A.size(-1) == self.size(-2), + "Incompatible matrix sizes for ", name, ": each A " + "matrix is ", A.size(-1), " by ", A.size(-1), + " but each b matrix is ", self.size(-2), " by ", self.size(-1)); +} + +inline void checkFloatingOrComplex(const Tensor& t, const char* const f_name, const bool allow_low_precision_dtypes=true) { + auto dtype = t.scalar_type(); + TORCH_CHECK((at::isFloatingType(dtype) || at::isComplexType(dtype)), + f_name, ": Expected a floating point or complex tensor as input. Got ", dtype); + if (!allow_low_precision_dtypes) { + TORCH_CHECK(dtype == kFloat || dtype == kDouble || dtype == kComplexFloat || dtype == kComplexDouble, + f_name, ": Low precision dtypes not supported. Got ", dtype); + } +} + + +// Checks if all the Tensors in a TensorList are of the same dimensions +inline void checkAllSameDim(TensorList tensors, int64_t dim) { + for (auto &t : tensors) { + TORCH_CHECK(t.dim() == dim, "Tensor dimension is ", t.dim(), ", expected ", dim, " instead."); + } +} + +inline std::tuple, std::vector> _linalg_broadcast_batch_dims(const Tensor& arg1, const Tensor& arg2) { + // broadcast the batch dimensions of arg1 and arg2. + IntArrayRef arg1_batch_sizes(arg1.sizes().data(), arg1.ndimension() - 2); + IntArrayRef arg2_batch_sizes(arg2.sizes().data(), arg2.ndimension() - 2); + std::vector expand_batch_portion = infer_size(arg1_batch_sizes, arg2_batch_sizes); + + std::vector arg1_expand_size({expand_batch_portion}); + arg1_expand_size.insert(arg1_expand_size.end(), { arg1.size(-2), arg1.size(-1) }); + + std::vector arg2_expand_size({expand_batch_portion}); + arg2_expand_size.insert(arg2_expand_size.end(), { arg2.size(-2), arg2.size(-1) }); + return std::make_tuple(std::move(arg1_expand_size), std::move(arg2_expand_size)); +} + +inline std::tuple _linalg_broadcast_batch_dims(const Tensor& arg1, const Tensor& arg2, const char* name) { + // If there's no name we assume we don't want to check the errors + if (name != nullptr) { + linearSolveCheckInputs(arg1, arg2, name); + } + + auto [arg1_expand_size, arg2_expand_size] = at::native::_linalg_broadcast_batch_dims(arg1, arg2); + + auto arg1_broadcasted = arg1_expand_size == arg1.sizes() ? arg1 : arg1.expand(arg1_expand_size); + auto arg2_broadcasted = arg2_expand_size == arg2.sizes() ? arg2 : arg2.expand(arg2_expand_size); + return std::make_tuple(arg1_broadcasted, arg2_broadcasted); +} + +inline std::vector broadcast_batch_size(const Tensor& t1, const Tensor& t2, int64_t n_batch_dims) { + IntArrayRef t1_batch_sizes(t1.sizes().data(), n_batch_dims); + IntArrayRef t2_batch_sizes(t2.sizes().data(), n_batch_dims); + auto broadcasted_batch_sizes = infer_size(t1_batch_sizes, t2_batch_sizes); + return broadcasted_batch_sizes; +} + +// Return a permutation with the given axes moved to the end. +inline Tensor _move_to_end(const Tensor& self, IntArrayRef axes) { + const std::vector a = axes.vec(); + const int64_t ndim = self.ndimension(); + std::vector perm; + + for (const auto i : c10::irange(ndim)) { + auto it = std::find(a.begin(), a.end(), i); + if (it == a.end()) { + perm.push_back(i); + } + } + for (auto i : a) { + perm.push_back(i); + } + + TORCH_CHECK((int64_t)perm.size() == ndim, + "duplicate or invalid axis in 'dim' argument for tensor with ndim==", ndim); + + return self.permute(perm); +} + +// parse the "mode" param in linalg_qr: return a tuple of bools (compute_q, reduced) +inline std::tuple _parse_qr_mode(c10::string_view mode) { + bool compute_q; + bool reduced; + if (mode == "reduced") { + compute_q = true; + reduced = true; + } else if (mode == "complete") { + compute_q = true; + reduced = false; + } else if (mode == "r") { + compute_q = false; + reduced = true; // this is actually irrelevant in this mode + } else { + TORCH_CHECK(false, "qr received unrecognized mode '", mode, + "' but expected one of 'reduced' (default), 'r', or 'complete'"); + } + return std::make_tuple(compute_q, reduced); +} + +// Function to compute sizes, strides and the extra columns for the Q matrix in the QR Decomposition +inline std::tuple _compute_geometry_for_Q( + const Tensor& input, + bool reduced) { + int64_t m = input.size(-2), n = input.size(-1); + int64_t n_columns_q; + + // We need to compute the required size of Q based on the `reduced` option + DimVector q_sizes(input.sizes()); + if (!reduced && m > n) { + q_sizes[input.dim() - 1] = m; + n_columns_q = m; + } else { + q_sizes[input.dim() - 1] = n; + n_columns_q = std::min(m, n); + } + auto q_strides = batched_matrix_contiguous_strides(q_sizes, /*f-contig*/true); + return std::make_tuple(q_sizes, q_strides, n_columns_q); +} + +inline bool svd_uses_cusolver(const Tensor& A) { + // if cusolver is available, it is used unconditionally + return A.is_cuda() + && at::globalContext().hasCuSOLVER() + && at::globalContext().linalgPreferredBackend() != at::LinalgBackend::Magma; +} + + +// Function used instead of .to so that the original strides are retained +// .to doesn't retain strides and make the output tensor contiguous +inline Tensor same_stride_to(const Tensor& original_tensor, const at::TensorOptions& options) { + auto strided_to = at::empty_strided(original_tensor.sizes(), + original_tensor.strides(), + options); + strided_to.copy_(original_tensor); + return strided_to; +} + +// Creates a dimension permutation array that can be given to `at::permute()`, which will shift +// the two specified dimensions to the end of a tensor, without changing the order of +// the other dimensions. `dim1` will be placed at the very end, and `dim0` will be +// placed just to the left of it. +// +// For instance, given a 4-D tensor, dimensions 1 and 3 can be shifted to the end by +// calling `create_dim_backshift_permutation(1, 3, 4)`. The resulting vector will +// be `vec(0, 2, 1, 3)`. +inline std::vector create_dim_backshift_permutation(int64_t dim0, int64_t dim1, int64_t ndim) { + TORCH_CHECK( + (dim0 != dim1) && (dim0 < ndim) && (dim0 >= 0) && (dim1 < ndim) && (dim1 >= 0), + "duplicate or invalid dimensions"); + std::vector permutation(ndim); + int64_t cur_permuted_dim = 0; + for (const auto dim_ind : c10::irange(ndim)) { + if ((dim_ind != dim0) && (dim_ind != dim1)) { + permutation[cur_permuted_dim++] = dim_ind; + } + } + permutation[cur_permuted_dim++] = dim0; + permutation[cur_permuted_dim] = dim1; + return permutation; +} + +// Creates a dimension permutation array that can be given to `at::permute()`, which +// will reverse a given permutation. +// The reverse permutation array is created by swapping the indices and their +// associated values from the given permutation array. +inline std::vector create_reverse_permutation(std::vector permutation) { + int64_t ndim = permutation.size(); + std::vector reverse_permutation(ndim); + for (const auto dim_ind : c10::irange(ndim)) { + reverse_permutation[permutation[dim_ind]] = dim_ind; + } + return reverse_permutation; +} + +// Compute R-work array size for MAGMA/LAPACK cgesdd/zgesdd +// See https://github.com/Reference-LAPACK/lapack/blob/122506cd8b6ce050a200920c3d4c0b153b150fd8/SRC/cgesdd.f#L186 +inline int64_t computeLRWorkDim(const char jobz, int64_t m, int64_t n) { + auto mn = std::min(m, n); + auto mx = std::max(m, n); + if (jobz == 'N') { +#ifdef __APPLE__ + // According to `vecLib.framework/Headers/clapack.h` Accelerate.framework is based on LAPACK 3.2.1 + return 7 * mn; +#else + // These setting is valid for on LAPACK 3.6+ + return 5 * mn; +#endif + } + if (mx > 10 * mn) { + return 5 * mn * mn + 5 * mn; + } + return std::max(5 * mn * mn + 5 * mn, 2 * mx * mn + 2 * mn * mn + mn); +} + +// This function checks whether the uplo argument input is valid +// Allowed strings are "u", "U", "l", "L" +inline void checkUplo(const c10::string_view uplo) { + // To use std::toupper safely with plain chars (or signed chars), the argument should first be converted to unsigned char + char uplo_uppercase = static_cast(std::toupper(static_cast(uplo[0]))); + TORCH_CHECK(uplo.size() == 1 && (uplo_uppercase == 'U' || uplo_uppercase == 'L'), + "Expected UPLO argument to be 'L' or 'U', but got ", uplo); +} + +inline void checkSameDevice(const std::string& fn_name, Tensor result, Tensor input, const std::string& result_name = "result") { + TORCH_CHECK( + result.device() == input.device(), + fn_name, + ": Expected ", result_name, " and input tensors to be on the same device, but got ", + result_name, " on ", result.device(), " and input on ", input.device()); +} + +// Check the dtype of result and input tensors (for _out variants). +// Most linear algebra functions have the same dtype for input and output +// (either floating or complex type input), so we can check whether input's dtype can be casted to result's dtype. +// According to https://github.com/pytorch/pytorch/wiki/Developer-FAQ#how-does-out-work-in-pytorch +// c10::canCast is used for checking the "safe copy" dtype requirements. +inline void checkLinalgCompatibleDtype(const std::string& fn_name, Tensor result, Tensor input, const std::string& result_name = "result") { + bool can_cast = c10::canCast(input.scalar_type(), result.scalar_type()); + TORCH_CHECK( + can_cast, + fn_name, + ": Expected ", result_name, " to be safely castable from ", input.scalar_type(), " dtype, but got ", + result_name, " with dtype ", result.scalar_type()); +} + +// Alternatively, we can check whether the specific expected output type (result_type) can be safely casted to out tensor dtype (out_type) +inline void checkLinalgCompatibleDtype(const std::string& fn_name, ScalarType out_type, ScalarType result_type, const std::string& out_name = "result") { + bool can_cast = c10::canCast(result_type, out_type); + TORCH_CHECK( + can_cast, + fn_name, + ": Expected ", out_name, " to be safely castable from ", result_type, " dtype, but got ", + out_name, " with dtype ", out_type); +} + +inline void checkNotComplexTolerance(const Tensor& tol, const c10::string_view f_name, const c10::string_view tol_name) { + TORCH_CHECK(!at::isComplexType(tol.scalar_type()), + f_name, ": ", tol_name, " tensor of complex type is not supported. Got ", tol.scalar_type()); +} + +/* + Two types of 'other' tensors are supported when solving + a system of linear equations matmul(input, x) = other: + * 1-dimensional (1D) tensor or batch of 1D tensors (vector case) + * 2-dimensional (2D) tensor or batch of 2D tensors (matrix case). + The original torch.solve supported only the matrix case, while NumPy works for both cases. + For the batched input we need to be able to distinguish them. + Let input.shape = (batch_dimensions, m, n), then 'other' is of vector type if other.shape == (batch_dimensions, m). + This rule is compatible with NumPy, see https://github.com/numpy/numpy/blob/v1.20.0/numpy/linalg/linalg.py#L384-L389 +*/ +inline bool linalg_solve_is_vector_rhs(const Tensor& input, const Tensor& other) { + auto expected_batched_rhs_shape = SymIntArrayRef(input.sym_sizes().data(), input.dim() - 1); // input.shape[:-1] + bool vector_case = other.dim() == 1 || (input.dim() - 1 == other.dim() && other.sym_sizes().equals(expected_batched_rhs_shape)); + return vector_case; +} + +/* + Computes linear indices for a tensor with original_shape to access its elements like it was a materialized broadcast tensor. +*/ +inline Tensor get_linear_indices(int64_t numel, IntArrayRef original_shape, IntArrayRef broadcast_shape) { + TensorOptions options = at::TensorOptions().dtype(at::kLong).device(at::kCPU); + return at::arange(numel, options).view(original_shape).broadcast_to(broadcast_shape).contiguous(); +} + +class BroadcastLinearIndices { + private: + Tensor linear_indices_; + bool is_broadcasting_; + + public: + BroadcastLinearIndices( + int64_t numel, + IntArrayRef original_shape, + IntArrayRef broadcast_shape) : is_broadcasting_(!original_shape.equals(broadcast_shape)) { + // The assumption is that the broadcast_shape is a materialized broadcast + // shape of the original_shape. We need to compute the linear indices + // compatible with the original_shape to access the elements in the original + // tensor corresponding to the broadcast tensor. + if (is_broadcasting_) { + linear_indices_ = + get_linear_indices(numel, original_shape, broadcast_shape); + } + } + int64_t operator()(int64_t broadcast_linear_index) { + return is_broadcasting_ + ? linear_indices_.data_ptr()[broadcast_linear_index] + : broadcast_linear_index; + } +}; + +inline bool is_blas_compatible_column_major_order(const Tensor& input) { + IntArrayRef input_strides = input.strides(); + IntArrayRef input_sizes = input.sizes(); + auto ndim = input.dim(); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(ndim >= 2); + if (ndim > 3) { + return input.transpose(-2, -1).is_contiguous(); + } + auto leading_dimension = input_strides[ndim - 1]; + auto rows = input_sizes[ndim - 2]; + bool batch_stride_compatible = true; + if (ndim == 3) { + auto cols = input_sizes[ndim - 1]; + batch_stride_compatible = + input_strides[ndim - 3] >= leading_dimension * cols; + } + return (input_strides[ndim - 2] == 1) && + (leading_dimension >= std::max(1, rows)) && + batch_stride_compatible; +} + +inline bool is_blas_compatible_row_major_order(const Tensor& input) { + IntArrayRef input_strides = input.strides(); + IntArrayRef input_sizes = input.sizes(); + auto ndim = input.dim(); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(ndim >= 2); + if (ndim > 3) { + return input.is_contiguous(); + } + auto leading_dimension = input_strides[ndim - 2]; + auto cols = input_sizes[ndim - 1]; + bool batch_stride_compatible = true; + if (ndim == 3) { + auto rows = input_sizes[ndim - 2]; + batch_stride_compatible = + input_strides[ndim - 3] >= leading_dimension * rows; + } + return (input_strides[ndim - 1] == 1) && + (leading_dimension >= std::max(1, cols)) && + batch_stride_compatible; +} + +} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/MathBitsFallback.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/MathBitsFallback.h new file mode 100644 index 0000000000000000000000000000000000000000..de2296634e04511497827128c4483c5e75c6a674 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/MathBitsFallback.h @@ -0,0 +1,157 @@ +#include +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include + +#include +#endif + +namespace at::native { +// This fallback should only be used for operations that are self inverse and have a corresponding tensor +// bit (internally implemented using DispatchKey) to maintain the state on tensor using tensor bit. +// Currently there are two tensor bits that trigger this fallback: conjugate bit and negative bit. +// Conjugate bit is set on a tensor when `.conj()` is called and neg bit is set on a tensor when `.conj().imag` is called. + +// NOTE: To use this fallback, `clone` and `copy_` should fully understand and be able to correctly handle the semantic of your math bit. +struct MathOpFallback { + MathOpFallback(DispatchKey key_, string op_name_) : key(key_), op_name(std::move(op_name_)) {} + virtual bool is_bit_set(const Tensor&) = 0; + void fallback_impl(const c10::OperatorHandle& op, DispatchKeySet dispatch_keys, torch::jit::Stack* stack) { + /* + Situations to handle: + 1. Out-of-place operation. Easy: materialize all inputs and + call it a day. + 2. Inplace operation. Desugar x.add_(2) into x.conj_().add_(2).conj_(). + Materialize other inputs as in (1). + 3. out= operation. Desugar add(x, 2, out=y) into y.copy_(add(x, 2)) + Materialize other inputs as in (1). + + It is important to be able to tell if we READ from an argument and if we + WRITE to an argument. Conservative approach is to assume that we always + READ from an argument, but in out= operations you can skip + conjugating inputs on entry that never get used. In the current schema we + can't easily tell if the operation is in in-place or out= operation. + + Note: + 1. Mutable tensorlists containing tensors whose math bit set to true are disallowed. + 2. Mutable tensors with math bit set to true are unconditionally cloned to ensure + correct behavior in the case when the mutable tensor shares memory with non mutable arguments. + + If we were to in-place resolve the math bit for mutable inputs, then the non-mutable inputs sharing partial or full memory + with these mutable inputs would read into wrong values in the following cases: + 1. Non mutable inputs have their math bit set to false. + 2. Math bit for mutable input(s) is resolved before the non mutable inputs (with bit set to true and sharing memory + with one or more mutable arg(s)) are cloned. + At the end, the final value of the mutable arguments from the stack are copied into the original input mutable tensor inputs. + */ + const auto& arguments = op.schema().arguments(); + const auto num_arguments = arguments.size(); + const auto stack_start = stack->size() - num_arguments; + + std::optional is_write; + for (const auto i : c10::irange(num_arguments)) { + // Three possible states: + // 1. alias_info has no value --> out-of-place operation + // 2. alias_info does have a value, alias_info->is_write=True --> in-place or out= operation + // 3. alias_info does have a value, alias_info->is_write=False --> view operation + const AliasInfo* alias_info = arguments[i].alias_info(); + if (alias_info != nullptr) { + if (is_write.has_value()) { + TORCH_CHECK(*is_write == alias_info->isWrite(), + "Unsupported operator for ", op_name, " fallback: ", op.schema().name(), + op_name, " fallback doesn't work for operators with a mix " + "mutable and non-mutable inputs that alias with outputs, " + "this must be implemented manually. " + "If you got this error on a core op, please report a bug to PyTorch."); + } else { + is_write = alias_info->isWrite(); + } + } + } + + if (is_write.has_value() && !*is_write) { + // We assume that view operators automatically handle the math bit + // correctly by propagating the dispatch key in key_set. + // This is not necessarily always right, so you should test these cases. + op.redispatchBoxed(dispatch_keys & c10::DispatchKeySet(DispatchKeySet::FULL_AFTER, key), stack); + return; + } + + // Mutable inputs with math bit set to True and their clones + std::vector> mutable_inputs_with_their_clones; + for (const auto i : c10::irange(num_arguments)) { + auto& ivalue = (*stack)[stack_start + i]; + if (!(ivalue.isTensor() || ivalue.isTensorList())) { + continue; + } + const auto& argument = arguments[i]; + bool mut_arg = false; + if (argument.alias_info()) { + // Was already tested by is_write loop above + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(argument.alias_info()->isWrite()); + mut_arg = true; + } + if (ivalue.isTensor()) { + if (!is_bit_set(ivalue.toTensor())) { + continue; + } + auto tensor = std::move(ivalue).toTensor(); + auto resolved_tensor = at::clone(tensor); + if (mut_arg) { + TORCH_CHECK(mutable_inputs_with_their_clones.empty(), op_name, " fallback does not support operators with more than one mutable tensors with ", + op_name, "bit set to true."); + mutable_inputs_with_their_clones.emplace_back(std::move(tensor), resolved_tensor); + } + (*stack)[stack_start + i] = std::move(resolved_tensor); + } else if (ivalue.isTensorList()) { + auto tensors = std::move(ivalue).toTensorList(); + for(const auto j : c10::irange(tensors.size())) { + const auto& tensor = tensors[j]; + if (!is_bit_set(tensor)) { + continue; + } + TORCH_CHECK(!mut_arg, " fallback doesn't currently support mutable TensorLists with ", + op_name, " inputs. Please materialize all the ", op_name, " input tensor(s) in the mutable TensorList inputs before calling ", + op.schema().name()); + tensors[j] = at::clone(tensor); + } + (*stack)[stack_start + i] = std::move(tensors); + } + } + + op.redispatchBoxed(dispatch_keys & c10::DispatchKeySet(DispatchKeySet::FULL_AFTER, key), stack); + + TORCH_INTERNAL_ASSERT(mutable_inputs_with_their_clones.size() <= 1); + + for (std::pair mut_tensors: mutable_inputs_with_their_clones) { + auto& mutable_input = mut_tensors.first; + auto& cloned_mutable_input = mut_tensors.second; + auto& ivalue = (*stack)[stack_start]; + auto returned_output = std::move(ivalue).toTensor(); + + // sanity check to ensure that the tensor in stack aliases the cloned_mutable_input + TORCH_INTERNAL_ASSERT(cloned_mutable_input.is_same(returned_output)); + + // necessary for out= arg + at::native::resize_output(mutable_input, returned_output.sizes()); + + mutable_input.copy_(returned_output); + (*stack)[stack_start] = std::move(mutable_input); + } + } + + virtual ~MathOpFallback() = default; + + DispatchKey key; + string op_name; +}; + +} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/MaxPooling.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/MaxPooling.h new file mode 100644 index 0000000000000000000000000000000000000000..7044b6ee3dc21abd1197850f3117e407418a13c6 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/MaxPooling.h @@ -0,0 +1,97 @@ +#pragma once + +#include +#include +#include +#include + +namespace at::native { + +inline void check_max_pool1d( + const Tensor& self, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + bool ceil_mode) { + + TORCH_CHECK( + self.dim() == 2 || self.dim() == 3, + "max_pool1d() Expected 2D or 3D input tensor, but got ", self.sym_sizes()); + TORCH_CHECK( + kernel_size.size() == 1, + "max_pool1d() kernel_size must be an int, list of ints or tuple of ints of size 1 but got size ", + kernel_size.size()); + TORCH_CHECK( + stride.empty() || stride.size() == 1, + "max_pool1d() stride must be None, an int, list of ints, or tuple of ints of size 1 but got size ", + stride.size()); + TORCH_CHECK( + padding.size() == 1, + "max_pool1d() padding must be an int, list of ints, or tuple of ints of size 1 but got size ", + padding.size()); + TORCH_CHECK( + dilation.size() == 1, + "max_pool1d() dilation must be an int, list of ints or tuple of ints of size 1 but got size ", + dilation.size()); + + // If stride=None then set it to kernel_size + if (stride.empty()) { + stride = kernel_size; + } + + TORCH_CHECK( + kernel_size[0] > 0, + "max_pool1d() kernel_size must be greater than zero, but got ", + kernel_size[0]); + TORCH_CHECK( + stride[0] > 0, "max_pool1d() stride must be greater than zero, but got ", stride[0]); + TORCH_CHECK( + padding[0] >= 0, "max_pool1d() padding must be non-negative, but got ", padding[0]); + TORCH_CHECK( + padding[0] <= kernel_size[0] / 2, + "max_pool1d() padding should be at most half of kernel size, but got padding=", + padding[0], + " and kernel_size=", + kernel_size[0]); + TORCH_CHECK( + dilation[0] > 0, "max_pool1d() dilation must be greater than zero, but got ", dilation[0]); + + const int64_t OW = pooling_output_shape(self.sym_size(-1).guard_int(__FILE__, __LINE__), kernel_size[0], padding[0], stride[0], dilation[0], ceil_mode); + TORCH_CHECK(OW > 0, "max_pool1d() Invalid computed output size: ", OW); +} + +// TODO(Heitor) Template by dimension +struct PoolingParams1D { + int64_t NB; // Number of batches + int64_t NC; // Number of channels + int64_t IW; // Input width + int64_t OW; // Output width + int64_t KW; // Kernel width + int64_t SJ; // Column stride + int64_t PJ; // Column padding + int64_t DJ; // Column dilation + + // Return index of input element for the given kernel and output index + inline int64_t index(int64_t kj, int64_t oj) const { + return oj * SJ + kj * DJ - PJ; + } + + // Return index of first output within bounds for this kernel index + inline int64_t valid_output_start(int64_t kj) const { + int64_t ij = index(kj, 0);; + return ij < 0 ? at::divup(-ij, SJ) : 0; + } + + // Return index one past last output within bounds for this kernel index + inline int64_t valid_output_end(int64_t kj) const { + int64_t ij = index(kj, OW - 1); + return ij >= IW ? OW - at::divup(ij - (IW - 1), SJ) : OW; + } +}; + +using pooling_fn = void (*)(Tensor&, const Tensor&, const PoolingParams1D&); + +DECLARE_DISPATCH(pooling_fn, max_pool1d_stub); + +} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/NonSymbolicBC.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/NonSymbolicBC.h new file mode 100644 index 0000000000000000000000000000000000000000..fbcfd3a13f9c3c71a5291fdad2e6b95b186cf681 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/NonSymbolicBC.h @@ -0,0 +1,26 @@ +#pragma once +#include +#include +#include + +namespace at::native { +// This file contains non-symbolic signatures for ops that we have sym-intified the signature of. +// However, in certain cases (such as static runtime), we call the native versions of the ops directly. +// In those cases, we will duplicate the signature here with non-symbolic ints, and also duplicate the C++ implementation. +TORCH_API at::Tensor reshape(const at::Tensor& self, at::IntArrayRef proposed_shape); +TORCH_API at::Tensor narrow(const at::Tensor& self, int64_t dim, int64_t start, int64_t length); +TORCH_API at::Tensor _sparse_coo_tensor_unsafe(const at::Tensor & indices, const at::Tensor & values, at::IntArrayRef size, std::optional dtype=std::nullopt, std::optional layout=std::nullopt, std::optional device=std::nullopt, std::optional pin_memory=std::nullopt, std::optional is_coalesced=std::nullopt); +TORCH_API at::Tensor nll_loss(const at::Tensor & self, const at::Tensor & target, const std::optional& weight_opt, int64_t reduction, int64_t ignore_index); +TORCH_API at::Tensor nll_loss2d(const at::Tensor & self, const at::Tensor & target, const std::optional& weight_opt, int64_t reduction, int64_t ignore_index); +// The below ops don't get a duplicated C++ implementation. +// They are backward ops, which make them very unlikely to be called directly +// by external code (at::native::trace_backward). +// They get their own declaration for BC purposes however. +TORCH_API at::Tensor _embedding_bag_backward(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, bool sparse, const std::optional & per_sample_weights, int64_t padding_idx=-1); +TORCH_API at::Tensor _embedding_bag_sparse_backward(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, const at::Tensor & bag_size, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, const std::optional & per_sample_weights, int64_t padding_idx=-1); +TORCH_API at::Tensor value_selecting_reduction_backward(const at::Tensor & grad, int64_t dim, const at::Tensor & indices, at::IntArrayRef sizes, bool keepdim); +TORCH_API at::Tensor trace_backward(const at::Tensor & grad, at::IntArrayRef sizes); +TORCH_API at::Tensor index_select_backward(const at::Tensor & grad, at::IntArrayRef self_sizes, int64_t dim, const at::Tensor & index); +TORCH_API at::Tensor select(const at::Tensor& self, int64_t dim, int64_t index); +TORCH_API std::vector tensor_split(const Tensor& self, IntArrayRef indices, int64_t dim); +} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/RangeFactories.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/RangeFactories.h new file mode 100644 index 0000000000000000000000000000000000000000..df3b43856e0980841cace5500a3e009e1501c8a0 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/RangeFactories.h @@ -0,0 +1,12 @@ +#include +#include + +namespace at { +struct TensorIterator; + +namespace native { + +DECLARE_DISPATCH(void(*)(TensorIterator&, const Scalar&, const Scalar&, const Scalar&), arange_stub); +DECLARE_DISPATCH(void(*)(TensorIterator&, const Scalar&, const Scalar&, int64_t), linspace_stub); + +}} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ReduceAllOps.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ReduceAllOps.h new file mode 100644 index 0000000000000000000000000000000000000000..b3ece0328fe35f5b32faae79d8a66020f5045b1e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ReduceAllOps.h @@ -0,0 +1,16 @@ +#pragma once + +#include + +namespace at { +class Tensor; +} + +namespace at::native { + +using reduce_all_fn = void (*)(Tensor & result, const Tensor & self); +using reduce_min_max_fn = void (*)(Tensor & max_result, Tensor & min_result, const Tensor & self); +DECLARE_DISPATCH(reduce_all_fn, min_all_stub); +DECLARE_DISPATCH(reduce_all_fn, max_all_stub); + +} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ReductionType.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ReductionType.h new file mode 100644 index 0000000000000000000000000000000000000000..fe28f21cb750139f583af1951cbeaa77fa0a7ed0 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ReductionType.h @@ -0,0 +1,40 @@ +#pragma once + +#include + +namespace at::native { + +enum class ReductionType {MAX, MEAN, MIN, SUM, PROD}; + +inline ReductionType get_reduction_enum(const c10::string_view& reduce) { + if (reduce == "max" || reduce == "amax") { + return ReductionType::MAX; + } else if (reduce == "mean") { + return ReductionType::MEAN; + } else if (reduce == "min" || reduce == "amin") { + return ReductionType::MIN; + } else if (reduce == "sum") { + return ReductionType::SUM; + } else if (reduce == "prod") { + return ReductionType::PROD; + } else { + TORCH_CHECK(false, "reduce argument must be either sum, prod, mean, amax or amin, got ", reduce); + } +} + +// used for `scatter_reduce`, old options for BC. +inline ReductionType get_operator_enum(const c10::string_view reduce, bool use_new_options) { + if (use_new_options) { + return get_reduction_enum(reduce); + } else { + if (reduce == "add") { + return ReductionType::SUM; + } else if (reduce == "multiply") { + return ReductionType::PROD; + } else { + TORCH_CHECK(false, "reduce argument must be either add or multiply.") + } + } +} + +} // at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Repeat.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Repeat.h new file mode 100644 index 0000000000000000000000000000000000000000..b8d6f92553a4c9bde2f90574a23108fd9667a26c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Repeat.h @@ -0,0 +1,48 @@ +#pragma once + +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#endif + +namespace at::native { + +template < + typename index_t, + void compute(const index_t*, const int64_t*, index_t*, int64_t, int64_t)> +static inline Tensor repeat_interleave_common( + const Tensor& repeats, + std::optional output_size) { + TORCH_CHECK( + repeats.dim() == 1, "repeat_interleave only accept 1D vector as repeat"); + TORCH_CHECK( + repeats.scalar_type() == at::kLong || repeats.scalar_type() == at::kInt, + "repeats has to be Long or Int tensor"); + if (repeats.size(0) == 0) { + return at::empty_like(repeats, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + } + Tensor repeats_ = repeats.contiguous(); + Tensor cumsum = repeats.cumsum(0); + int64_t total = 0; + if (output_size.has_value()) { + total = output_size.value(); + } else { + total = cumsum[-1].item(); + TORCH_CHECK( + (repeats >= 0).all().item(), "repeats can not be negative"); + } + + Tensor result = at::empty({total}, repeats.options()); + const index_t* repeat_ptr = repeats_.const_data_ptr(); + const int64_t* cumsum_ptr = cumsum.const_data_ptr(); + index_t* result_ptr = result.data_ptr(); + compute(repeat_ptr, cumsum_ptr, result_ptr, repeats.size(0), total); + return result; +} + +} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Resize.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Resize.h new file mode 100644 index 0000000000000000000000000000000000000000..951a08e84c7793d33b76203d0dfb738fa91056e0 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Resize.h @@ -0,0 +1,173 @@ +#pragma once + +#include +#include +#include +#include + +#include + +#include + + +namespace at::native { + +// TODO: make all operations that resize given outputs use this function +// for consistency and maintainability. +// Some operations like `cat` might not be able to make the use of +// resize_output directly. For more details to understand how it works in `cat`, +// see https://github.com/pytorch/pytorch/pull/62560#discussion_r687363362 +// Resizes outputs +// Functions accepting output tensors, like with the "out" kwarg, should +// call this function to handle resizing their output tensor. +// Issues a warning if the output tensor has one or more elements and +// needs resizing +// NOTE: In the future the warning will become an error +// Returns a bool saying whether or not the resize actually happened or not +TORCH_API bool resize_output(const Tensor& output, IntArrayRef shape); +// WARNING: Do NOT call this directly. If you are resizing an output and want +// to support dynamic shapes call at::resize__symint and resize_output_check_symint. +// For more details, see: https://github.com/pytorch/pytorch/pull/111530/files#r1365845272 +TORCH_API bool resize_output_symint(const Tensor& output, SymIntArrayRef shape); + +// Utility for resize_output +// Returns a bool saying resize should happen or not and +// raises a warning if resizing for one or more elements +TORCH_API bool resize_output_check(const Tensor& output, IntArrayRef shape); +TORCH_API bool resize_output_check_symint(const Tensor& output, SymIntArrayRef shape); + +TORCH_API void resize_bytes_cpu(StorageImpl* storage, size_t size_bytes); +TORCH_API void resize_bytes_meta(StorageImpl* storage, c10::SymInt size_bytes); +TORCH_API void resize_bytes_nocuda(const Storage& storage, const c10::SymInt& size_bytes); + +inline void maybe_resize_storage_cpu(TensorImpl* self, size_t new_size_bytes) { + // It does not make sense to try to resize a storage + // to hold 0 elements, and this can break + // if storage_offset is positive but + // new_size is 0, so just bail in that case + // (same comment is in cuda/Resize.h) + if (self->numel() == 0) { + return; + } + + const Storage& storage = self->unsafe_storage(); + if (!storage) { + auto new_storage = c10::make_intrusive( + StorageImpl::use_byte_size_t(), + new_size_bytes, + c10::GetCPUAllocator(), + true); + self->set_storage_keep_dtype(std::move(new_storage)); + } else if (new_size_bytes > storage.nbytes()) { + resize_bytes_cpu(storage.unsafeGetStorageImpl(), new_size_bytes); + } +} + +TORCH_API TensorImpl* resize_impl_cpu_( + TensorImpl* self, + IntArrayRef size, + at::OptionalIntArrayRef stride, + bool resize_storage = true); + +template +T maybe_convert_symint(c10::SymInt) = delete; + +template <> +inline c10::SymInt maybe_convert_symint(c10::SymInt x) { return x; } + +template <> +inline int64_t maybe_convert_symint(c10::SymInt x) { return x.guard_int(__FILE__, __LINE__); } + +template +inline void checkInBoundsForStorage( + ArrayRef size, + ArrayRef stride, + T storage_offset, + const caffe2::TypeMeta& data_type, + const Storage& new_storage) { + T storage_size_bytes = + at::detail::computeStorageNbytes(size, stride, data_type.itemsize()); + T storage_offset_bytes = storage_offset * data_type.itemsize(); + if (storage_size_bytes == 0) { + // NB: (a tensor with arbitrary 0 dims)'s storage can have any numel. + return; + } + T new_storage_size_bytes = maybe_convert_symint(new_storage.sym_nbytes()); + TORCH_CHECK( + storage_size_bytes + storage_offset_bytes <= new_storage_size_bytes, + "setStorage: sizes ", + size, + ", strides ", + stride, + "," + " storage offset ", + storage_offset, + ", and itemsize ", + data_type.itemsize(), + " requiring a storage size of ", + storage_size_bytes + storage_offset_bytes, + " are out of bounds for storage of size ", + new_storage_size_bytes); +} + +template +inline void checkSetStorage(Tensor& result, Storage storage, T storage_offset, + ArrayRef size, ArrayRef stride) { + // FIXME: stride should be optional + if (stride.data()) { + TORCH_CHECK(size.size() == stride.size(), "unequal size length (", size.size(), + ") and stride length (", stride.size(), ")"); + } + +#ifdef DEBUG + TORCH_CHECK(size.size() <= INT_MAX, "size length (", size.size(), ") greater than INT_MAX"); +#endif + + // storage: note this can't be replaced with result.set_(storage) as the semantics of that + // function is to set the tensor size to be equal to the size of the storage. + if (!result.storage().is_alias_of(storage)) { + // Caffe2 might have tensors whose storages are null, but we + // don't allow it in PyTorch. + TORCH_INTERNAL_ASSERT(storage); + TORCH_INTERNAL_ASSERT(result.storage()); + + // We used to allow this, but this breaks device caching. + // Let's put an actual error message for this one. + TORCH_CHECK(result.storage().device() == storage.device(), + "Attempted to set the storage of a tensor on device \"", result.storage().device(), + "\" to a storage on different device \"", storage.device(), + "\". This is no longer allowed; the devices must match."); + result.unsafeGetTensorImpl()->set_storage_keep_dtype(std::move(storage)); + } + + // storageOffset + TORCH_CHECK(storage_offset >= 0, "Tensor: invalid storage offset ", storage_offset); +} + +/** + * Set self's sizes, strides, and storage_offset. + * (size, stride, storage_offset) must be in bounds for self's storage. + */ +template +inline void setStrided( + const Tensor& self, + ArrayRef size, + ArrayRef stride, + T storage_offset) { + TORCH_CHECK(size.size() == stride.size(), "mismatch in length of strides and shape"); + for (const auto& val : stride) { + TORCH_CHECK(val >= 0, + "as_strided: Negative strides are not supported at the moment, " + "got strides: ", stride); + } + + auto* self_ = self.unsafeGetTensorImpl(); + checkInBoundsForStorage( + size, stride, storage_offset, self_->dtype(), self_->storage()); + + /* storage offset */ + TORCH_CHECK(storage_offset >= 0, "Tensor: invalid storage offset ", storage_offset); + self_->set_sizes_and_strides(size, stride, std::make_optional(storage_offset)); +} + +} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ResizeCommon.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ResizeCommon.h new file mode 100644 index 0000000000000000000000000000000000000000..cea2612a22127d00532da761cf8e773164db5ca4 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ResizeCommon.h @@ -0,0 +1,75 @@ +#pragma once + +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + +namespace at::native { + +template +inline T storage_size_for(ArrayRef size, ArrayRef stride) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(size.size() == stride.size(), + "storage_size_for(size, stride) requires that size and stride ", + "have the same size as a precondition."); + T storage_size = 1; + for (const auto dim : c10::irange(size.size())) { + if (size[dim] == 0) { + storage_size = 0; + break; + } + storage_size += (size[dim] - 1) * stride[dim]; + } + return storage_size; +} + +inline const Tensor& resize_named_tensor_( + const Tensor& self, + IntArrayRef size, + std::optional optional_memory_format) { + TORCH_INTERNAL_ASSERT(self.has_names()); + TORCH_CHECK( + self.sizes() == size, + "Cannot resize named tensor with resize_ or resize_as_ (tried to resize " + "Tensor", + self.names(), + " with size ", + self.sizes(), + " to ", + size, + "). This may be caused by passing a named tensor ", + "as an `out=` argument; please ensure that the sizes are the same. "); + TORCH_CHECK( + !optional_memory_format.has_value(), + "Unsupported memory format for named tensor resize ", + optional_memory_format.value()); + return self; +} + +// For deterministic output, fill new elements that were added after a storage +// resize with NaN or MAX_INT. `old_storage_nbytes` is the size of the storage +// before the resize happened. +inline const Tensor& fill_resize_deterministic_(const Tensor& tensor, int64_t old_storage_nbytes) { + const at::Storage& storage = tensor.unsafeGetTensorImpl()->unsafe_storage(); + int64_t new_storage_nbytes = storage.nbytes(); + int64_t old_storage_numel = old_storage_nbytes / tensor.itemsize(); + int64_t new_storage_numel = new_storage_nbytes / tensor.itemsize(); + if (new_storage_numel > old_storage_numel) { + at::Tensor tensor_view = at::empty({}, at::TensorOptions().dtype(tensor.scalar_type()).device(tensor.device())); + tensor_view.set_( + storage, + /*storage_offset=*/old_storage_numel, + /*size=*/{new_storage_numel - old_storage_numel}, + /*stride=*/{1}); + at::native::fill_empty_deterministic_(tensor_view); + } + return tensor; +} + +} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ScatterGatherChecks.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ScatterGatherChecks.h new file mode 100644 index 0000000000000000000000000000000000000000..94816f5cfb6c7368fc616fe23868740e8bc242f6 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ScatterGatherChecks.h @@ -0,0 +1,128 @@ +#pragma once + +#include +#include +#include +#include + +namespace at::native { + +namespace { + +// checks whether index.dtype == int64 +// and self.dtype == src.dtype if src is a Tensor +inline void scatter_gather_dtype_check( + const std::string& method_name, + const Tensor& self, + const Tensor& index, + const std::optional& src_opt = std::nullopt +) { + if (index.numel() != 0) { + TORCH_CHECK( + index.scalar_type() == at::ScalarType::Long, + method_name, "(): Expected dtype int64 for index" + ); + } + + if (src_opt.has_value()) { + const auto& src = src_opt.value(); + TORCH_CHECK( + self.scalar_type() == src.scalar_type(), + method_name, "(): Expected self.dtype to be equal to src.dtype" + ); + } +} + +// Used for `gather`-like methods +// Note: self means the input tensor here +// Test: +// 1. index.size(d) <= self.size(d) for all d != dim +// 2. index.dim() == self.dim() +inline void gather_shape_check(const Tensor& self, int64_t dim, + const Tensor& index +) { + auto self_dims = ensure_nonempty_dim(self.dim()); + TORCH_CHECK(self_dims == ensure_nonempty_dim(index.dim()), + "Index tensor must have the same number of dimensions as input tensor" + ); + + for (const auto i : c10::irange(self_dims)) { + if (i != dim) { + TORCH_CHECK( + ensure_nonempty_size(index, i) <= ensure_nonempty_size(self, i), + "Size does not match at dimension ", i, + " expected index ", index.sizes(), + " to be smaller than self ", self.sizes(), + " apart from dimension ", dim + ); + } + } +} + +// Used for `scatter` and `scatter_add` +// Tests: +// 1. index.size(d) <= self.size(d) for all d != dim +// 2. index.size(d) <= src.size(d) for all d if src is a Tensor +// 3. index.dim() == self.dim() == src.dim() +inline void scatter_shape_check( + const Tensor& self, int64_t dim, const Tensor& index, + const std::optional& src_opt = std::nullopt +) { + if (index.numel() == 0) return; + TORCH_CHECK( + ensure_nonempty_dim(self.dim()) == ensure_nonempty_dim(index.dim()), + "Index tensor must have the same number of dimensions as self tensor" + ); + + bool is_wrong_shape = false; + int64_t self_dims = ensure_nonempty_dim(self.dim()); + + // Check: index.size(d) <= self.size(d) for all d != dim + for (const auto d : c10::irange(self_dims)) { + int64_t index_d_size = ensure_nonempty_size(index, d); + if (d == dim) continue; + if (index_d_size > ensure_nonempty_size(self, d)) { + is_wrong_shape = true; + break; + } + } + + // Check: index.size(d) <= src.size(d) for all d if src is Tensor + if (!is_wrong_shape && src_opt.has_value()) { + const auto& src = src_opt.value(); + for (const auto d : c10::irange(self_dims)) { + int64_t index_d_size = ensure_nonempty_size(index, d); + if (index_d_size > ensure_nonempty_size(src, d)) { + is_wrong_shape = true; + break; + } + } + } + + if (src_opt.has_value()) { + const auto& src = src_opt.value(); + + TORCH_CHECK( + ensure_nonempty_dim(src.dim()) == ensure_nonempty_dim(index.dim()), + "Index tensor must have the same number of dimensions as src tensor" + ); + + TORCH_CHECK(!is_wrong_shape, + "Expected index ", index.sizes(), + " to be smaller than self ", self.sizes(), + " apart from dimension ", dim, + " and to be smaller size than src ", src.sizes() + ); + } + else { + TORCH_CHECK(!is_wrong_shape, + "Expected index ", index.sizes(), + " to be smaller than self ", self.sizes(), + " apart from dimension ", dim + ); + } +} + +} // anonymous namespace + +} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/SharedReduceOps.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/SharedReduceOps.h new file mode 100644 index 0000000000000000000000000000000000000000..5b7167ee93dd29edd028a76040a6f1937c2de35d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/SharedReduceOps.h @@ -0,0 +1,544 @@ +#pragma once +// Please note that this file is +// used across both CPU and GPU. + +#include +#include +#include +#include +#include +#if defined(__CUDACC__) +#include +#include +#elif defined(__HIPCC__) +#include +#include +#endif +#if defined(__CUDACC__) || defined(__HIPCC__) +#include +#else +#include +#define device_sqrt std::sqrt +#endif +#if defined(__CUDACC__) || defined(__HIPCC__) +template +inline C10_DEVICE scalar_t max_propagate_nan(scalar_t a, scalar_t b) { +#if defined(__HIPCC__) + // TODO: remove this special case for HIP when issue is fixed: + // https://github.com/ROCm-Developer-Tools/HIP/issues/2209 + scalar_t max = at::_isnan(a) ? a : (at::_isnan(b) ? b : std::max(a, b)); +#else + scalar_t max = at::_isnan(b) ? b : std::max(a, b); +#endif + return max; +} +template +inline C10_DEVICE scalar_t min_propagate_nan(scalar_t a, scalar_t b) { +#if defined(__HIPCC__) + // TODO: remove this special case for HIP when issue is fixed: + // https://github.com/ROCm-Developer-Tools/HIP/issues/2209 + scalar_t min = at::_isnan(a) ? a : (at::_isnan(b) ? b : std::min(a, b)); +#else + scalar_t min = at::_isnan(b) ? b : std::min(a, b); +#endif + return min; +} +#define MAX(X, Y) max_propagate_nan(X,Y) +#define MIN(X, Y) min_propagate_nan(X,Y) +#else +#include +#define MAX(X, Y) max_impl(X,Y) +#define MIN(X, Y) min_impl(X,Y) +#endif + +// ROCM hcc doesn't work well with using std:: in kernel functions +#if defined(__CUDA_ARCH__) +#include +#define compat_pow c10::cuda::compat::pow +#elif defined(__HIPCC__) +#include +#define compat_pow c10::hip::compat::pow +#else +#define compat_pow std::pow +#endif + +namespace at { namespace native { + +namespace detail { + +#if defined(__CUDACC__) || defined(__HIPCC__) +template using pair = thrust::pair; +#else +template using pair = std::pair; +#endif + +} // namespace detail + +template +struct WelfordData { + scalar_t mean; + scalar_t m2; + index_t n; + scalar_t nf; + + C10_HOST_DEVICE WelfordData() : mean(0), m2(0), n(0), nf(0) {} + + C10_HOST_DEVICE WelfordData( + scalar_t mean, + scalar_t m2, + index_t n, + scalar_t nf) + : mean(mean), m2(m2), n(n), nf(nf) {} +}; + + +template +struct WelfordOps { + acc_scalar_t correction; + bool take_sqrt; + public: + using acc_t = WelfordData; + inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, index_t /*idx*/) const { + // We accumulate n in index_t to avoid cumulative rounding error, but still + // need nf for use in combine where int32 may overflow. + index_t new_n = acc.n + 1; + acc_scalar_t new_nf = static_cast(new_n); + acc_scalar_t delta = data - acc.mean; + acc_scalar_t new_mean = acc.mean + delta / new_nf; + acc_scalar_t new_delta = data - new_mean; + return { + new_mean, + acc.m2 + delta * new_delta, + new_n, + new_nf, + }; + } + inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const { + if (a.nf == 0) { + return b; + } + if (b.nf == 0) { + return a; + } + acc_scalar_t delta = b.mean - a.mean; + acc_scalar_t new_count = a.nf + b.nf; + acc_scalar_t nb_over_n = b.nf / new_count; + return { + a.mean + delta * nb_over_n, + a.m2 + b.m2 + delta * delta * a.nf * nb_over_n, + // setting acc.n as -1 since acc.n might not be able to represent the count + // correctly within its range, setting it to -1 to avoid confusion + -1, + new_count + }; + } + inline C10_DEVICE res_t project(acc_t acc) const __ubsan_ignore_float_divide_by_zero__ { + const auto mean = static_cast(acc.mean); + const auto divisor = acc.nf > correction ? acc.nf - correction : 0; + const auto var = acc.m2 / divisor; + res_t results(take_sqrt ? device_sqrt(var) : var, mean); + return results; + } + + static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } + +#if defined(__CUDACC__) || defined(__HIPCC__) + inline __device__ acc_t warp_shfl_down(acc_t acc, int offset) const { + return { + WARP_SHFL_DOWN(acc.mean, offset) + , WARP_SHFL_DOWN(acc.m2, offset) + , WARP_SHFL_DOWN(acc.n, offset) + , WARP_SHFL_DOWN(acc.nf, offset) + }; + } +#endif + C10_HOST_DEVICE WelfordOps(acc_scalar_t correction, bool take_sqrt) + : correction(correction), take_sqrt(take_sqrt) {} +}; + +template +struct MeanOps { + factor_t factor; + + inline C10_DEVICE acc_t reduce(acc_t a, scalar_t b, int64_t /*idx*/) const { + return combine(a, static_cast(b)); + } + + inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const { + return a + b; + } + + inline C10_DEVICE out_t project(acc_t a) const { + return a * factor; + } + + static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } + +#if defined(__CUDACC__) || defined(__HIPCC__) + inline C10_DEVICE acc_t warp_shfl_down(acc_t data, int offset) const { + return WARP_SHFL_DOWN(data, offset); + } +#endif + + MeanOps(factor_t factor): factor(factor) { + } +}; + +// This accumulator template is used to calculate the minimum absolute value of +// a set of numbers. +// `scalar_t` is the type of the input and `acc_t` is the type of the accumulated +// value. These types differ for complex number input support. +template +struct AbsMinOps { + + inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const { + return MIN(acc, static_cast(std::abs(data))); + } + + inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const { + return MIN(a, b); + } + + inline C10_DEVICE out_t project(acc_t a) const { + return a; + } + + static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } + +#if defined(__CUDACC__) || defined(__HIPCC__) + inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const { + return WARP_SHFL_DOWN(acc, offset); + } +#endif +}; + +// This accumulator template is used to calculate the maximum absolute value of +// a set of numbers. +// `scalar_t` is the type of the input and `acc_t` is the type of the accumulated +// value. These types differ for complex number input support. +template +struct AbsMaxOps { + inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const { + return MAX(acc, static_cast(std::abs(data))); + } + + inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const { + return MAX(a, b); + } + + inline C10_DEVICE out_t project(acc_t a) const { + return a; + } + + static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } + +#if defined(__CUDACC__) || defined(__HIPCC__) + inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const { + return WARP_SHFL_DOWN(acc, offset); + } +#endif +}; + +// This accumulator template is used to calculate the norm of the absolute value +// of a set of numbers. +// `scalar_t` is the type of the input and `acc_t` is the type of the accumulated +// value. These types differ for complex number input support. +template +struct NormOps { + acc_t norm_; + + inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const { + return acc + compat_pow(static_cast(std::abs(data)), norm_); + } + + inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const { + return a + b; + } + + inline C10_DEVICE out_t project(acc_t a) const { + return compat_pow(a, static_cast(1.0) / norm_); + } + + static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } + +#if defined(__CUDACC__) || defined(__HIPCC__) + inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const { + return WARP_SHFL_DOWN(acc, offset); + } +#endif + + NormOps(acc_t norm_): norm_(norm_) { + } +}; + +// This accumulator template is used to calculate the order zero norm of the +// absolute value of a set of numbers. +// `scalar_t` is the type of the input and `acc_t` is the type of the accumulated +// value. These types differ for complex number input support. +template +struct NormZeroOps { + inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const { + return acc + (data == static_cast(0) ? static_cast(0) : static_cast(1)); + } + + inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const { + return a + b; + } + + inline C10_DEVICE out_t project(acc_t a) const { + return a; + } + + static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } + + +#if defined(__CUDACC__) || defined(__HIPCC__) + inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const { + return WARP_SHFL_DOWN(acc, offset); + } +#endif +}; + +// This accumulator template is used to calculate the order one norm of the +// absolute value of a set of numbers. +// `scalar_t` is the type of the input and `acc_t` is the type of the accumulated +// value. These types differ for complex number input support. +template +struct NormOneOps { + inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const { + return acc + static_cast(std::abs(data)); + } + + inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const { + return a + b; + } + + inline C10_DEVICE out_t project(acc_t a) const { + return a; + } + + static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } + +#if defined(__CUDACC__) || defined(__HIPCC__) + inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const { + return WARP_SHFL_DOWN(acc, offset); + } +#endif +}; + + +template +struct AbsSwitch {}; + +template +inline C10_DEVICE acc_t abs_if_complex(scalar_t data, AbsSwitch) { + return static_cast(data); +} + +template +inline C10_DEVICE acc_t abs_if_complex(std::complex data, AbsSwitch) { + return static_cast(std::abs(data)); +} + +template +inline C10_DEVICE acc_t abs_if_complex(c10::complex data, AbsSwitch) { + return static_cast(std::abs(data)); +} + +// This accumulator template is used to calculate the order two norm of the +// absolute value of a set of numbers. +// `scalar_t` is the type of the input and `acc_t` is the type of the accumulated +// value. These types differ for complex number input support. +template +struct NormTwoOps { + inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const { + acc_t data_ = abs_if_complex(data, AbsSwitch()); + return acc + data_ * data_; + } + + inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const { + return a + b; + } + + inline C10_DEVICE out_t project(acc_t a) const { + return device_sqrt(a); + } + + static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } + +#if defined(__CUDACC__) || defined(__HIPCC__) + inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const { + return WARP_SHFL_DOWN(acc, offset); + } +#endif +}; + +template +struct NanSumOps { + inline C10_DEVICE acc_t reduce(acc_t a, data_t b, int64_t /*idx*/) const { + return a + (at::_isnan(b) ? acc_t{0.} : acc_t{b}); + } + + inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const { + return a + b; + } + + inline C10_DEVICE data_t project(acc_t a) const { + return data_t{a}; + } + + static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } + +#if defined(__CUDACC__) || defined(__HIPCC__) + inline C10_DEVICE acc_t warp_shfl_down(acc_t data, int offset) const { + return WARP_SHFL_DOWN(data, offset); + } +#endif +}; + +namespace detail { + +template +struct LessOrNan { + C10_DEVICE bool operator () (scalar_t a, scalar_t b, int64_t idx_a, int64_t idx_b) const { + // If (a == b), then choose the one with lower idx, else min(a, b) + if (at::_isnan(a)) { + if (at::_isnan(b)) { + return idx_a < idx_b; + } + return true; + } + return (a == b) ? idx_a < idx_b : (a < b); + } +}; + +template +struct GreaterOrNan { + C10_DEVICE bool operator () (scalar_t a, scalar_t b, int64_t idx_a, int64_t idx_b) const { + // If (a == b), then choose the one with lower idx, else max(a, b) + if (at::_isnan(a)) { + if (at::_isnan(b)) { + return idx_a < idx_b; + } + return true; + } + return (a == b) ? idx_a < idx_b : (a > b); + } +}; + +template +struct MinMaxReductionOps { + using scalar_t = typename binary_function_traits::arg1_t; + using index_t = int64_t; + using arg_t = detail::pair; + + static C10_DEVICE arg_t project(arg_t arg) { + return arg; + } + + static C10_DEVICE arg_t reduce(arg_t arg, scalar_t val, int64_t idx) { + return comp_t{}(arg.first, val, arg.second, idx) ? arg : arg_t(val, idx); + } + + static C10_DEVICE arg_t combine(arg_t a, arg_t b) { + return comp_t{}(a.first, b.first, a.second, b.second) ? a : b; + } + + static C10_DEVICE arg_t translate_idx(arg_t a, int64_t base_idx) { + return {a.first, a.second + base_idx}; + } + +#if defined(__CUDACC__) || defined(__HIPCC__) + static C10_DEVICE arg_t warp_shfl_down(arg_t arg, int offset) { + return arg_t(WARP_SHFL_DOWN(arg.first, offset), + WARP_SHFL_DOWN(arg.second, offset)); + } +#endif +}; + +template +struct ArgReductionOps : public MinMaxReductionOps { + using typename MinMaxReductionOps::scalar_t; + using typename MinMaxReductionOps::index_t; + using typename MinMaxReductionOps::arg_t; + + static C10_DEVICE index_t project(arg_t arg) { + return arg.second; + } +}; + +} // namespace detail + +template +struct ArgMaxOps : + public detail::ArgReductionOps> { +}; + +template +struct ArgMinOps : + public detail::ArgReductionOps> { +}; + +template +struct MinOps : + public detail::MinMaxReductionOps> { +}; + +template +struct MaxOps : + public detail::MinMaxReductionOps> { +}; + +template +struct MinMaxOps { + using acc_t = detail::pair; + inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, index_t /*idx*/) const { + return combine(acc, {data, data}); + } + + inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const { + auto min_val = (at::_isnan(a.first) || a.first < b.first) ? a.first : b.first; + auto max_val = (at::_isnan(a.second) || a.second > b.second) ? a.second : b.second; + + return {min_val, max_val}; + } + + inline C10_DEVICE acc_t project(acc_t acc) const { + return acc; + } + + static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } + +#if defined(__CUDACC__) || defined(__HIPCC__) + inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const { + return { + WARP_SHFL_DOWN(acc.first, offset), WARP_SHFL_DOWN(acc.second, offset) + }; + } +#endif +}; + +}} // namespace at::native + +#undef MAX +#undef MIN diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Sorting.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Sorting.h new file mode 100644 index 0000000000000000000000000000000000000000..1ab806645fbf144fe9df5f41497f4ca753e8573f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Sorting.h @@ -0,0 +1,28 @@ +#pragma once + +#include +#include + +namespace at { +class TensorBase; +} + +namespace at::native { + +enum class QUANTILE_INTERPOLATION_MODE : uint8_t { + LINEAR, + LOWER, + HIGHER, + MIDPOINT, + NEAREST +}; + +using sort_fn = void(*)(const TensorBase&, const TensorBase&, const TensorBase&, int64_t, bool, bool); +using topk_fn = void(*)(const TensorBase&, const TensorBase&, const TensorBase&, int64_t, int64_t, bool, bool); + +DECLARE_DISPATCH(sort_fn, sort_stub); +DECLARE_DISPATCH(topk_fn, topk_stub); + +void _fill_indices(const TensorBase &indices, int64_t dim); + +} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/SortingUtils.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/SortingUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..a0f9cfa8bf0ce554d198b7e55079ed13e1798175 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/SortingUtils.h @@ -0,0 +1,88 @@ +#pragma once + +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + +namespace at::native { + +// ensure we get good values and indices for kthvalue, mode +// this will always be with the reducing dim as 1-d +inline void _reduction_with_indices_allocate_or_resize_output( + Tensor& values, + Tensor& indices, + const Tensor& self, + int64_t dim_, + bool keepdim) { + int64_t dim = maybe_wrap_dim(dim_, self.dim(), /*wrap_scalar=*/true); + auto result_sizes = self.sizes().vec(); + if (!result_sizes.empty()) { + result_sizes[dim] = 1; + } + if (values.defined()) { + TORCH_CHECK( + self.options().type_equal(values.options()), + "output values must be of same type as input"); + if (!keepdim && values.dim() == self.dim() - 1) { + // unsqueeze to preserve passed in noncontiguous tensor in resize + values.unsqueeze_(dim); + } + resize_output(values, result_sizes); + } else { + values = at::empty(result_sizes, self.options()); + } + if (indices.defined()) { + TORCH_CHECK( + indices.dtype() == kLong, "output indices must be of scalar type Long"); + TORCH_CHECK( + indices.device() == self.device(), + "output indices must be on same device as input"); + if (!keepdim && indices.dim() == self.dim() - 1) { + // unsqueeze to preserve passed in noncontiguous tensor in resize + indices.unsqueeze_(dim); + } + resize_output(indices, result_sizes); + } else { + indices = at::empty(result_sizes, self.options().dtype(kLong)); + } +} + +// ensure we get good values and indices for topk +inline void _allocate_or_resize_output_with_indices( + Tensor& values, + Tensor& indices, + const Tensor& self, + int64_t dim_, + int64_t k) { + int64_t dim = maybe_wrap_dim(dim_, self.dim(), /*wrap_scalar=*/true); + auto result_sizes = self.sizes().vec(); + if (!result_sizes.empty()) { + result_sizes[dim] = k; + } + if (values.defined()) { + TORCH_CHECK( + self.options().type_equal(values.options()), + "output values must be of same type as input"); + values.resize_(result_sizes); + } else { + values = at::empty(result_sizes, self.options()); + } + if (indices.defined()) { + TORCH_CHECK( + indices.dtype() == kLong, "output indices must be of scalar type Long"); + TORCH_CHECK( + indices.device() == self.device(), + "output indices must be on same device as input"); + indices.resize_(result_sizes); + } else { + indices = at::empty(result_sizes, self.options().dtype(kLong)); + } +} + +} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/SparseTensorUtils.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/SparseTensorUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..3953ca0d17b57f2547bafd2d08bec577b83b713e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/SparseTensorUtils.h @@ -0,0 +1,190 @@ +#pragma once + +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#endif + +namespace at::sparse { + +// Just for documentary purposes +using SparseTensor = Tensor; +using SparseType = Type; + +// This is an internal utility function for getting at the SparseTensorImpl, +// so that we can write sparse tensor specific accessors for special fields +// in SparseTensor. You should only use this for writing low level +// setters/getters for SparseTensorImpl fields; otherwise, you should use +// the low level setters/getters that were implemented using this. +// +// This may be called repeatedly, so make sure it's pretty cheap. +inline SparseTensorImpl* get_sparse_impl(const SparseTensor& self) { + TORCH_INTERNAL_ASSERT( + self.is_sparse(), "_internal_get_SparseTensorImpl: not a sparse tensor"); + return static_cast(self.unsafeGetTensorImpl()); +} + +// Takes indices and values and directly puts them into the sparse tensor, no +// copy. This used to be called THSTensor_(_move) +inline void alias_into_sparse( + const SparseTensor& self, + const Tensor& indices, + const Tensor& values) { + get_sparse_impl(self)->set_indices_and_values_unsafe(indices, values); +} + +// Take indices and values and makes a (data) copy of them to put into the +// sparse indices/values. This used to be called THSTensor_(_set) +inline void copy_into_sparse( + const SparseTensor& self, + const Tensor& indices, + const Tensor& values, + bool non_blocking) { + alias_into_sparse( + self, + indices.to(self._indices().options(), non_blocking, /*copy=*/true), + values.to(self._values().options(), non_blocking, /*copy=*/true)); +} + +// TODO: put this into the public API +inline bool is_same_tensor(const Tensor& lhs, const Tensor& rhs) { + return lhs.unsafeGetTensorImpl() == rhs.unsafeGetTensorImpl(); +} + +inline bool is_same_density(const SparseTensor& self, const SparseTensor& src) { + return self.sparse_dim() == src.sparse_dim() && + self.dense_dim() == src.dense_dim(); +} + +// Give us a new values tensor, with the same dimensionality +// as 'values' but with a new number of non-zero elements. +// TODO: Expose this for real in ATen, some day? +// NB: Doesn't preserve data. +inline Tensor new_values_with_size_of(const Tensor& values, int64_t nnz) { + std::vector size = values.sizes().vec(); + size[0] = nnz; + return at::empty(size, values.options()); +} + +// NOTE [ Flatten Sparse Indices ] +// This helper function flattens a sparse indices tensor (a Tensor) into a 1D +// indices tensor. E.g., +// input = [[2, 4, 0], +// [3, 1, 10]] +// full_size = [2, 12] +// output = [ 2 * 12 + 3, 4 * 12 + 1, 0 * 12 + 10 ] = [27, 49, 10] +// +// In other words, assuming that each `indices[i, :]` is a valid index to a +// tensor `t` of shape `full_size`. This returns the corresponding indices to +// the flattened tensor `t.reshape( prod(full_size[:indices.size(0)]), -1 )`. +// if forceClone is true, the result will forced to be a clone of self. +// if force_clone is true, the result will forced to be a clone of self. +TORCH_API Tensor flatten_indices( + const Tensor& indices, + IntArrayRef full_size, + bool force_clone = false); + +// Flatten sparse tensor's indices from nD to 1D, similar to NOTE [ Flatten +// Sparse Indices ], except this one allows partial flatten: only flatten on +// specified dims. Note that the flatten indices might be uncoalesced if +// dims_to_flatten.size() < sparse_dim. Also if input indices is already +// coalesced, the flattened indices will also be sorted. +// +// args: +// indices: sparse tensor indices +// sizes: sparse tensor sizes +// dims_to_flatten: a list of dim index to flatten +// +// Ex1: +// indices = [[2, 4, 0], +// [3, 1, 3]] +// sizes = [2, 12] +// dims_to_flatten = [0, 1] +// new_indices = [ 2 * 12 + 3, 4 * 12 + 1, 0 * 12 + 3 ] = [27, 49, 3] +// +// Ex2: +// dims_to_flatten = [1] +// new_indices = [ 3, 1, 3 ] # uncoalesced +TORCH_API Tensor flatten_indices_by_dims( + const Tensor& indices, + const IntArrayRef& sizes, + const IntArrayRef& dims_to_flatten); + +// Find the CSR representation for a row `indices` from the COO format +TORCH_API Tensor coo_to_csr(const int64_t* indices, int64_t dim, int64_t nnz); + +TORCH_API Tensor zeros_like_with_indices(const Tensor& t); + +template +class TensorGeometryHolder { + using geometry_holder_t = std::array; + + public: + explicit TensorGeometryHolder( + IntArrayRef sizes, + IntArrayRef strides, + TensorOptions options = {}) { + std::copy(sizes.begin(), sizes.end(), t_sizes.begin()); + std::copy(strides.begin(), strides.end(), t_strides.begin()); + } + + explicit TensorGeometryHolder(const Tensor& t) + : TensorGeometryHolder(t.sizes(), t.strides()) {} + + auto operator*() const { + return std::make_tuple(t_sizes, t_strides); + } + + private: + geometry_holder_t t_sizes; + geometry_holder_t t_strides; +}; + +template <> +class TensorGeometryHolder<0> { + using geometry_holder_t = Tensor; + + public: + explicit TensorGeometryHolder( + IntArrayRef sizes, + IntArrayRef strides, + TensorOptions options) { + const int64_t t_ndims = sizes.size(); + const auto cpu_options = TensorOptions(options).dtype(kLong).device(kCPU); + Tensor t_sizes_and_strides_cpu = at::empty({2, t_ndims}, cpu_options); + t_sizes_and_strides_cpu.select(0, 0).copy_(at::tensor(sizes, cpu_options)); + t_sizes_and_strides_cpu.select(0, 1).copy_( + at::tensor(strides, cpu_options)); + const Tensor t_sizes_and_strides = + t_sizes_and_strides_cpu.to(options.device()); + t_sizes = t_sizes_and_strides.select(0, 0); + t_strides = t_sizes_and_strides.select(0, 1); + } + + explicit TensorGeometryHolder(const Tensor& t) + : TensorGeometryHolder(t.sizes(), t.strides(), t.options()) {} + + auto operator*() const { + return std::make_tuple( + t_sizes.template data_ptr(), + t_strides.template data_ptr()); + } + + private: + geometry_holder_t t_sizes; + geometry_holder_t t_strides; +}; + +// Return all indices of a tensor with the given shape. +// +// full_coo_indices(shape) is equivalent to +// torch.ones(shape).nonzero().transpose(-2, -1) but much faster. +TORCH_API Tensor full_coo_indices(IntArrayRef sizes, TensorOptions options); + +} // namespace at::sparse diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorAdvancedIndexing.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorAdvancedIndexing.h new file mode 100644 index 0000000000000000000000000000000000000000..7b02b4201ffaa175b58edeb8288b667b393dc07d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorAdvancedIndexing.h @@ -0,0 +1,49 @@ +#pragma once + +// Indexing tensors by tensors + +#include +#include +#include +#include + +namespace at { +struct TensorIterator; +} + +namespace at::native { + +using index_put_with_sort_fn = void(*)(Tensor &, const c10::List> &, const Tensor &, bool accumulate, bool unsafe); +using index_put_with_sort_quantized_fn = void(*)(Tensor& self, const c10::List>& indices, const Tensor& value, double scale, int zero_point, bool unsafe); +using gather_fn = void (*)(const Tensor & result, const Tensor & self, int64_t dim, const Tensor & index); +using scatter_fn = void(*)(const Tensor& self, int64_t dim, const Tensor& index, const Tensor& src); +using scatter_fill_fn = void(*)(const Tensor& self, int64_t dim, const Tensor& index, const Scalar& src); +using scatter_add_fn = void(*)(const Tensor& self, int64_t dim, const Tensor& index, const Tensor& src); +using scatter_reduce_fn = void(*)(const Tensor& self, const int64_t dim, const Tensor& index, + const Tensor& src, const ReductionType& reduce); +using scatter_scalar_reduce_fn = void(*)(const Tensor& self, const int64_t dim, const Tensor& index, + const Scalar& value, const ReductionType& reduce); +using scatter_reduce_two_fn = void(*)(const Tensor& self, const int64_t dim, const Tensor& index, + const Tensor& src, const ReductionType& reduce); + +DECLARE_DISPATCH(index_put_with_sort_fn, index_put_with_sort_stub); +DECLARE_DISPATCH(index_put_with_sort_quantized_fn, index_put_with_sort_quantized_stub); +DECLARE_DISPATCH(gather_fn, gather_stub); +DECLARE_DISPATCH(scatter_fn, scatter_stub); +DECLARE_DISPATCH(scatter_fill_fn, scatter_fill_stub); +DECLARE_DISPATCH(scatter_add_fn, scatter_add_stub); +DECLARE_DISPATCH(scatter_reduce_fn, scatter_reduce_stub); +DECLARE_DISPATCH(scatter_scalar_reduce_fn, scatter_scalar_reduce_stub); +DECLARE_DISPATCH(scatter_reduce_two_fn, scatter_reduce_two_stub); + +TORCH_API Tensor& index_out(Tensor& result, const Tensor & self, const c10::List>& indices); + +using scatter_add_expanded_index_fn = void(*)(const Tensor&, const Tensor&, const Tensor&); +using scatter_reduce_expanded_index_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, const ReductionType& reduce, bool); +using gather_expanded_index_fn = void (*)(const Tensor&, const Tensor&, const Tensor&); + +DECLARE_DISPATCH(scatter_add_expanded_index_fn, scatter_add_expanded_index_stub); +DECLARE_DISPATCH(scatter_reduce_expanded_index_fn, scatter_reduce_expanded_index_stub); +DECLARE_DISPATCH(gather_expanded_index_fn, gather_expanded_index_stub); + +} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorAdvancedIndexingUtils.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorAdvancedIndexingUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..f9b616013ddb26e20bd26d33651e76863886e129 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorAdvancedIndexingUtils.h @@ -0,0 +1,94 @@ +#pragma once +#include +#include +#include + +namespace at::native { +namespace { +#ifndef STRIP_ERROR_MESSAGES +inline std::string shapes_as_str(TensorList tensors) { + std::ostringstream os; + bool first = true; + for (auto& tensor : tensors) { + if (tensor.defined()) { + if (!first) { + os << ", "; + } + os << tensor.sizes(); + first = false; + } + } + return os.str(); +} +#endif +} // anonymous namespace + +inline std::tuple canDispatchToMaskedFill(const Tensor& self, const torch::List>& indices, +const Tensor& value){ + if (!(value.numel() ==1 && value.device().is_cpu())){ + return std::make_tuple(false,Tensor()); + } + int64_t num_ind = 0; + Tensor mask; + auto self_device = self.device(); + for (const std::optional& i: indices) { + if (!i.has_value() || !(*i).defined()){ + num_ind++; + } else { + const Tensor &index = *i; + if ((index.scalar_type() != kByte && index.scalar_type() != kBool) || + index.device() != self_device || mask.defined()){ + return std::make_tuple(false, Tensor()); + } else { + mask = index; + for (const auto j : c10::irange(index.dim())) { + int64_t srcIdx = num_ind + j; + TORCH_CHECK_INDEX(index.size(j) == self.size(srcIdx), "The shape of the mask ", index.sizes(), " at index ", j, + " does not match the shape of the indexed tensor ", self.sizes(), " at index ", srcIdx); + } + num_ind += mask.ndimension(); + } + } + } + for (C10_UNUSED const auto i : c10::irange(num_ind, self.ndimension())) { + mask = mask.unsqueeze(-1); + } + return std::make_tuple(true, mask); +} + +inline AdvancedIndex make_info(Tensor self, IOptTensorListRef orig) { + checkIndexTensorTypes(orig, /*allow_int*/ true); + // first expand BoolTensor (masks) or ByteTensor (masks) into 1 or more LongTensors + auto indices = expandTensors(self, orig); + // next broadcast all index tensors together + try { + indices = expand_outplace(indices); + } catch (std::exception& e) { + TORCH_CHECK_INDEX(false, "shape mismatch: indexing tensors could not be broadcast together" + " with shapes ", shapes_as_str(indices)); + } + // add missing null Tensors so that it matches self.dim() + while (indices.size() < (size_t)self.dim()) { + indices.emplace_back(); + } + // if the non-null indices are not all adjacent, transpose self and indices + // together so that they're adjacent at the front + if (!hasContiguousSubspace(indices)) { + std::tie(self, indices) = transposeToFront(self, indices); + } + // Ensure indices are on the same device as self + for (auto & indice : indices) { + if (indice.defined() && indice.device() != self.device()) { + indice = indice.to(self.device()); + } + } + for (auto & indice : indices) { + if (indice.defined() && indice.dtype() == at::kInt) { + indice = indice.to(at::kLong); + } + } + + return AdvancedIndex(self, indices); +} + +} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorDimApply.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorDimApply.h new file mode 100644 index 0000000000000000000000000000000000000000..4d52446446316af9546bf3a1f5c0eca58a6e608f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorDimApply.h @@ -0,0 +1,55 @@ +#pragma once +#include +#include + +namespace at::native { +//input tensors are non-zero dim and non-empty +template + +void tensor_dim_apply3(const Tensor& self, Tensor& values, Tensor& indices, int64_t dim, Function func) { + int ndims = self.dim(); + int tensor_dim_apply_has_finished = 0; + std::vector counter(ndims, 0); + const T1* self_data = self.const_data_ptr(); + T1* values_data = values.data_ptr(); + T2* indices_data = indices.data_ptr(); + int64_t self_stride = self.stride(dim); + int64_t values_stride = values.stride(dim); + int64_t indices_stride = indices.stride(dim); + int self_dim_size = self.size(dim); + + while (!tensor_dim_apply_has_finished) { + func(self_data, values_data, indices_data, self_dim_size, self_stride, values_stride, indices_stride); + if (ndims == 1) { + break; + } + for (const auto dim_i : c10::irange(ndims)) { + if (dim_i == dim) { + if (dim_i == (ndims - 1)) { + tensor_dim_apply_has_finished = 1; + break; + } + continue; + } + counter[dim_i]++; + self_data += self.stride(dim_i); + values_data += values.stride(dim_i); + indices_data += indices.stride(dim_i); + + if (counter[dim_i] == self.size(dim_i)) { + if (dim_i == ndims-1) { + tensor_dim_apply_has_finished = 1; + break; + } else { + self_data -= counter[dim_i]*self.stride(dim_i); + values_data -= counter[dim_i]*values.stride(dim_i); + indices_data -= counter[dim_i]*indices.stride(dim_i); + counter[dim_i] = 0; + } + } else { + break; + } + } + } +} +} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorFactories.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorFactories.h new file mode 100644 index 0000000000000000000000000000000000000000..32d0a1dc535615e9baf96f02a4afa7c79b6d397a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorFactories.h @@ -0,0 +1,142 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + +namespace at::native { +// Different combinations of row, col, and offset can lead to two cases: +// +// Case 1 - Trapezoid (Triangle as a special case): row + offset <= col +// Example A: offset > 0 +// 1 1 0 0 0 +// 1 1 1 0 0 +// 1 1 1 1 0 +// Example B: offset <= 0 +// 0 0 0 +// 1 0 0 +// 1 1 0 +// In this case, we calculate the number of elements in the first row and +// last row of the tril respectively, and then compute the tril size. +// +// Case 2 - Trapezoid + Rectangle: row + offset > col +// Example: +// 1 1 0 +// 1 1 1 +// 1 1 1 +// In this case, we first calculate the size of top trapezoid, and then +// calculate the size of the bottom rectangle. +inline int64_t get_tril_size(int64_t row, int64_t col, int64_t offset) { + // If either dimension is 0 then the there is no tril + if (row == 0 || col == 0) { + return 0; + } + // number of elements in the first row of the tril + auto m_first_row = offset > 0 ? + std::min(col, 1 + offset) : // upper bounded by col + row + offset > 0; // either 0 or 1 + // number of elements in the last row of the tril, bounded by [0, col] + auto m_last_row = std::max(0, std::min(col, row + offset)); + // number of rows, bounded by [0, row] + auto n_row_all = std::max(0, std::min(row, row + offset)); + auto n_row_trapezoid = (m_last_row - m_first_row + 1); + + // calculate # of elements in the top trapezoid + auto tril_size = (m_first_row + m_last_row) * n_row_trapezoid >> 1; + + // calculate # of elements in the bottom rectangle if there is any + auto diff_row = n_row_all - n_row_trapezoid; + if (diff_row > 0) { + tril_size += diff_row * col; + } + + return tril_size; +} + +inline void check_args( + int64_t row, int64_t col, std::optional layout_opt) { + TORCH_CHECK(row >= 0, "row must be non-negative, got", row); + TORCH_CHECK(col >= 0, "col must be non-negative, got", col); + if (layout_opt.has_value()) { + TORCH_CHECK( + *layout_opt == at::kStrided, + "only support layout=torch.strided, got", + *layout_opt) + } +} + +using at::check_size_nonnegative; + +// assumes maximum value in created tensor is n-1 (e.g., torch.randperm(n)) +inline void check_supported_max_int_with_precision(int64_t n, const Tensor& tensor) { + // match defined() to behavior of checks below + TORCH_CHECK(at::scalar_tensor(n>0?n-1:n, tensor.options()).defined(), + "n is too large for result tensor type: '", tensor.toString(), "'"); + + // Ensure sufficient precision for floating point representation. + switch (tensor.scalar_type()) { + case at::ScalarType::Half: + TORCH_CHECK(n <= (int64_t(1) << 11) + 1, "n cannot be greater than 2049 for Half type."); + break; + case at::ScalarType::Float: + TORCH_CHECK(n <= (int64_t(1) << 24) + 1, "n cannot be greater than 2^24+1 for Float type."); + break; + case at::ScalarType::Double: // Unlikely to happen, but doesn't hurt to check + TORCH_CHECK(n <= (int64_t(1) << 53) + 1, "n cannot be greater than 2^53+1 for Double type."); + break; + default: + break; + } +} + +// Called by `empty*` functions when deterministic algorithms are enabled to +// fill the tensor with NaN if it is floating point or complex type, or fill +// with max value if it is integer type +inline Tensor& fill_empty_deterministic_(Tensor& tensor) { + if (tensor.is_floating_point() || tensor.is_complex()) { + AT_DISPATCH_V2( + tensor.scalar_type(), "fill_empty_deterministic_", AT_WRAP([&]() { + tensor.fill_(std::numeric_limits::quiet_NaN()); + }), AT_EXPAND(AT_FLOATING_TYPES), AT_EXPAND(AT_COMPLEX_TYPES), AT_EXPAND(AT_FLOAT8_TYPES), kBFloat16, kHalf); + } else { + AT_DISPATCH_V2( + tensor.scalar_type(), "fill_empty_deterministic_", AT_WRAP([&]() { + tensor.fill_(std::numeric_limits::max()); + }), kBool, AT_EXPAND(AT_INTEGRAL_TYPES_V2)); + } + return tensor; +} + +// The ZeroTensor allocator ignores whatever allocation is requested and always +// gives you nullptr +struct ZeroTensorAllocator final : public at::Allocator { + ZeroTensorAllocator(at::Device device) : device_(device) {}; + ~ZeroTensorAllocator() override = default; + static void deleter(void* const pointer) { + TORCH_INTERNAL_ASSERT(!pointer); + } + DataPtr allocate(const size_t /*nbytes*/) override { + return {nullptr, nullptr, &deleter, device_}; + } + DeleterFnPtr raw_deleter() const override { + return deleter; + } + void copy_data(void* dest [[maybe_unused]], const void* src [[maybe_unused]], std::size_t count [[maybe_unused]]) const final {} + at::Device device_; +}; + +using binary_fn = void (*)(TensorIterator&); + +DECLARE_DISPATCH(binary_fn, complex_stub); +DECLARE_DISPATCH(binary_fn, polar_stub); + +} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorIteratorDynamicCasting.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorIteratorDynamicCasting.h new file mode 100644 index 0000000000000000000000000000000000000000..a2bdd6eb13e4bde62cba1f9b5c65726abb5bb7d0 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorIteratorDynamicCasting.h @@ -0,0 +1,52 @@ +#pragma once + +#include +#include +#include +#include +#include + + +// This file includes utilities for dynamic_casting done by TensorIterator, see CUDALoops.cuh and Loops.h. + +// dynamic_casting handles when the types expected by the iterator do not match the types of the arguments +// to the function that is being called. +// On CUDA, the cast is currently pushed down into the kernel (for performance reasons). +// On CPU, there is currently an internal assert that a dynamic_cast is not needed. + +namespace at::native { + +// `needs_dynamic_casting` compares the types expected by iterator +// (i.e. dtypes of the operands) with the actual type of the arguments +// (and returns) of func_t +template::arity> +struct needs_dynamic_casting { + static bool check(TensorIteratorBase& iter) { + using traits = function_traits; + using cpp_type = typename traits::template arg::type; + using cpp_map = c10::CppTypeToScalarType; + + if (iter.input_dtype(nargs-1) != cpp_map::value) { + return true; + } + return needs_dynamic_casting::check(iter); + } +}; + +template +struct needs_dynamic_casting { + static bool check(TensorIteratorBase& iter) { + using traits = function_traits; + using cpp_type = typename traits::result_type; + + // we could assert output numbers are correct here, but checks + // (including arity) are currently pushed outside of this struct. + if constexpr (std::is_void_v) { + return false; + } else { + return iter.dtype(0) != c10::CppTypeToScalarType::value; + } + } +}; + +} //namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorShape.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorShape.h new file mode 100644 index 0000000000000000000000000000000000000000..c35023d076e73d1c812b87b1b308286ebc6a6b07 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorShape.h @@ -0,0 +1,105 @@ +#pragma once +#include +#include +#include + +namespace at::native { + +TORCH_API at::Tensor clone_preserve_strides(const at::Tensor& self); + +inline bool cat_should_skip_tensor(const Tensor& t) { + return t.sym_numel() == 0 && t.dim() == 1; +} + + // Check to see if the shape of tensors is compatible + // for being concatenated along a given dimension. +inline void check_cat_shape_except_dim(const Tensor & first, const Tensor & second, int64_t dimension, int64_t index) { + int64_t first_dims = first.dim(); + int64_t second_dims = second.dim(); + TORCH_CHECK(first_dims == second_dims, "Tensors must have same number of dimensions: got ", + first_dims, " and ", second_dims); + for (const auto dim : c10::irange(first_dims)) { + if (dim == dimension) { + continue; + } + int64_t first_dim_size = first.sizes()[dim]; + int64_t second_dim_size = second.sizes()[dim]; + TORCH_CHECK(first_dim_size == second_dim_size, "Sizes of tensors must match except in dimension ", + dimension, ". Expected size ", static_cast(first_dim_size), " but got size ", static_cast(second_dim_size), " for tensor number ", index, " in the list."); + } + } + +inline void check_cat_no_zero_dim(const MaterializedITensorListRef& tensors) { + int64_t i = 0; + for(const Tensor& t : tensors) { + TORCH_CHECK(t.dim() > 0, + "zero-dimensional tensor (at position ", i, ") cannot be concatenated"); + i++; + } +} + +inline int64_t get_num_splits(const Tensor& self, int64_t split_size, int64_t dim) { + TORCH_CHECK(self.dim() != 0, "split expects at least a 1-dimensional tensor"); + TORCH_CHECK(split_size >= 0, "split expects split_size be non-negative, but got split_size=", split_size); + int64_t dim_size = self.size(dim); + TORCH_CHECK(split_size > 0 || dim_size == 0, + "split_size can only be 0 if dimension size is 0, " + "but got dimension size of ", dim_size); + // if split_size is 0 and dimension size is 0, there is 1 split. + int64_t num_splits = 1; + if (split_size != 0) { + // ensuring num_splits is at least 1 makes consistent the case where split_size > dim_size + // (returns a single split). We might want to error here, but keep it for BC. + num_splits = std::max((dim_size + split_size - 1) / split_size, 1); + } + return num_splits; +} + +inline bool have_same_ndims(TensorList tensors) { + auto ndim = tensors[0].dim(); + for (const auto tensor_idx : c10::irange(tensors.size())) { + if(tensors[tensor_idx].dim() != ndim) { + return false; + } + } + return true; +} + +inline void leading_dimension_matches(TensorList tensors, int64_t dim) { + auto tensor_zero_size = tensors[0].sizes(); + std::vector leading_dim_sizes(tensor_zero_size.begin(), tensor_zero_size.begin() + dim); + for (const auto i : c10::irange(tensors.size())) { + at::Tensor tensor = tensors[i]; + for(const auto j : c10::irange(dim)) { + TORCH_CHECK( + tensor.size(j) == leading_dim_sizes[j], + "_chunk_cat expects same sizes of 0,...,dim-1 dimensions for all tensors" + ); + } + } +} + +inline int64_t preprocess_chunk_cat_inputs(TensorList tensors, int64_t dim, int64_t num_chunks) { + TORCH_CHECK(num_chunks >= 1, "_chunk_cat expects positive num_chunks"); + TORCH_CHECK(!tensors.empty(), + "_chunk_cat expects a non-empty input tensor list"); + auto expected_dtype = tensors[0].dtype(); + auto expected_device = tensors[0].device(); + for(const auto i : c10::irange(tensors.size())) { + TORCH_CHECK(tensors[i].numel() > 0, "_chunk_cat expects non-empty tensor"); + TORCH_CHECK(tensors[i].dtype() == expected_dtype, "_chunk_cat expects all input tensors with the same dtype"); + TORCH_CHECK(tensors[i].device() == expected_device, "_chunk_cat expects all inputs tensors on the same device"); + } + if (have_same_ndims(tensors)) { + dim = maybe_wrap_dim(dim, tensors[0].dim()); + } else { + TORCH_CHECK(dim >= 0, "_chunk_cat expects non-negative dim when input tensors have different ndims") + for(const auto i : c10::irange(tensors.size())) { + TORCH_CHECK(dim < tensors[i].ndimension(), "_chunk_cat expects dim < ndim for all input tensors"); + } + } + leading_dimension_matches(tensors, dim); + return dim; +} + +} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TriangularOpsUtils.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TriangularOpsUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..cc56fa6457e75bc980747afc9d2d72257d6c093b --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TriangularOpsUtils.h @@ -0,0 +1,57 @@ +#include +#include + +namespace at::native { + +/* + * Given batches of matrices with arbitrary batch dim, + * computes the number of batches for Triu and Tril. This ignores stride 0 dimension + */ +static inline int64_t batchCountTrilTriu(const Tensor& batched_matrices) { + int64_t result = 1; + for (int64_t i = 0; i < batched_matrices.ndimension() - 2; i++) { + if (batched_matrices.stride(i) != 0) { + result *= batched_matrices.size(i); + } + } + return result; +} + +/* Checks a necessary property for the triu and tril implementations, hence the name. + * Here batch contiguity is checked for tensors with greater than 4 dimensions. + * Contiguous tensors and tensors with less than 3 dimensions pass this check + */ +static inline std::tuple checkTrilTriuBatchContiguous(const Tensor& tensor, bool allow_zero_stride) { + // Complete contiguity is the most desired property, which is why + // we return true if the tensor is contiguous + if (tensor.is_contiguous()) { + auto default_strides_for_size = batched_matrix_contiguous_strides(tensor.sizes()); + if (tensor.strides() == default_strides_for_size) { + return std::make_tuple(true, tensor); + } else { + return std::make_tuple(false, tensor.as_strided(tensor.sizes(), default_strides_for_size)); + } + } + + int64_t dims = tensor.dim(); + + // Tensors with dimension less than 4 are handled by default + if (allow_zero_stride && dims <= 3) { + return std::make_tuple(true, tensor); + } + + int64_t expected_stride = tensor.size(-1) * tensor.size(-2); + for (int64_t i = dims - 3; i >= 0; i--) { + // Skip trivial dimension; + if (allow_zero_stride && i == 0 && (tensor.stride(i) == 0 || tensor.size(i) == 1)) { + continue; + } + if (expected_stride != tensor.stride(i)) { + return std::make_tuple(false, tensor.contiguous()); + } + expected_stride *= tensor.size(i); + } + return std::make_tuple(true, tensor); +} + +} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/UnaryOps.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/UnaryOps.h new file mode 100644 index 0000000000000000000000000000000000000000..3d99fdc40d048b208be65b4b44d6f0e59c9e1a62 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/UnaryOps.h @@ -0,0 +1,130 @@ +#pragma once + +#include +#include +#include +#include + +namespace at { +class Tensor; +class TensorBase; +struct TensorIteratorBase; +} + +namespace at::native { + +using unary_fn = void(*)(TensorIteratorBase&); +using unary_fn_with_scalar = void(*)(TensorIteratorBase&, const Scalar& a); + +inline namespace CPU_CAPABILITY { +void conj_kernel(TensorIteratorBase &iter); +void neg_kernel(TensorIteratorBase &iter); +void reciprocal_kernel(TensorIteratorBase &iter); +void rsqrt_kernel(TensorIteratorBase& iter); +void sqrt_kernel(TensorIteratorBase& iter); +} // namespace CPU_CAPABILITY + +DECLARE_DISPATCH(unary_fn, abs_stub); +DECLARE_DISPATCH(unary_fn, angle_stub); +DECLARE_DISPATCH(unary_fn, conj_physical_stub); +DECLARE_DISPATCH(unary_fn, acos_stub); +DECLARE_DISPATCH(unary_fn, acosh_stub); +DECLARE_DISPATCH(unary_fn, asinh_stub); +DECLARE_DISPATCH(unary_fn, atanh_stub); +DECLARE_DISPATCH(unary_fn, asin_stub); +DECLARE_DISPATCH(unary_fn, atan_stub); +DECLARE_DISPATCH(unary_fn, bitwise_not_stub); +DECLARE_DISPATCH(unary_fn, logical_not_stub); +DECLARE_DISPATCH(unary_fn, ceil_stub); +DECLARE_DISPATCH(unary_fn, cos_stub); +DECLARE_DISPATCH(unary_fn, cosh_stub); +DECLARE_DISPATCH(unary_fn, digamma_stub); +DECLARE_DISPATCH(unary_fn, special_entr_stub); +DECLARE_DISPATCH(unary_fn, special_erfcx_stub); +DECLARE_DISPATCH(unary_fn, erf_stub); +DECLARE_DISPATCH(unary_fn, erfc_stub); +DECLARE_DISPATCH(unary_fn, erfinv_stub); +DECLARE_DISPATCH(unary_fn, exp_stub); +DECLARE_DISPATCH(unary_fn, exp2_stub); +DECLARE_DISPATCH(unary_fn, expm1_stub); +DECLARE_DISPATCH(unary_fn, floor_stub); +DECLARE_DISPATCH(unary_fn, frac_stub); +DECLARE_DISPATCH(unary_fn, frexp_stub); +DECLARE_DISPATCH(unary_fn, i0_stub); +DECLARE_DISPATCH(unary_fn, special_i0e_stub); +DECLARE_DISPATCH(unary_fn, special_i1_stub); +DECLARE_DISPATCH(unary_fn, special_i1e_stub); +DECLARE_DISPATCH(unary_fn, log_stub); +DECLARE_DISPATCH(unary_fn, log10_stub); +DECLARE_DISPATCH(unary_fn, log1p_stub); +DECLARE_DISPATCH(unary_fn, log2_stub); +DECLARE_DISPATCH(unary_fn, special_ndtri_stub); +DECLARE_DISPATCH(unary_fn, special_log_ndtr_stub); +DECLARE_DISPATCH(unary_fn, neg_stub); + +DECLARE_DISPATCH(unary_fn, reciprocal_stub); +DECLARE_DISPATCH(unary_fn, round_stub); +DECLARE_DISPATCH(unary_fn, rsqrt_stub); +DECLARE_DISPATCH(unary_fn, sigmoid_stub); +DECLARE_DISPATCH(unary_fn_with_scalar, logit_stub); +DECLARE_DISPATCH(unary_fn, sign_stub); +DECLARE_DISPATCH(unary_fn, signbit_stub); +DECLARE_DISPATCH(unary_fn, sgn_stub); +DECLARE_DISPATCH(unary_fn, sin_stub); +DECLARE_DISPATCH(unary_fn, sinc_stub); +DECLARE_DISPATCH(unary_fn, sinh_stub); +DECLARE_DISPATCH(unary_fn, sqrt_stub); +DECLARE_DISPATCH(unary_fn, tan_stub); +DECLARE_DISPATCH(unary_fn, tanh_stub); +DECLARE_DISPATCH(unary_fn, trigamma_stub); +DECLARE_DISPATCH(unary_fn, trunc_stub); +DECLARE_DISPATCH(unary_fn, lgamma_stub); +DECLARE_DISPATCH(unary_fn, special_airy_ai_stub); +DECLARE_DISPATCH(unary_fn, special_bessel_j0_stub); +DECLARE_DISPATCH(unary_fn, special_bessel_j1_stub); +DECLARE_DISPATCH(unary_fn, special_bessel_y0_stub); +DECLARE_DISPATCH(unary_fn, special_bessel_y1_stub); +DECLARE_DISPATCH(unary_fn, special_modified_bessel_i0_stub); +DECLARE_DISPATCH(unary_fn, special_modified_bessel_i1_stub); +DECLARE_DISPATCH(unary_fn, special_modified_bessel_k0_stub); +DECLARE_DISPATCH(unary_fn, special_modified_bessel_k1_stub); +DECLARE_DISPATCH(unary_fn, special_scaled_modified_bessel_k0_stub); +DECLARE_DISPATCH(unary_fn, special_scaled_modified_bessel_k1_stub); +DECLARE_DISPATCH(unary_fn, special_spherical_bessel_j0_stub); + +// NB: these are actually defined in Distribution +DECLARE_DISPATCH(void(*)(const TensorBase&, const TensorBase&, std::optional), bernoulli_tensor_stub); +DECLARE_DISPATCH(void(*)(const TensorBase&, const double, std::optional), bernoulli_scalar_stub); +DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, std::optional), cauchy_stub); +DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, std::optional), exponential_stub); +DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, std::optional), geometric_stub); +DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, std::optional), log_normal_stub); +DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, std::optional), uniform_stub); +DECLARE_DISPATCH(void(*)(const TensorBase&, const double, const double, std::optional), normal_stub); +DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const uint64_t, const int64_t, std::optional), random_from_to_stub); +DECLARE_DISPATCH(void(*)(TensorIteratorBase&, std::optional), random_full_64_bits_range_stub); +DECLARE_DISPATCH(void(*)(TensorIteratorBase&, std::optional), random_stub); + +DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const int64_t, const double), kaiser_window_stub); +DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const int64_t), polygamma_stub); +DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const Scalar& a, const Scalar& b), clamp_stub); +DECLARE_DISPATCH( + void (*)(Tensor&, const Tensor&, int64_t, std::optional), + multinomial_with_replacement_stub); +DECLARE_DISPATCH( + void (*)( + TensorIteratorBase&, + std::optional, + std::optional, + std::optional), + nan_to_num_stub); +DECLARE_DISPATCH(void (*)(TensorIteratorBase&, int64_t), round_decimals_stub); + +// Missing unary functions +// digamma +// lgamma +// erfinv +// clone +// contiguous +// zero +} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Unfold2d.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Unfold2d.h new file mode 100644 index 0000000000000000000000000000000000000000..e5fe7d4468217b16818f4bcfc06fb0481175be38 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Unfold2d.h @@ -0,0 +1,48 @@ +#pragma once + +#include +#include +#include + +namespace at::native { + +using unfold2d_copy_fn = void (*)( + ScalarType dtype, + void *finput, + const void *input, + int64_t kH, + int64_t kW, + int64_t dH, + int64_t dW, + int64_t padH, + int64_t padW, + int64_t n_input_plane, + int64_t input_height, + int64_t input_width, + int64_t output_height, + int64_t output_width, + bool is_channels_last +); + +using unfold2d_acc_fn = void (*)( + ScalarType dtype, + void *finput, + void *input, + int64_t kH, + int64_t kW, + int64_t dH, + int64_t dW, + int64_t padH, + int64_t padW, + int64_t n_input_plane, + int64_t input_height, + int64_t input_width, + int64_t output_height, + int64_t output_width, + bool is_channels_last +); + +DECLARE_DISPATCH(unfold2d_copy_fn, unfolded2d_copy_stub); +DECLARE_DISPATCH(unfold2d_acc_fn, unfolded2d_acc_stub); + +} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Unfold3d.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Unfold3d.h new file mode 100644 index 0000000000000000000000000000000000000000..90ead9d1f7ad48187b8c0a28af2fd59915887d17 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Unfold3d.h @@ -0,0 +1,49 @@ +#pragma once + +#include + +namespace at::native { + +void Unfold3dCopyCPU( + ScalarType dtype, + const void *src, + int64_t C, + int64_t X_D, + int64_t X_H, + int64_t X_W, + int64_t Y_D, + int64_t Y_H, + int64_t Y_W, + int64_t kernel_d, + int64_t kernel_h, + int64_t kernel_w, + int64_t stride_d, + int64_t stride_h, + int64_t stride_w, + int64_t pad_d, + int64_t pad_h, + int64_t pad_w, + void* dst); + +void Unfold3dAccCPU( + ScalarType dtype, + const void *src, + int64_t C, + int64_t X_D, + int64_t X_H, + int64_t X_W, + int64_t Y_D, + int64_t Y_H, + int64_t Y_W, + int64_t kernel_d, + int64_t kernel_h, + int64_t kernel_w, + int64_t stride_d, + int64_t stride_h, + int64_t stride_w, + int64_t pad_d, + int64_t pad_h, + int64_t pad_w, + void *dst); + +} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/UnfoldBackward.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/UnfoldBackward.h new file mode 100644 index 0000000000000000000000000000000000000000..44e05c125913ec234247938a25b47e5feeb9f183 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/UnfoldBackward.h @@ -0,0 +1,112 @@ +#pragma once + +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + +namespace at::native { + +using unfold_backward_fn = void (*)( + Tensor& grad_in, + const Tensor& grad, + int64_t dim, + int64_t size, + int64_t step +); + +DECLARE_DISPATCH(unfold_backward_fn, unfold_backward_stub); + +namespace { + +// Note on naming: it is unconventional. +// grad_in does not mean that it is a gradient wrt to input, +// grad_in/grad_out is just an input/output of unfold_backward kernel. + +static C10_UNUSED TensorIterator _make_unfold_backward_iter_over_grad_out( + Tensor& grad_out, + const Tensor& grad_in, + int64_t dim, + int64_t size, + int64_t step +) { + dim = maybe_wrap_dim(dim, grad_out.dim()); + // last dim stores the folds + + auto grad_out_dim_size = ensure_nonempty_size(grad_out, dim); + auto grad_in_dim_size = ensure_nonempty_size(grad_in, dim); + // dictates the number of elements to iterate over + // in dimension `dim` + auto iter_dim_size = std::min( + grad_out_dim_size, + (grad_in_dim_size - 1) * step + size + ); + + /* prepare grad_out for TensorIterator { */ + auto grad_out_strides = ensure_nonempty_vec(grad_out.strides().vec()); + auto grad_out_sizes = ensure_nonempty_vec(grad_out.sizes().vec()); + grad_out_sizes[dim] = iter_dim_size; + auto grad_out_restrided = grad_out.as_strided( + grad_out_sizes, grad_out_strides + ); + /* } */ + + /* prepare grad_in for TensorIterator { */ + auto grad_in_strides = ensure_nonempty_vec(grad_in.strides().vec()); + auto grad_in_sizes = ensure_nonempty_vec(grad_in.sizes().vec()); + + // set strides for dim to 0 + // and size to 1 because + // this dimension is indexed inside the kernel + grad_in_strides[dim] = 0; + grad_in_sizes[dim] = 1; + + grad_in_strides.pop_back(); + grad_in_sizes.pop_back(); + + auto grad_in_restrided = grad_in.squeeze(-1).as_strided( + grad_in_sizes, grad_in_strides + ); + /* } */ + + // During the TensorIterator iteration we have to know + // i_dim in grad_out[i_1,...,i_dim,...i_n], + // idx_dim stores this information + /* prepare idx_dim for TensorIterator { */ + auto idx_dim = at::arange( + 0, iter_dim_size, grad_in.options().dtype(at::kLong) + ); + + auto grad_out_dim = ensure_nonempty_dim(grad_out.dim()); + + auto idx_dim_strides = std::vector(grad_out_dim, 0); + auto idx_dim_sizes = std::vector(grad_out_dim, 1); + + idx_dim_strides[dim] = 1; + idx_dim_sizes[dim] = iter_dim_size; + + // idx_dim size will broadcast over determined by grad_out sizes in TensorIterator + auto idx_dim_restrided = idx_dim.as_strided(idx_dim_sizes, idx_dim_strides); + /* } */ + + auto iter = TensorIteratorConfig() + .set_check_mem_overlap(false) + .check_all_same_dtype(false) + .resize_outputs(false) + .add_owned_output(grad_out_restrided) + .add_owned_const_input(grad_in_restrided) + .add_owned_const_input(idx_dim_restrided) + .build(); + + return iter; +} + +} + +} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/UpSample.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/UpSample.h new file mode 100644 index 0000000000000000000000000000000000000000..033ef2b7fad3ec39a2e9f9ec6ae176b94903bb34 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/UpSample.h @@ -0,0 +1,505 @@ +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * Note [compute_scales_value] + * Note [area_pixel_compute_scale] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * Interpolate with scale_factor can have different behaviors + * depending on the value of recompute_scale_factor: + * + * - With recompute_scale_factor = True (current default behavior): + * the scale_factor, when provided by the user, are used to calculate + * the output size. The input size and the computed output_size + * are then used to infer new values for the scales which are + * used in the interpolation. Because floating-point math is not exact, + * this may be a different value from the user-supplied scales. + * + * - With recompute_scale_factor = False (which will be the default + * behavior starting 1.5.0): + * the behavior follows opencv logic, and the scales provided by + * the user are the ones used in the interpolation calculations. + * + * If the scales are not provided or if they are provided but + * recompute_scale_factor is set to True (default behavior), the scales + * are computed from the input and the output size; + * + * + * When the scales are inferred from the input and output sizes, + * we view each pixel as an area, idx + 0.5 as its center index. + * Here is an example formula in 1D case. + * if align_corners: center of two corner pixel areas are preserved, + * (0.5, 0.5) -> (0.5, 0.5), + * (input_size - 0.5, 0.5) -> (output_size - 0.5) + * scale = (input_size - 0.5 - 0.5) / (output_size - 0.5 - 0.5) + * src_index + 0.5 - 0.5 = scale * (dst_index + 0.5 - 0.5) + * if not align_corners: the whole range is scaled accordingly + * scale = input_size / output_size + * src_idx + 0.5 = scale * (dst_index + 0.5) + */ + +namespace at::native { + +namespace upsample { + +TORCH_API c10::SmallVector compute_output_size( + c10::IntArrayRef input_size, // Full input tensor size. + at::OptionalIntArrayRef output_size, + std::optional> scale_factors); + +inline std::optional get_scale_value(std::optional> scales, int idx) { + if (!scales) { + return std::nullopt; + } + return scales->at(idx); +} + +} // namespace upsample + +using scale_t = std::optional; +using upsampling_nearest1d = void(*)(const Tensor& output, const Tensor& input, scale_t scales_w); +using _upsampling_nearest_exact1d = void(*)(const Tensor& output, const Tensor& input, scale_t scales_w); +using upsampling_nearest2d = void(*)(const Tensor& output, const Tensor& input, scale_t scales_h, scale_t scales_w); +using _upsampling_nearest_exact2d = void(*)(const Tensor& output, const Tensor& input, scale_t scales_h, scale_t scales_w); +using upsampling_nearest3d = void(*)(const Tensor& output, const Tensor& input, scale_t scales_d, scale_t scales_h, scale_t scales_w); +using _upsampling_nearest_exact3d = void(*)(const Tensor& output, const Tensor& input, scale_t scales_d, scale_t scales_h, scale_t scales_w); +using upsampling_linear1d = void(*)(const Tensor& output, const Tensor& input, bool align_corners, scale_t scales_w); +using upsampling_bilinear2d = void(*)(const Tensor& output, const Tensor& input, bool align_corners, scale_t scales_h, scale_t scales_w); +using _upsampling_bilinear2d_aa = void(*)(const Tensor& output, const Tensor& input, bool align_corners, scale_t scales_h, scale_t scales_w); +using upsampling_trilinear3d = void(*)(const Tensor& output, const Tensor& input, bool align_corners, scale_t scales_d, scale_t scales_h, scale_t scales_w); +using upsampling_bicubic2d = void(*)(const Tensor& output, const Tensor& input, bool align_corners, scale_t scales_h, scale_t scales_w); +using _upsampling_bicubic2d_aa = void(*)(const Tensor& output, const Tensor& input, bool align_corners, scale_t scales_h, scale_t scales_w); +DECLARE_DISPATCH(upsampling_nearest1d, upsample_nearest1d_kernel); +DECLARE_DISPATCH(_upsampling_nearest_exact1d, _upsample_nearest_exact1d_kernel); +DECLARE_DISPATCH(upsampling_nearest2d, upsample_nearest2d_kernel); +DECLARE_DISPATCH(_upsampling_nearest_exact2d, _upsample_nearest_exact2d_kernel); +DECLARE_DISPATCH(upsampling_nearest3d, upsample_nearest3d_kernel); +DECLARE_DISPATCH(_upsampling_nearest_exact3d, _upsample_nearest_exact3d_kernel); +DECLARE_DISPATCH(upsampling_nearest1d, upsample_nearest1d_backward_kernel); +DECLARE_DISPATCH(_upsampling_nearest_exact1d, _upsample_nearest_exact1d_backward_kernel); +DECLARE_DISPATCH(upsampling_nearest2d, upsample_nearest2d_backward_kernel); +DECLARE_DISPATCH(_upsampling_nearest_exact2d, _upsample_nearest_exact2d_backward_kernel); +DECLARE_DISPATCH(upsampling_nearest3d, upsample_nearest3d_backward_kernel); +DECLARE_DISPATCH(_upsampling_nearest_exact3d, _upsample_nearest_exact3d_backward_kernel); +DECLARE_DISPATCH(upsampling_linear1d, upsample_linear1d_kernel); +DECLARE_DISPATCH(upsampling_bilinear2d, upsample_bilinear2d_kernel); +DECLARE_DISPATCH(_upsampling_bilinear2d_aa, _upsample_bilinear2d_aa_kernel); +DECLARE_DISPATCH(upsampling_trilinear3d, upsample_trilinear3d_kernel); +DECLARE_DISPATCH(upsampling_linear1d, upsample_linear1d_backward_kernel); +DECLARE_DISPATCH(upsampling_bilinear2d, upsample_bilinear2d_backward_kernel); +DECLARE_DISPATCH(_upsampling_bilinear2d_aa, _upsample_bilinear2d_aa_backward_kernel); +DECLARE_DISPATCH(upsampling_trilinear3d, upsample_trilinear3d_backward_kernel); +DECLARE_DISPATCH(upsampling_bicubic2d, upsample_bicubic2d_kernel); +DECLARE_DISPATCH(_upsampling_bicubic2d_aa, _upsample_bicubic2d_aa_kernel); +DECLARE_DISPATCH(_upsampling_bicubic2d_aa, _upsample_bicubic2d_aa_backward_kernel); + +inline C10_UNUSED std::array upsample_1d_common_check(IntArrayRef input_size, IntArrayRef output_size) { + TORCH_CHECK( + output_size.size() == 1, + "It is expected output_size equals to 1, but got size ", + output_size.size()); + + TORCH_CHECK( + input_size.size() == 3, + "It is expected input_size equals to 3, but got size ", + input_size.size()); + + int64_t output_width = output_size[0]; + + int64_t nbatch = input_size[0]; + int64_t channels = input_size[1]; + int64_t input_width = input_size[2]; + + TORCH_CHECK( + input_width > 0 && output_width > 0, + "Input and output sizes should be greater than 0, but got input (W: ", + input_width, + ") and output (W: ", + output_width, + ")"); + + return {nbatch, channels, output_width}; +} + +inline C10_UNUSED std::array upsample_2d_common_check(IntArrayRef input_size, IntArrayRef output_size) { + TORCH_CHECK( + output_size.size() == 2, + "It is expected output_size equals to 2, but got size ", + output_size.size()); + + TORCH_CHECK( + input_size.size() == 4, + "It is expected input_size equals to 4, but got size ", + input_size.size()); + + int64_t output_height = output_size[0]; + int64_t output_width = output_size[1]; + + int64_t nbatch = input_size[0]; + int64_t channels = input_size[1]; + int64_t input_height = input_size[2]; + int64_t input_width = input_size[3]; + + TORCH_CHECK( + input_height > 0 && input_width > 0 && output_height > 0 && + output_width > 0, + "Input and output sizes should be greater than 0," + " but got input (H: ", + input_height, + ", W: ", + input_width, + ") output (H: ", + output_height, + ", W: ", + output_width, + ")"); + + return {nbatch, channels, output_height, output_width}; +} + +inline C10_UNUSED +std::array upsample_3d_common_check(IntArrayRef input_size, IntArrayRef output_size) { + TORCH_CHECK( + output_size.size() == 3, + "It is expected output_size equals to 3, but got size ", + output_size.size()); + + TORCH_CHECK( + input_size.size() == 5, + "It is expected input_size equals to 5, but got size ", + input_size.size()); + + int64_t output_depth = output_size[0]; + int64_t output_height = output_size[1]; + int64_t output_width = output_size[2]; + + int64_t nbatch = input_size[0]; + int64_t channels = input_size[1]; + int64_t input_depth = input_size[2]; + int64_t input_height = input_size[3]; + int64_t input_width = input_size[4]; + + TORCH_CHECK( + input_depth > 0 && input_height > 0 && input_width > 0 && + output_depth > 0 && output_height > 0 && output_width > 0, + "Input and output sizes should be greater than 0, but got input (D: ", + input_depth, + ", H: ", + input_height, + ", W: ", + input_width, + ") output (D: ", + output_depth, + ", H: ", + output_height, + ", W: ", + output_width, + ")"); + + + return {nbatch, channels, output_depth, output_height, output_width}; +} + +inline void upsample_2d_shape_check( + const Tensor& input, + const Tensor& grad_output, + int64_t nbatch, + int64_t nchannels, + int64_t input_height, + int64_t input_width, + int64_t output_height, + int64_t output_width) { + TORCH_CHECK( + input_height > 0 && input_width > 0 && output_height > 0 && + output_width > 0, + "Input and output sizes should be greater than 0," + " but got input (H: ", + input_height, + ", W: ", + input_width, + ") output (H: ", + output_height, + ", W: ", + output_width, + ")"); + + if (input.defined()) { + // Allow for empty batch size but not other dimensions + TORCH_CHECK( + (input.numel() != 0 || + (input.size(1) != 0 && input.size(2) != 0 && input.size(3) != 0) + ) && + input.dim() == 4, + "Non-empty 4D data tensor expected but got a tensor with sizes ", + input.sizes()); + } else if (grad_output.defined()) { + check_dim_size(grad_output, 4, 0, nbatch); + check_dim_size(grad_output, 4, 1, nchannels); + check_dim_size(grad_output, 4, 2, output_height); + check_dim_size(grad_output, 4, 3, output_width); + } +} + +template +inline scalar_t compute_scales_value( + const std::optional scale, + int64_t input_size, + int64_t output_size) { + // see Note [compute_scales_value] + // FIXME: remove magic > 0 after we ensure no models were serialized with -1 defaults. + return (scale.has_value() && scale.value() > 0.) + ? static_cast(1.0 / scale.value()) + : (static_cast(input_size) / output_size); +} + +template +inline scalar_t area_pixel_compute_scale( + int64_t input_size, + int64_t output_size, + bool align_corners, + const std::optional scale) { + // see Note [area_pixel_compute_scale] + if(align_corners) { + if(output_size > 1) { + return static_cast(input_size - 1) / (output_size - 1); + } else { + return static_cast(0); + } + } else { + return compute_scales_value(scale, input_size, output_size); + } +} + +template +inline scalar_t area_pixel_compute_source_index( + scalar_t scale, + int64_t dst_index, + bool align_corners, + bool cubic) { + if (align_corners) { + return scale * dst_index; + } else { + scalar_t src_idx = scale * (dst_index + static_cast(0.5)) - + static_cast(0.5); + // [Note] Follow Opencv resize logic: + // We allow negative src_idx here and later will use + // dx = src_idx - floorf(src_idx) + // to compute the "distance"(which affects weights). + // For linear modes, weight distribution doesn't matter + // for negative indices as they use 2 pixels to interpolate. + // For example, [-1, 0], they both use pixel 0 value so it + // doesn't affect if we bound the src_idx to 0 or not. + // TODO: Our current linear mode impls use unbound indices + // where we should and then remove this cubic flag. + // This matters in cubic mode, as we might need [-1, 0, 1, 2] + // to interpolate and the weights can be affected. + return (!cubic && src_idx < static_cast(0)) ? scalar_t(0) + : src_idx; + } +} + +inline int64_t nearest_neighbor_compute_source_index( + const float scale, + int64_t dst_index, + int64_t input_size) { + // Index computation matching OpenCV INTER_NEAREST + // which is buggy and kept for BC + const int64_t src_index = + std::min(static_cast(floorf(dst_index * scale)), input_size - 1); + return src_index; +} + +inline int64_t nearest_neighbor_exact_compute_source_index( + const float scale, + int64_t dst_index, + int64_t input_size) { + // index_f32 = (output_index + 0.5) * scale - 0.5 + // input_index = round(index_f32) + // Same as Pillow and Scikit-Image/Scipy ndi.zoom + const int64_t src_index = + std::min(static_cast(floorf((dst_index + 0.5) * scale)), input_size - 1); + return src_index; +} + +inline int64_t nearest_idx( + int64_t output_index, + int64_t input_size, + int64_t output_size, + std::optional scales) { + // This method specificly treats cases: output_size == input_size or + // output_size == 2 * input_size, that we would like to get rid of + // We keep this method for BC and consider as deprecated. + // See nearest_exact_idx as replacement + if (output_size == input_size) { + // scale_factor = 1, simply copy + return output_index; + } else if (output_size == 2 * input_size) { + // scale_factor = 2, shift input index + return output_index >> 1; + } else { + float scale = compute_scales_value(scales, input_size, output_size); + return nearest_neighbor_compute_source_index(scale, output_index, input_size); + } +} + +inline int64_t nearest_exact_idx( + int64_t output_index, + int64_t input_size, + int64_t output_size, + std::optional scales) { + float scale = compute_scales_value(scales, input_size, output_size); + return nearest_neighbor_exact_compute_source_index(scale, output_index, input_size); +} + +// Define a typedef to dispatch to nearest_idx or nearest_exact_idx +typedef int64_t (*nearest_idx_fn_t)(int64_t, int64_t, int64_t, std::optional); + +template +scalar_t upsample_get_value_bounded( + scalar_t* data, + int64_t width, + int64_t height, + int64_t x, + int64_t y) { + int64_t access_x = std::max(std::min(x, width - 1), static_cast(0)); + int64_t access_y = std::max(std::min(y, height - 1), static_cast(0)); + return data[access_y * width + access_x]; +} + +template +void upsample_increment_value_bounded( + scalar_t* data, + int64_t width, + int64_t height, + int64_t x, + int64_t y, + scalar_t value) { + int64_t access_x = std::max(std::min(x, width - 1), static_cast(0)); + int64_t access_y = std::max(std::min(y, height - 1), static_cast(0)); + data[access_y * width + access_x] += value; +} + +// Based on +// https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm +template +scalar_t cubic_convolution1(scalar_t x, scalar_t A) { + return ((A + 2) * x - (A + 3)) * x * x + 1; +} + +template +scalar_t cubic_convolution2(scalar_t x, scalar_t A) { + return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A; +} + +template +void get_cubic_upsample_coefficients( + scalar_t coeffs[4], + scalar_t t) { + scalar_t A = -0.75; + + scalar_t x1 = t; + coeffs[0] = cubic_convolution2(x1 + 1.0, A); + coeffs[1] = cubic_convolution1(x1, A); + + // opposite coefficients + scalar_t x2 = 1.0 - t; + coeffs[2] = cubic_convolution1(x2, A); + coeffs[3] = cubic_convolution2(x2 + 1.0, A); +} + +template +inline scalar_t cubic_interp1d( + scalar_t x0, + scalar_t x1, + scalar_t x2, + scalar_t x3, + scalar_t t) { + scalar_t coeffs[4]; + get_cubic_upsample_coefficients(coeffs, t); + + return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3]; +} + +// when `real_input_index` becomes larger than the range the floating point +// type can accurately represent, the type casting to `int64_t` might exceed +// `input_size`, causing overflow. So we guard it with `std::min` below. +template +inline void guard_index_and_lambda(const opmath_t& real_input_index, const int64_t& input_size, int64_t& input_index, scalar_t& lambda) { + input_index = std::min(static_cast(floorf(real_input_index)), input_size - 1); + lambda = std::min( + std::max(real_input_index - input_index, static_cast(0)), + static_cast(1) + ); +} + +template +inline void compute_source_index_and_lambda( + int64_t& input_index0, + int64_t& input_index1, + scalar_t& lambda0, + scalar_t& lambda1, + opmath_t ratio, + int64_t output_index, + int64_t input_size, + int64_t output_size, + bool align_corners) { + if (output_size == input_size) { + // scale_factor = 1, simply copy + input_index0 = output_index; + input_index1 = output_index; + lambda0 = static_cast(1); + lambda1 = static_cast(0); + } else { + const auto real_input_index = + area_pixel_compute_source_index( + ratio, output_index, align_corners, /*cubic=*/false); + guard_index_and_lambda(real_input_index, input_size, input_index0, lambda1); + int64_t offset = (input_index0 < input_size - 1) ? 1 : 0; + input_index1 = input_index0 + offset; + lambda0 = static_cast(1.) - lambda1; + } +} + +// It will not be used by data types other than BFloat16 and Half. +template || !std::is_same::value, int> = 0> +void inline apply_grad_input(scalar_in* buffer_ptr, scalar_out* gin, int64_t size) { + TORCH_CHECK((is_reduced_floating_point_v), + "Upsample backward only support BFloat16 and Half in the lower precision data types on CPU.") + TORCH_CHECK((std::is_same::value), + "Upsample backward should use float as acc buffer for BFloat16 and Half grad input on CPU.") + return; +} + +template && std::is_same::value, int> = 0> +void inline apply_grad_input(scalar_in* buffer_ptr, scalar_out* gin, int64_t size) { + using bVec = Vectorized; + using fVec = Vectorized; + int64_t d = 0; + for (; d < size - (size % bVec::size()); d += bVec::size()) { + bVec gin_bvec = bVec::loadu(gin + d); + auto [gin_fvec0, gin_fvec1] = convert_to_float(gin_bvec); + gin_fvec0 += fVec::loadu(buffer_ptr + d); + gin_fvec1 += fVec::loadu(buffer_ptr + d + fVec::size()); + fVec(0).store(buffer_ptr + d); + fVec(0).store(buffer_ptr + d + fVec::size()); + convert_from_float(gin_fvec0, gin_fvec1).store(gin + d); + } + for (; d < size; d++) { + gin[d] += buffer_ptr[d]; + buffer_ptr[d] = 0; + } +} + +} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/batch_norm.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/batch_norm.h new file mode 100644 index 0000000000000000000000000000000000000000..eba4b0a963241d8ed97eda561d1b3838b961716f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/batch_norm.h @@ -0,0 +1,38 @@ +#pragma once + +#include +#include + +namespace at::native { + +using batch_norm_fn = void (*)(Tensor&, const Tensor&, const Tensor&, + const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, bool, double); +using batch_norm_collect_stats_fn = void (*)(Tensor&, Tensor&, const Tensor&); +using batch_norm_backward_fn = void(*)(Tensor&, Tensor&, Tensor&, const Tensor&, + const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, bool, double); + +DECLARE_DISPATCH(batch_norm_fn, batch_norm_cpu_stub); +DECLARE_DISPATCH(batch_norm_collect_stats_fn, batch_norm_cpu_collect_stats_stub); +DECLARE_DISPATCH(batch_norm_backward_fn, batch_norm_cpu_backward_stub); + +// TensorAccessor when it is defined to work around undefined... +template +static TensorAccessor conditional_accessor_1d(const Tensor& t) { + if (! t.defined()) { + return TensorAccessor(nullptr, nullptr, nullptr); + } + return t.accessor(); +} + +template +static scalar_t* conditional_data_ptr(const Tensor& t) { + if constexpr (std::is_const_v) { + return t.defined() ? t.contiguous().const_data_ptr() + : nullptr; + } else { + return t.defined() ? t.contiguous().data_ptr() + : nullptr; + } +} + +} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/CuFFTPlanCache.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/CuFFTPlanCache.h new file mode 100644 index 0000000000000000000000000000000000000000..6bcd57027d51725aa3f315b9c35b9db126fc76c9 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/CuFFTPlanCache.h @@ -0,0 +1,494 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace at { namespace native { namespace detail { + +// Enum representing the FFT type +enum class CuFFTTransformType : int8_t { + C2C, // Complex-to-complex + R2C, // Real-to-complex + C2R, // Complex-to-real +}; + +// This struct is used to let us easily compute hashes of the +// parameters. +// It will be the **key** to the plan cache. +struct CuFFTParams +{ + int64_t signal_ndim_; // between 1 and max_rank, i.e., 1 <= signal_ndim <= 3 + // These include additional batch dimension as well. + int64_t sizes_[max_rank + 1]; + int64_t input_strides_[max_rank + 1]; + int64_t output_strides_[max_rank + 1]; + CuFFTTransformType fft_type_; + ScalarType value_type_; + + CuFFTParams() = default; + + CuFFTParams(IntArrayRef in_strides, IntArrayRef out_strides, + IntArrayRef signal_sizes, CuFFTTransformType fft_type, ScalarType value_type) { + // Padding bits must be zeroed for hashing + memset(this, 0, sizeof(*this)); + signal_ndim_ = signal_sizes.size() - 1; + fft_type_ = fft_type; + value_type_ = value_type; + + TORCH_INTERNAL_ASSERT(in_strides.size() == signal_sizes.size()); + TORCH_INTERNAL_ASSERT(out_strides.size() == signal_sizes.size()); + TORCH_INTERNAL_ASSERT(1 <= signal_ndim_ && signal_ndim_ <= max_rank); + + std::copy(signal_sizes.cbegin(), signal_sizes.cend(), sizes_); + std::copy(in_strides.cbegin(), in_strides.cend(), input_strides_); + std::copy(out_strides.cbegin(), out_strides.cend(), output_strides_); + } +}; + +static_assert(std::is_trivial::value, ""); + +// Returns true if the transform type has complex input +inline bool cufft_complex_input(CuFFTTransformType type) { + switch (type) { + case CuFFTTransformType::C2C: + case CuFFTTransformType::C2R: + return true; + + case CuFFTTransformType::R2C: + return false; + } + TORCH_INTERNAL_ASSERT(false); +} + +// Returns true if the transform type has complex output +inline bool cufft_complex_output(CuFFTTransformType type) { + switch (type) { + case CuFFTTransformType::C2C: + case CuFFTTransformType::R2C: + return true; + + case CuFFTTransformType::C2R: + return false; + } + TORCH_INTERNAL_ASSERT(false); +} + +// Create transform type enum from bools representing if input and output are complex +inline CuFFTTransformType GetCuFFTTransformType(bool complex_input, bool complex_output) { + if (complex_input && complex_output) { + return CuFFTTransformType::C2C; + } else if (complex_input && !complex_output) { + return CuFFTTransformType::C2R; + } else if (!complex_input && complex_output) { + return CuFFTTransformType::R2C; + } + TORCH_INTERNAL_ASSERT(false, "Real to real FFTs are not supported"); +} + + +class CuFFTHandle { + ::cufftHandle handle_; +public: + + CuFFTHandle() { + CUFFT_CHECK(cufftCreate(&handle_)); + } + + ::cufftHandle & get() { return handle_; } + const ::cufftHandle & get() const { return handle_; } + + ~CuFFTHandle() { +// Not using fftDestroy() for rocFFT to work around double freeing of handles +#if !defined(USE_ROCM) + cufftDestroy(handle_); +#endif + } +}; + +__forceinline__ +static bool is_pow_of_two(int64_t x) { + return (x & (x - 1)) == 0; +} + +using cufft_size_type = long long int; + +using CuFFTDimVector = c10::SmallVector; + +// Struct representing a tensor in CuFFT's data layout for planning transforms +// See NOTE [ cuFFT Embedded Strides ]. +struct CuFFTDataLayout { + CuFFTDimVector embed; + cufft_size_type stride, dist; + bool must_clone, simple; +}; + +// Returns a cufft embedding for a contiguous signal of the given size. +// e.g. if the input is cloned, this will be the resulting data layout +// See NOTE [ cuFFT Embedded Strides ]. +inline CuFFTDataLayout cufft_simple_embed(IntArrayRef sizes, bool onesided) { + CuFFTDataLayout layout; + layout.simple = true; + layout.must_clone = false; + layout.embed.assign(sizes.cbegin() + 1, sizes.cend()); + if (onesided) { + layout.embed.back() = sizes.back() / 2 + 1; + } + layout.stride = 1; + layout.dist = 1; + for (const auto& len : layout.embed) { + layout.dist *= len; + } + return layout; +} + +// Convert strides to a CuFFT embedded representation. +// If strides cannot be embedded, returns a simple layout and sets must_clone flag +// See NOTE [ cuFFT Embedded Strides ]. +inline CuFFTDataLayout as_cufft_embed(IntArrayRef strides, IntArrayRef sizes, bool onesided) { + const auto signal_ndim = strides.size() - 1; + CuFFTDataLayout layout; + auto last_stride = strides[signal_ndim]; + layout.must_clone = (last_stride <= 0); + + const auto last_dim_size = onesided ? + sizes[signal_ndim] / 2 + 1 : sizes[signal_ndim]; + const auto signal_numel = c10::multiply_integers(sizes.slice(1, sizes.size() - 2)) * last_dim_size; + + // Zero stides are not allowed, even if the batch size is one. + // If that happens just set a dummy case + if (sizes[0] == 1) { + layout.dist = signal_numel; + } else if (strides[0] == 0) { + layout.must_clone = true; + } else { + layout.dist = strides[0]; + } + + // Calculate the embedding shape, or set must_clone if the strides cannot be embedded + layout.embed.resize(signal_ndim); + for (auto i = signal_ndim - 1; !layout.must_clone && i > 0; i--) { + auto stride = strides[i]; + if (sizes[i] == 1) { + layout.embed[i] = 1; + } else if (stride > 0 && stride % last_stride == 0) { + layout.embed[i] = stride / last_stride; + last_stride = stride; + } else { + layout.must_clone = true; + } + } + + if (layout.must_clone) { + // If the input needs to be cloned, assume it will be contiguous + layout = cufft_simple_embed(sizes, onesided); + layout.must_clone = true; + } else { + layout.embed[0] = sizes[1]; + layout.stride = strides[signal_ndim]; + // Determine if layout represents a simple embedding (contiguous data) + layout.simple = [&] { + for (const auto i : c10::irange(1, signal_ndim - 1)) { + if (layout.embed[i] != sizes[i + 1]) { + return false; + } + } + + return (layout.stride == 1 && layout.dist == signal_numel && + layout.embed.back() == last_dim_size); + }(); + } + return layout; +} + +// This class contains all the information needed to execute a cuFFT plan: +// 1. the plan +// 2. whether to clone input before executing the plan +// 3. the workspace size needed +// +// This class will be the **value** in the plan cache. +// It **owns** the raw plan via a unique_ptr. +class CuFFTConfig { +public: + + // Only move semantics is enought for this class. Although we already use + // unique_ptr for the plan, still remove copy constructor and assignment op so + // we don't accidentally copy and take perf hit. + CuFFTConfig(const CuFFTConfig&) = delete; + CuFFTConfig& operator=(CuFFTConfig const&) = delete; + + explicit CuFFTConfig(const CuFFTParams& params): + CuFFTConfig( + IntArrayRef(params.input_strides_, params.signal_ndim_ + 1), + IntArrayRef(params.output_strides_, params.signal_ndim_ + 1), + IntArrayRef(params.sizes_, params.signal_ndim_ + 1), + params.fft_type_, + params.value_type_) {} + + // For complex types, strides are in units of 2 * element_size(dtype) + // sizes are for the full signal, including batch size and always two-sided + CuFFTConfig(IntArrayRef in_strides, IntArrayRef out_strides, + IntArrayRef sizes, CuFFTTransformType fft_type, ScalarType dtype): + fft_type_(fft_type), value_type_(dtype) { + + // signal sizes (excluding batch dim) + CuFFTDimVector signal_sizes(sizes.begin() + 1, sizes.end()); + + // input batch size + const int64_t batch = sizes[0]; + const int64_t signal_ndim = sizes.size() - 1; + + // Since cuFFT has limited non-unit stride support and various constraints, we + // use a flag to keep track throughout this function to see if we need to + // input = input.clone(); + +#if defined(USE_ROCM) + // clone input to avoid issues with hipfft clobering the input and failing tests + clone_input = true; +#else + clone_input = false; +#endif + + // For half, base strides on the real part of real-to-complex and + // complex-to-real transforms are not supported. Since our output is always + // contiguous, only need to check real-to-complex case. + if (dtype == ScalarType::Half) { + // cuFFT on half requires compute capability of at least SM_53 + auto dev_prop = at::cuda::getCurrentDeviceProperties(); + TORCH_CHECK(dev_prop->major >= 5 && !(dev_prop->major == 5 && dev_prop->minor < 3), + "cuFFT doesn't support signals of half type with compute " + "capability less than SM_53, but the device containing input half " + "tensor only has SM_", dev_prop->major, dev_prop->minor); + for (const auto i : c10::irange(signal_ndim)) { + TORCH_CHECK(is_pow_of_two(sizes[i + 1]), + "cuFFT only supports dimensions whose sizes are powers of two when" + " computing in half precision, but got a signal size of", + sizes.slice(1)); + } + clone_input |= in_strides.back() != 1; + } + + CuFFTDataLayout in_layout; + if (clone_input) { + in_layout = cufft_simple_embed(sizes, fft_type == CuFFTTransformType::C2R); + } else { + in_layout = as_cufft_embed(in_strides, sizes, fft_type == CuFFTTransformType::C2R); + } + auto out_layout = as_cufft_embed(out_strides, sizes, fft_type == CuFFTTransformType::R2C); + TORCH_INTERNAL_ASSERT(!out_layout.must_clone, "Out strides cannot be represented as CuFFT embedding"); + clone_input |= in_layout.must_clone; + + // Check if we can take advantage of simple data layout. + // + // See NOTE [ cuFFT Embedded Strides ] in native/cuda/SpectralOps.cu. + + const bool simple_layout = in_layout.simple && out_layout.simple; + cudaDataType itype, otype, exec_type; + const auto complex_input = cufft_complex_input(fft_type); + const auto complex_output = cufft_complex_output(fft_type); + if (dtype == ScalarType::Float) { + itype = complex_input ? CUDA_C_32F : CUDA_R_32F; + otype = complex_output ? CUDA_C_32F : CUDA_R_32F; + exec_type = CUDA_C_32F; + } else if (dtype == ScalarType::Double) { + itype = complex_input ? CUDA_C_64F : CUDA_R_64F; + otype = complex_output ? CUDA_C_64F : CUDA_R_64F; + exec_type = CUDA_C_64F; + } else if (dtype == ScalarType::Half) { + itype = complex_input ? CUDA_C_16F : CUDA_R_16F; + otype = complex_output ? CUDA_C_16F : CUDA_R_16F; + exec_type = CUDA_C_16F; + } else { + TORCH_CHECK(false, "cuFFT doesn't support tensor of type: ", dtype); + } + + // disable auto allocation of workspace to use THC allocator + CUFFT_CHECK(cufftSetAutoAllocation(plan(), /* autoAllocate */ 0)); + + size_t ws_size_t; + + // make plan + if (simple_layout) { + // If with unit-stride, we tell cuFFT by setting inembed == onembed == NULL. + // In such case, cuFFT ignores istride, ostride, idist, and odist + // by assuming istride = ostride = 1. + // + // See NOTE [ cuFFT Embedded Strides ] in native/cuda/SpectralOps.cu. + CUFFT_CHECK(cufftXtMakePlanMany(plan(), signal_ndim, signal_sizes.data(), + /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, itype, + /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, otype, + batch, &ws_size_t, exec_type)); + } else { + CUFFT_CHECK(cufftXtMakePlanMany(plan(), signal_ndim, signal_sizes.data(), + in_layout.embed.data(), in_layout.stride, in_layout.dist, itype, + out_layout.embed.data(), out_layout.stride, out_layout.dist, otype, + batch, &ws_size_t, exec_type)); + } + ws_size = static_cast(ws_size_t); + } + + const cufftHandle &plan() const { return plan_ptr.get(); } + + CuFFTTransformType transform_type() const { return fft_type_; } + ScalarType data_type() const { return value_type_; } + bool should_clone_input() const { return clone_input; } + int64_t workspace_size() const { return ws_size; } + +private: + CuFFTHandle plan_ptr; + bool clone_input; + int64_t ws_size; + CuFFTTransformType fft_type_; + ScalarType value_type_; +}; + +#if defined(USE_ROCM) + // Note that the max plan number for CUDA version < 10 has to be 1023 + // due to a bug that fails on the 1024th plan + constexpr int64_t CUFFT_MAX_PLAN_NUM = 1023; + constexpr int64_t CUFFT_DEFAULT_CACHE_SIZE = CUFFT_MAX_PLAN_NUM; +#else + constexpr int64_t CUFFT_MAX_PLAN_NUM = std::numeric_limits::max(); + // The default max cache size chosen for CUDA version > 10 is arbitrary. + // This number puts a limit on how big of a plan cache should we maintain by + // default. Users can always configure it via cufft_set_plan_cache_max_size. + constexpr int64_t CUFFT_DEFAULT_CACHE_SIZE = 4096; +#endif +static_assert(0 <= CUFFT_MAX_PLAN_NUM && CUFFT_MAX_PLAN_NUM <= std::numeric_limits::max(), + "CUFFT_MAX_PLAN_NUM not in size_t range"); +static_assert(CUFFT_DEFAULT_CACHE_SIZE >= 0 && CUFFT_DEFAULT_CACHE_SIZE <= CUFFT_MAX_PLAN_NUM, + "CUFFT_DEFAULT_CACHE_SIZE not in [0, CUFFT_MAX_PLAN_NUM] range"); + +// This cache assumes that the mapping from key to value never changes. +// This is **NOT** thread-safe. Please use a mutex when using it **AND** the +// value returned from try_emplace_value. +// The contract of using this cache is that try_emplace_value should only be +// used when the max_size is positive. +class CuFFTParamsLRUCache { +public: + using kv_t = typename std::pair; + using map_t = typename std::unordered_map, + typename std::list::iterator, + ParamsHash, + ParamsEqual>; + using map_kkv_iter_t = typename map_t::iterator; + + + CuFFTParamsLRUCache() : CuFFTParamsLRUCache(CUFFT_DEFAULT_CACHE_SIZE) {} + + CuFFTParamsLRUCache(int64_t max_size) { + _set_max_size(max_size); + } + + CuFFTParamsLRUCache(CuFFTParamsLRUCache&& other) noexcept : + _usage_list(std::move(other._usage_list)), + _cache_map(std::move(other._cache_map)), + _max_size(other._max_size) {} + + CuFFTParamsLRUCache& operator=(CuFFTParamsLRUCache&& other) noexcept { + _usage_list = std::move(other._usage_list); + _cache_map = std::move(other._cache_map); + _max_size = other._max_size; + return *this; + } + + // If key is in this cache, return the cached config. Otherwise, emplace the + // config in this cache and return it. + // Return const reference because CuFFTConfig shouldn't be tampered with once + // created. + const CuFFTConfig &lookup(CuFFTParams params) { + AT_ASSERT(_max_size > 0); + + map_kkv_iter_t map_it = _cache_map.find(params); + // Hit, put to list front + if (map_it != _cache_map.end()) { + _usage_list.splice(_usage_list.begin(), _usage_list, map_it->second); + return map_it->second->second; + } + + // Miss + // remove if needed + if (_usage_list.size() >= _max_size) { + auto last = _usage_list.end(); + last--; + _cache_map.erase(last->first); + _usage_list.pop_back(); + } + + // construct new plan at list front, then insert into _cache_map + _usage_list.emplace_front(std::piecewise_construct, + std::forward_as_tuple(params), + std::forward_as_tuple(params)); + auto kv_it = _usage_list.begin(); + _cache_map.emplace(std::piecewise_construct, + std::forward_as_tuple(kv_it->first), + std::forward_as_tuple(kv_it)); + return kv_it->second; + } + + void clear() { + _cache_map.clear(); + _usage_list.clear(); + } + + void resize(int64_t new_size) { + _set_max_size(new_size); + auto cur_size = _usage_list.size(); + if (cur_size > _max_size) { + auto delete_it = _usage_list.end(); + for (size_t i = 0; i < cur_size - _max_size; i++) { + delete_it--; + _cache_map.erase(delete_it->first); + } + _usage_list.erase(delete_it, _usage_list.end()); + } + } + + size_t size() const { return _cache_map.size(); } + + size_t max_size() const noexcept { return _max_size; } + + std::mutex mutex; + +private: + // Only sets size and does value check. Does not resize the data structures. + void _set_max_size(int64_t new_size) { + // We check that 0 <= new_size <= CUFFT_MAX_PLAN_NUM here. Since + // CUFFT_MAX_PLAN_NUM is of type size_t, we need to do non-negativity check + // first. + TORCH_CHECK(new_size >= 0, + "cuFFT plan cache size must be non-negative, but got ", new_size); + TORCH_CHECK(new_size <= CUFFT_MAX_PLAN_NUM, + "cuFFT plan cache size can not be larger than ", CUFFT_MAX_PLAN_NUM, ", but got ", new_size); + _max_size = static_cast(new_size); + } + + std::list _usage_list; + map_t _cache_map; + size_t _max_size; +}; + +// Since ATen is separated into CPU build and CUDA build, we need a way to call +// these functions only when CUDA is loaded. We use CUDA hooks for this purpose +// (at cuda/detail/CUDAHooks.cpp), and call the hooked functions from the actual +// native function counterparts (at native/SpectralOps.cpp), i.e., +// _cufft_get_plan_cache_max_size, _cufft_set_plan_cache_max_size +// _cufft_get_plan_cache_size, and _cufft_clear_plan_cache. +int64_t cufft_get_plan_cache_max_size_impl(DeviceIndex device_index); +void cufft_set_plan_cache_max_size_impl(DeviceIndex device_index, int64_t max_size); +int64_t cufft_get_plan_cache_size_impl(DeviceIndex device_index); +void cufft_clear_plan_cache_impl(DeviceIndex device_index); + +}}} // namespace at::native::detail diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/EmbeddingBackwardKernel.cuh b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/EmbeddingBackwardKernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..0d8d45c1defb90af4da7d2c39d914d3d88ddafc3 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/EmbeddingBackwardKernel.cuh @@ -0,0 +1,22 @@ +#pragma once +#include +#include +#include +#include + +namespace at { +namespace native { + +Tensor embedding_backward_cuda_kernel( + const Tensor &grad, + const Tensor &orig_indices, + const Tensor &sorted_indices, + const Tensor &count, + int64_t num_weights, + int padding_idx = -1, + bool mode_mean = false, + const Tensor &offset2bag = Tensor(), + const Tensor &bag_size = Tensor(), + const Tensor &per_sample_weights = Tensor()); + +}} diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/ForeachMinMaxFunctors.cuh b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/ForeachMinMaxFunctors.cuh new file mode 100644 index 0000000000000000000000000000000000000000..9b08911b1d9500f200d55e031ffff7658d5491f0 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/ForeachMinMaxFunctors.cuh @@ -0,0 +1,22 @@ +#pragma once + +#include + +namespace at::native { + +// std:: does not have clamp functors +template +struct minimum { + __device__ T operator()(const T& a, const T& b) const { + return (_isnan(a) || a < b) ? a : b; + } +}; + +template +struct maximum { + __device__ T operator()(const T& a, const T& b) const { + return (_isnan(a) || a > b) ? a : b; + } +}; + +} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/GridSampler.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/GridSampler.h new file mode 100644 index 0000000000000000000000000000000000000000..aace9c30b0a7e9d08de71c4baf1490d45ff6d36e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/GridSampler.h @@ -0,0 +1,32 @@ +#pragma once +#include +#include + +namespace at { +class TensorBase; +} + +namespace at { +namespace native { + +void launch_grid_sampler_2d_forward_kernel( + const TensorBase &output, const TensorBase &input, const TensorBase &grid, + int64_t interpolation_mode, int64_t padding_mode, bool align_corners); + +void launch_grid_sampler_3d_forward_kernel( + const TensorBase &output, const TensorBase &input, const TensorBase &grid, + int64_t interpolation_mode, int64_t padding_mode, bool align_corners); + +void launch_grid_sampler_2d_backward_kernel( + const TensorBase &grad_input, const TensorBase &grad_grid, + const TensorBase &grad_output, const TensorBase &input, + const TensorBase &grid, int64_t interpolation_mode, int64_t padding_mode, + bool align_corners, std::array output_mask); + +void launch_grid_sampler_3d_backward_kernel( + const TensorBase &grad_input, const TensorBase &grad_grid, + const TensorBase &grad_output, const TensorBase &input, + const TensorBase &grid, int64_t interpolation_mode, int64_t padding_mode, + bool align_corners, std::array output_mask); + +}} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/JitLoops.cuh b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/JitLoops.cuh new file mode 100644 index 0000000000000000000000000000000000000000..6f350c550ce93667bdf21b8b0c7d9798fdf5f15f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/JitLoops.cuh @@ -0,0 +1,187 @@ +#pragma once + +#include + +#if AT_USE_JITERATOR() + +#include + +#include +#include +#include + +#include + +#include + +namespace at { +namespace native { + +/* Note [Jiterator] +The "jiterator" simply just-in-time compiles the same kernels that +Loops.cuh (and CUDALoops.cuh) usually build. This reduces build time, +build size, and initial CUDA context size. + +By default on non-Windows systems, it also caches compiled kernels in ~/.cache/torch/kernels. +This behavior is controlled with two environment variables: + - USE_PYTORCH_KERNEL_CACHE, if set to zero then this will disable all cache use + - PYTORCH_KERNEL_CACHE_PATH, if set specifies the folder to use for cached kernels + +The jiterator currently has some limitations, however. It cannot: + - handle math on complex datatypes + - handle kernels with scalar parameters + +These improvements will likely come soon. + +For examples of how to use the jiterator see the i1 and gcd kernel +implementations, which pass jittable strings implementing their +operations instead of the typical CUDA functors. + +To pass a runtime argument (similar to lambda captures in non-JIT kernels), +we need to pass to additional arguments to `jitted_gpu_kernel` by value. +Currently only primitive C++ types used for computation are valid. +The order of these extra arguments should be same as the order they appear +in kernel's function signature. (look at polygamma for example) + +NOTE: One big restriction being that these arguments should be after the +arguments provided by TensorIterator. Eg. While capturing `n`, where +`scalar_t x` and `scalar_t y` are provided by TensorIterator, +* foo(scalar_t x, scalar_t y, int n) works! +* foo(int n, scalar_t x, scalar_y) doesn't work +* foo(scalar_t x, int n, scalar_y) doesn't work + +*/ + +// Entrypoint for jitted GPU kernels. +// Only handles elementwise unary and binary kernels with a +// common dtype and a single output. +// NOTE: this assumes the op's iterator has a common_dtype. +// NOTE: We use std::tuple instead of parameter pack +// for `extra_args` due to following +// bug on older versions of clang +// https://bugs.llvm.org/show_bug.cgi?id=23029 +template < + char const* name, + typename return_type, + typename f_inputs_type, + int arity, + typename... Args> +void jitted_gpu_kernel( + TensorIteratorBase& iter, + const std::string& f, + at::cuda::jit::BinaryFuncVariant scalar_pos = + at::cuda::jit::BinaryFuncVariant::NoScalar, + at::opmath_type scalar_val = 0, + std::tuple extra_args = std::make_tuple()) { + // TODO: much of preamble is common to both jitted_gpu_kernel and gpu_kernel + // Maybe it could be refactored? + for (int arg = 0; arg < iter.ntensors(); arg++) { + TORCH_INTERNAL_ASSERT( + iter.device(arg).is_cuda(), + "argument ", arg, ": expected a CUDA device but found ", iter.device(arg)); + } + + if (iter.numel() == 0) { + return; + } + + if (!iter.can_use_32bit_indexing()) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + jitted_gpu_kernel( + sub_iter, f, scalar_pos, scalar_val, extra_args); + } + + return; + } + + // Computes if dynamic casting is needed + // Dynamic casting is needed if an input's dtype differs from the common dtype + // or if the result dtype differs from the output's dtype + // Note: this is intentionally divergent from calling needs_dynamic_casting, + // which is more general and inspects a lambda to determine if dynamic + // casting is needed. + bool needs_dynamic_casting = false; + + // Checks output + const ScalarType return_scalar_type = c10::CppTypeToScalarType::value; + const auto dtype0 = iter.dtype(0); + if (dtype0 != return_scalar_type) { + needs_dynamic_casting = true; + } + + // Checks input(s) + const ScalarType inputs_scalar_type = c10::CppTypeToScalarType::value; + for (auto i = decltype(arity){1}; i < (arity + 1); ++i) { + const auto dtypei = iter.dtype(i); + if (dtypei != inputs_scalar_type) { + needs_dynamic_casting = true; + break; + } + } + if (scalar_pos == at::cuda::jit::BinaryFuncVariant::NoScalar) { + // NOTE: With `scalar_pos=NoScalar`,`scalar_val` is not used + // for computation in the generated code and hence we pass a dummy + // value of `0`. + jitted_gpu_kernel_impl< + /*name*/ name, + /*return_type=*/return_type, + /*f_inputs_type=*/f_inputs_type, + arity, + at::cuda::jit::BinaryFuncVariant::NoScalar>( + iter, f, needs_dynamic_casting, /*scalar_val=*/scalar_val, extra_args); + } else if (scalar_pos == at::cuda::jit::BinaryFuncVariant::RhsScalar) { + jitted_gpu_kernel_impl< + /*name*/ name, + /*return_type=*/return_type, + /*f_inputs_type=*/f_inputs_type, + arity, + at::cuda::jit::BinaryFuncVariant::RhsScalar>( + iter, + f, + needs_dynamic_casting, + scalar_val, + extra_args); + + } else { + jitted_gpu_kernel_impl< + /*name*/ name, + /*return_type=*/return_type, + /*f_inputs_type=*/f_inputs_type, + arity, + at::cuda::jit::BinaryFuncVariant::LhsScalar>( + iter, + f, + needs_dynamic_casting, + scalar_val, + extra_args); + } +} + +// TODO: support runtime state capture similar to `jitted_gpu_kernel`. +template +void opmath_jitted_gpu_kernel_with_scalars(TensorIteratorBase& iter, const std::string& f) { + TORCH_INTERNAL_ASSERT(iter.ntensors() == 3); + //currently jiterator only handles binary functions where both inputs are of the same type (f_inputs_type) + using opmath_t = at::opmath_type; + if (iter.is_cpu_scalar(1)) { + auto scalar_val = iter.scalar_value(1); + iter.remove_operand(1); + // TODO: When all kernels that use gpu_kernel_with_scalars are + // ported to structured, this device guard can be deleted. This + // works around incorrect device guard generation for pre-structured + // kernels device guards, but structured kernels do it right and + // we can assume the device is already set correctly + const OptionalDeviceGuard device_guard(iter.device(1)); + jitted_gpu_kernel(iter, f, at::cuda::jit::BinaryFuncVariant::LhsScalar, scalar_val); + } else if (iter.is_cpu_scalar(2)) { + auto scalar_val = iter.scalar_value(2); + iter.remove_operand(2); + jitted_gpu_kernel(iter, f, at::cuda::jit::BinaryFuncVariant::RhsScalar, scalar_val); + } else { + jitted_gpu_kernel(iter, f); + } +} + +}} // at::native + +#endif // AT_USE_JITERATOR() diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Loops.cuh b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Loops.cuh new file mode 100644 index 0000000000000000000000000000000000000000..cb14f275e21718db44bc5f175ce6e650426966ff --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Loops.cuh @@ -0,0 +1,326 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include + +#include + + +namespace at { namespace native { + +template +static OffsetCalculator make_input_offset_calculator(const TensorIteratorBase& iter) { + // array size can not be 0, this happens when N == 0 + constexpr int array_size = std::max(N, 1); + TORCH_INTERNAL_ASSERT(N == iter.ntensors() - iter.noutputs()); + std::array strides; + int64_t element_sizes[array_size]; + for (int i = 0; i < N; i++) { + strides[i] = iter.strides(i + iter.noutputs()).data(); + element_sizes[i] = iter.element_size(i + iter.noutputs()); + } + return OffsetCalculator(iter.ndim(), iter.shape().data(), strides.data(), element_sizes); +} + +template +static OffsetCalculator make_output_offset_calculator(const TensorIteratorBase& iter) { + TORCH_INTERNAL_ASSERT(num_outputs == iter.noutputs()); + std::array strides; + int64_t element_sizes[num_outputs]; + for (int i = 0; i < num_outputs; i++) { + strides[i] = iter.strides(i).data(); + element_sizes[i] = iter.element_size(i); + } + return OffsetCalculator(iter.ndim(), iter.shape().data(), strides.data(), element_sizes); +} + +template +__device__ inline void elementwise_kernel_helper(func_t f, policy_t policy) { + using traits = function_traits; + using return_t = typename traits::result_type; + using args_t = typename traits::ArgsTuple; + + int idx = blockIdx.x; + + return_t results[thread_work_size()]; + args_t args[thread_work_size()]; + + // load + policy.load(args, idx); + + // compute + #pragma unroll + for (int i = 0; i < thread_work_size(); i++) { + if (policy.check_inbounds(i)) { + results[i] = c10::guts::apply(f, args[i]); + } + } + + // store + policy.store(results, idx); +} + +}} // namespace at::native + +#include + +namespace at:: native { + +template +void gpu_kernel_nocast(TensorIteratorBase& iter, const func_t& f) { + + for (int arg = 0; arg < iter.ntensors(); arg++) { + TORCH_INTERNAL_ASSERT( + iter.device(arg).is_cuda(), + "argument ", arg, ": expected a CUDA device but found ", iter.device(arg)); + } + + if (iter.numel() == 0) { + return; + } + + if (!iter.can_use_32bit_indexing()) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + gpu_kernel_nocast(sub_iter, f); + } + return; + } + + gpu_kernel_impl_nocast(iter, f); +} + +template +void gpu_kernel(TensorIteratorBase& iter, const func_t& f) { + + for (int arg = 0; arg < iter.ntensors(); arg++) { + TORCH_INTERNAL_ASSERT( + iter.device(arg).is_cuda(), + "argument ", arg, ": expected a CUDA device but found ", iter.device(arg)); + } + + if (iter.numel() == 0) { + return; + } + + if (!iter.can_use_32bit_indexing()) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + gpu_kernel(sub_iter, f); + } + return; + } + + gpu_kernel_impl(iter, f); +} + +template +struct AUnaryFunctor { + using traits = function_traits; + using opmath_arg1_t = typename traits::template arg<0>::type; + __device__ return_t operator()(arg2_t b) const { + return f(a, b); + } + // NB: scalar is stored in higher precision! + AUnaryFunctor(func_t f_, opmath_arg1_t a_): f(f_), a(a_) {} + private: + func_t f; + opmath_arg1_t a; +}; + +template +struct BUnaryFunctor { + using traits = function_traits; + using opmath_arg2_t = typename traits::template arg<1>::type; + __device__ return_t operator()(arg1_t a) const { + return f(a, b); + } + // NB: scalar is stored in higher precision! + BUnaryFunctor(func_t f_, opmath_arg2_t b_): f(f_), b(b_) {} + private: + func_t f; + opmath_arg2_t b; +}; + +// Though seemingly noop, this inserts casts from arg1_t to func_t's type +// (which may be higher precision), as well as casts to return_t +template +struct BinaryFunctor { + __device__ return_t operator()(arg1_t a, arg2_t b) const { + return f(a, b); + } + BinaryFunctor(func_t f_): f(f_) {} + private: + func_t f; +}; + +// Unlike gpu_kernel_with_scalars, this allows you to pass a func_t which +// accepts inputs at higher precision (typically opmath_t), but then +// ensure that we load from memory at the correct precision (scalar_t) +// to avoid expensive loads. For the whole sordid story see +// https://dev-discuss.pytorch.org/t/cuda-loops-case-study-code-generation-vs-templates/302 +template +void opmath_gpu_kernel_with_scalars(TensorIteratorBase& iter, const func_t& f) { + TORCH_INTERNAL_ASSERT(iter.ntensors() == 3); + + using traits = function_traits; + using opmath_arg1_t = typename traits::template arg<0>::type; + using opmath_arg2_t = typename traits::template arg<1>::type; + static_assert( + traits::arity == 2, + "gpu_kernel_with_scalars only supports two input arguments"); + + if (iter.is_cpu_scalar(1)) { + AUnaryFunctor af(f, iter.scalar_value(1)); + iter.remove_operand(1); + // TODO: When all kernels that use gpu_kernel_with_scalars are + // ported to structured, this device guard can be deleted. This + // works around incorrect device guard generation for pre-structured + // kernels device guards, but structured kernels do it right and + // we can assume the device is already set correctly + const OptionalDeviceGuard device_guard(iter.device(1)); + gpu_kernel(iter, af); + } else if (iter.is_cpu_scalar(2)) { + BUnaryFunctor bf(f, iter.scalar_value(2)); + iter.remove_operand(2); + gpu_kernel(iter, bf); + } else { + gpu_kernel(iter, BinaryFunctor(f)); + } +} + +template +void opmath_symmetric_gpu_kernel_with_scalars(TensorIteratorBase& iter, const func_t& f) { + // Use symmetric property of the functor to reduce number of kernels, + // requires f(a, b) == f(b, a) + TORCH_INTERNAL_ASSERT(iter.ntensors() == 3); + + using traits = function_traits; + using opmath_arg_t = typename traits::template arg<0>::type; + static_assert( + traits::arity == 2, + "gpu_kernel_with_scalars only supports two input arguments"); + static_assert(std::is_same::type>::value, + "f is not symmetric"); + + OptionalDeviceGuard device_guard; + opmath_arg_t scalar_val{}; + + if (iter.is_cpu_scalar(1)) { + scalar_val = iter.scalar_value(1); + iter.remove_operand(1); + + // TODO: When all kernels that use gpu_kernel_with_scalars are + // ported to structured, this device guard can be deleted. This + // works around incorrect device guard generation for pre-structured + // kernels device guards, but structured kernels do it right and + // we can assume the device is already set correctly + device_guard.reset_device(iter.device(1)); + } else if (iter.is_cpu_scalar(2)) { + scalar_val = iter.scalar_value(2); + iter.remove_operand(2); + } + + if (iter.ninputs() == 2) { + gpu_kernel(iter, BinaryFunctor(f)); + } else { + AUnaryFunctor unary_f(f, scalar_val); + gpu_kernel(iter, unary_f); + } +} + +// Legacy variant that assumes that func_t has the correct types +// that we expect to load from memory +template +void gpu_kernel_with_scalars(TensorIteratorBase& iter, const func_t& f) { + using traits = function_traits; + static_assert( + traits::arity == 2, + "gpu_kernel_with_scalars only supports two input arguments"); + using arg1_t = typename traits::template arg<0>::type; + using arg2_t = typename traits::template arg<1>::type; + using return_t = typename traits::result_type; + opmath_gpu_kernel_with_scalars(iter, f); +} + +namespace { // functions for `gpu_kernel_multiple_outputs`. + +// check the return type is `thrust::tuple`, not `std::tuple`. +template struct is_tuple: std::false_type {}; + +template struct is_tuple>: std::true_type {}; + +template +C10_LAUNCH_BOUNDS_1(num_threads()) +__global__ void unrolled_elementwise_kernel_for_multi_outputs(int N, func_t f, array_t data, inp_calc_t ic, out_calc_t oc) { + int remaining = N - block_work_size() * blockIdx.x; + elementwise_kernel_helper(f, memory::policies::multi_outputs_unroll(data, remaining, ic, oc)); +} + +template +static inline void launch_unrolled_kernel_for_multi_outputs(int64_t N, const func_t& f, array_t data, inp_calc_t ic, out_calc_t oc) { + TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits::max()); + int64_t grid = (N + block_work_size() - 1) / block_work_size(); + auto stream = at::cuda::getCurrentCUDAStream(); + unrolled_elementwise_kernel_for_multi_outputs<<>>(N, f, data, ic, oc); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +template +void gpu_kernel_multiple_outputs_impl(TensorIteratorBase& iter, const func_t& f) { + using traits = function_traits; + using output_t = typename traits::result_type; + static_assert(is_tuple::value, "f's return type must be `thrust::tuple`"); + constexpr int num_outputs = thrust::tuple_size::value; + constexpr int num_inputs = traits::arity; + constexpr int ntensors = num_outputs + num_inputs; + + TORCH_INTERNAL_ASSERT(iter.can_use_32bit_indexing()); + TORCH_INTERNAL_ASSERT(iter.ntensors() == ntensors); + + at::detail::Array data; + for (int i = 0; i < ntensors; i++) { + data[i] = (char*)iter.data_ptr(i); + } + + int64_t numel = iter.numel(); + + if (iter.is_contiguous()) { + auto input_calc = TrivialOffsetCalculator(); + auto output_calc = TrivialOffsetCalculator(); + launch_unrolled_kernel_for_multi_outputs(numel, f, data, input_calc, output_calc); + } else { + auto input_calc = make_input_offset_calculator(iter); + auto output_calc = make_output_offset_calculator(iter); + launch_unrolled_kernel_for_multi_outputs(numel, f, data, input_calc, output_calc); + } +} +} // namespace + +template +void gpu_kernel_multiple_outputs(TensorIteratorBase& iter, const func_t& f) { + ASSERT_HOST_DEVICE_LAMBDA(func_t); + + for (int arg = 0; arg < iter.ntensors(); arg++) { + TORCH_INTERNAL_ASSERT(iter.device(arg).is_cuda()); + } + + if (iter.numel() == 0) { + return; + } + + if (!iter.can_use_32bit_indexing()) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + gpu_kernel_multiple_outputs(sub_iter, f); + } + return; + } + + gpu_kernel_multiple_outputs_impl(iter, f); +} + +} //namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Math.cuh b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Math.cuh new file mode 100644 index 0000000000000000000000000000000000000000..5188801f12c60b67d2da3486d02ced455a17231c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Math.cuh @@ -0,0 +1,3375 @@ +#pragma once + +#include +#include +#include +#include + +namespace at { +namespace native { +// See note [Jiterator] +// TODO: elaborate in this comment on the structure of math.cuh +#if AT_USE_JITERATOR() + +const auto ndtri_string = jiterator_stringify( + /* + * This function is derived from the implementation of the digamma function in the Cephes Math Library. + * See note [3-Clause BSD License for the Cephes Math Library]. + * + * Evaluates polynomial of degree N: + * + * 2 N + * y = C + C x + C x +...+ C x + * 0 1 2 N + * + * Coefficients are stored in reverse order: + * + * coef[0] = C , ..., coef[N] = C . + * N 0 + */ + template + T polevl(const T x, const T A[], const int len) { + // NOTE: This `polevl` is different from other `polevl` + // implementation (in PyTorch) which expect the `len` to be + // `len(A) - 1` instead of `len(A)`. + T result = 0; + for (int i = 0; i < len; ++i) { + result = result * x + A[i]; + } + return result; + } + + /* + * This function is derived from the implementation of the i1e function in the Cephes Math Library. + * See note [3-Clause BSD License for the Cephes Math Library]. + * + * Computes the argument, x, for which the area under the Gaussian probability density function + * (integrated from minus infinity to x) is equal to y. + */ + template + T ndtri(T y0) { + + constexpr T zero = 0; + constexpr T one = 1; + + // Handles special cases + if (y0 == zero) { + return NEG_INFINITY; + } + if (y0 == one) { + return POS_INFINITY; + } + if (y0 < zero || y0 > one) { + return NAN; + } + + bool code = true; + T y = y0; + // Note: the constant 0.135... is equal to exp(-2) + if (y > one - T{0.13533528323661269189}) { + y = one - y; + code = false; + } + + if (y > T{0.13533528323661269189}) { + /* approximation for 0 <= |y - 0.5| <= 3/8 */ + static const T P0[5] = { + -5.99633501014107895267E1, + 9.80010754185999661536E1, + -5.66762857469070293439E1, + 1.39312609387279679503E1, + -1.23916583867381258016E0, + }; + + static const T Q0[9] = { + 1.00000000000000000000E0, + 1.95448858338141759834E0, + 4.67627912898881538453E0, + 8.63602421390890590575E1, + -2.25462687854119370527E2, + 2.00260212380060660359E2, + -8.20372256168333339912E1, + 1.59056225126211695515E1, + -1.18331621121330003142E0, + }; + + /* sqrt(2pi) */ + constexpr T s2pi = 2.50662827463100050242E0; + + y = y - T{0.5}; + const T y2 = y * y; + T x = y + y * (y2 * polevl(y2, P0, int{5}) / polevl(y2, Q0, int{9})); + return x * s2pi; + } + + T x = sqrt(T{-2.} * log(y)); + const T x0 = x - (log(x) / x); + + const T z = one / x; + T x1; + + /* y > exp(-32) = 1.2664165549e-14 */ + if (x < T{8.0}) { + /* Approximation for interval z = sqrt(-2 log y ) between 2 and 8 + * i.e., y between exp(-2) = .135 and exp(-32) = 1.27e-14. + */ + static const T P1[9] = { + 4.05544892305962419923E0, + 3.15251094599893866154E1, + 5.71628192246421288162E1, + 4.40805073893200834700E1, + 1.46849561928858024014E1, + 2.18663306850790267539E0, + -1.40256079171354495875E-1, + -3.50424626827848203418E-2, + -8.57456785154685413611E-4, + }; + + static const T Q1[9] = { + 1.00000000000000000000E0, + 1.57799883256466749731E1, + 4.53907635128879210584E1, + 4.13172038254672030440E1, + 1.50425385692907503408E1, + 2.50464946208309415979E0, + -1.42182922854787788574E-1, + -3.80806407691578277194E-2, + -9.33259480895457427372E-4, + }; + + x1 = z * polevl(z, P1, int{9}) / polevl(z, Q1, int{9}); + } else { + /* Approximation for interval z = sqrt(-2 log y ) between 8 and 64 + * i.e., y between exp(-32) = 1.27e-14 and exp(-2048) = 3.67e-890. + */ + static const T P2[9] = { + 3.23774891776946035970E0, + 6.91522889068984211695E0, + 3.93881025292474443415E0, + 1.33303460815807542389E0, + 2.01485389549179081538E-1, + 1.23716634817820021358E-2, + 3.01581553508235416007E-4, + 2.65806974686737550832E-6, + 6.23974539184983293730E-9, + }; + + static const T Q2[9] = { + 1.00000000000000000000E0, + 6.02427039364742014255E0, + 3.67983563856160859403E0, + 1.37702099489081330271E0, + 2.16236993594496635890E-1, + 1.34204006088543189037E-2, + 3.28014464682127739104E-4, + 2.89247864745380683936E-6, + 6.79019408009981274425E-9, + }; + + x1 = z * polevl(z, P2, int{9}) / polevl(z, Q2, int{9}); + } + + x = x0 - x1; + return (!code) ? x : -x; + } +); // ndtri_string + +const auto log_ndtr_string = jiterator_stringify( + template + T log_ndtr(T x) { + constexpr T SQRT1_2{0.707106781186547524400844362104849039}; // 1/sqrt(2) + T t = x * SQRT1_2; + if (x < T{-1.0}) { + return log(erfcx(-t) / 2) - t * t; + } else { + return log1p(-erfc(t) / 2); + } + } +); // log_ndtr_string + +const auto gcd_string = jiterator_stringify( + template + T gcd(const T a_in, const T b_in) { + T a = abs(a_in); + T b = abs(b_in); + + while (a != T{0}) { + T c = a; + a = b % a; + b = c; + } + + return b; + } +); // gcd_string + +const auto lcm_string = jiterator_stringify( + template + T gcd(const T a_in, const T b_in) { + T a = abs(a_in); + T b = abs(b_in); + + while (a != T{0}) { + T c = a; + a = b % a; + b = c; + } + + return b; + } + + template + T lcm(const T a, const T b) { + T g = gcd(a, b); + return (g == T{0}) ? T{0} : abs(a / g * b); + } +); // lcm_string + +/* + * For licensing information, please refer to the cpu implementation located in "ATen/native/Math.h". + */ +// [C++ Standard Reference: Gamma Function] https://en.cppreference.com/w/cpp/numeric/math/tgamma +const auto digamma_string = jiterator_stringify( + template + T digamma(T x) { + static const double PI_f64 = 3.14159265358979323846; + + // Short-circuits if x is +/- 0 and returns -/+ ∞ per the C++ standard + if (x == 0) { + return copysign(POS_INFINITY, -x); + } + + T result = 0; + if (x < 0) { + // Short-circuits if x is a negative integer and returns NaN + // per the C++ standard + const bool x_is_integer = (x == trunc(x)); + if (x_is_integer) { + return NAN; + } + + // Extracts the fractional part of x as r, since tan(pi * r) is more numerically + // accurate than tan(pi * x). While these operations are mathematically equivalent + // since both x and r are in radians and tan() has a periodicity of pi, in practice + // the computation of pi * x is a source of error (when |x| > 1). + double q, r; + r = modf(static_cast(x), &q); + result = - PI_f64 / tan(PI_f64 * r); + x = 1 - x; + } + + while (x < T{10}) { + result -= T{1} / x; + x += T{1}; + } + + if (x == T{10}) { + return result + T{2.25175258906672110764}; + } + + T y = 0; + if (x < T{1.0e17}) { + const T A[] = { + 8.33333333333333333333E-2, + -2.10927960927960927961E-2, + 7.57575757575757575758E-3, + -4.16666666666666666667E-3, + 3.96825396825396825397E-3, + -8.33333333333333333333E-3, + 8.33333333333333333333E-2, + }; + + + T z = T{1} / (x * x); + + T polevl_result = 0; + for (int i = 0; i <= 6; i++) { + polevl_result = polevl_result * z + A[i]; + } + y = z * polevl_result; + } + + return log(x) - (T{0.5} / x) - y + result; + } +); // digamma_string + +/* + * This function is derived from the implementation of the zeta function in the Cephes Math Library. + * See note [3-Clause BSD License for the Cephes Math Library]. + */ +const auto zeta_string = jiterator_stringify( + template + T zeta(T x, T q) { + const T MACHEP{1.11022302462515654042E-16}; + constexpr T zero{0}; + constexpr T half{0.5}; + constexpr T one{1}; + static const T A[] = { + 12.0, + -720.0, + 30240.0, + -1209600.0, + 47900160.0, + -1.8924375803183791606e9, /*1.307674368e12/691*/ + 7.47242496e10, + -2.950130727918164224e12, /*1.067062284288e16/3617*/ + 1.1646782814350067249e14, /*5.109094217170944e18/43867*/ + -4.5979787224074726105e15, /*8.028576626982912e20/174611*/ + 1.8152105401943546773e17, /*1.5511210043330985984e23/854513*/ + -7.1661652561756670113e18 /*1.6938241367317436694528e27/236364091*/ + }; + + int i = 0; + T a, b, k, s, t, w; + + // Short-circuits x -> +infty + if (x == one) { + return POS_INFINITY; + } + + // Short-circuits x < 1 -> NaN + if (x < one) { + return NAN; + } + + // Short-circuits negative q integers map to +infty, + // negative q non-integers map to NaN + if (q <= zero) { + if (q == floor(q)) { + return POS_INFINITY; + } + if (x != floor(x)) { + return NAN; + } + } + + s = pow(q, -x); + a = q; + i = 0; + b = zero; + while ((i < 9) || (a <= T{9.0})) { + i += 1; + a += one; + b = pow(a, -x); + s += b; + if ((-MACHEP * s < b) && (b < MACHEP * s)) { + return s; + } + }; + + w = a; + s += b * w / (x - one); + s -= half * b; + a = one; + k = zero; + for (int i = 0; i < 12; i++) { + a *= x + k; + b /= w; + t = a * b / A[i]; + s = s + t; + t = fabs(t / s); + + if (t < MACHEP) { + return s; + } + + k += one; + a *= x + k; + b /= w; + k += one; + } + + return s; + } +); // zeta_string + +const auto trigamma_string = jiterator_stringify( + template + T trigamma(T x) { + const T PI{3.14159265358979323846}; + T sign = 1; + T result = 0; + + if (x < T{0.5}) { + sign = -1; + T sin_pi_x = sin(PI * x); + result -= (PI * PI) / (sin_pi_x * sin_pi_x); + x = 1 - x; + } + + for (int i = 0; i < 6; ++i) { + result += T{1} / (x * x); + x += 1; + } + + const T one{1}; + const T ixx = one / (x*x); + result += (one + one / (T{2}*x) + ixx * (one/T{6} - ixx * (one/T{30} - ixx * (one/T{42})))) / x; + return sign * result; +} +); // trigamma_string + +const auto lgamma_string = jiterator_stringify( + template + T lgamma_kernel(T a) { + return lgamma(a); + } +); // lgamma_string + +const auto polygamma_string = zeta_string + jiterator_stringify( + template + T polygamma(T x, int n) { + // already blocked if n <= 1 + const auto one = T{1}; + return ((n % 2) ? one : -one) * exp(lgamma(static_cast(n) + one)) * + zeta(static_cast(n + 1), x); + } +); // polygamma_string + +const auto exp2_string = jiterator_stringify( + template + T exp2_impl(T a) { + return exp2(a); + } + + namespace std { template class complex; } + template + std::complex exp2_impl(std::complex x) { + // There is no std::exp2 overload for complex, so instead + // use the identity 2^x = e^(ln(2) * x) + const auto ln_2 = static_cast(0.693147180559945309417232121458176); + return exp(ln_2 * x); + } + + template + T exp2_kernel(T a) { + return exp2_impl(a); + } +); // exp2_string + +const auto erfc_string = jiterator_stringify( + template + T erfc_kernel(T a) { + return erfc(a); + } +); // erfc_string + +const auto erfinv_string = jiterator_stringify( + template + T erfinv_kernel(T a) { + return erfinv(a); + } +); // erfinv_string + +const auto entr_string = jiterator_stringify( + template + T entr(T a) { + if (a != a) { + return a; + } + + if (a > 0) { + return -a * log(a); + } + + if (a == 0) { + return 0; + } + + return NEG_INFINITY; + } +); // entr_string + +// NOTE: `kaiser_window_string` depends on `i0_string` +// for its implementation. +const auto i0_string = jiterator_stringify( + template + T chbevl(T x, const T array[], const int len) { + + T b0, b1, b2; + + b0 = array[0]; + b1 = 0; + + for (int i = 1; i < len; ++i) { + b2 = b1; + b1 = b0; + b0 = x * b1 - b2 + array[i]; + } + + return T{0.5} * (b0 - b2); + } + + template + T i0(T _x) { + T x = fabs(_x); + + if (x <= T{8.0}) { + /* Chebyshev coefficients for exp(-x) I0(x) + * in the interval [0,8]. + * + * lim(x->0){ exp(-x) I0(x) } = 1. + */ + static const T A[] = { + -4.41534164647933937950E-18, 3.33079451882223809783E-17, + -2.43127984654795469359E-16, 1.71539128555513303061E-15, + -1.16853328779934516808E-14, 7.67618549860493561688E-14, + -4.85644678311192946090E-13, 2.95505266312963983461E-12, + -1.72682629144155570723E-11, 9.67580903537323691224E-11, + -5.18979560163526290666E-10, 2.65982372468238665035E-9, + -1.30002500998624804212E-8, 6.04699502254191894932E-8, + -2.67079385394061173391E-7, 1.11738753912010371815E-6, + -4.41673835845875056359E-6, 1.64484480707288970893E-5, + -5.75419501008210370398E-5, 1.88502885095841655729E-4, + -5.76375574538582365885E-4, 1.63947561694133579842E-3, + -4.32430999505057594430E-3, 1.05464603945949983183E-2, + -2.37374148058994688156E-2, 4.93052842396707084878E-2, + -9.49010970480476444210E-2, 1.71620901522208775349E-1, + -3.04682672343198398683E-1, 6.76795274409476084995E-1}; + + T y = (x / T{2.0}) - T{2.0}; + return exp(x) * chbevl(y, A, int{30}); + } + + // Handles x > 8 case + /* Chebyshev coefficients for exp(-x) sqrt(x) I0(x) + * in the inverted interval [8,infinity]. + * + * lim(x->inf){ exp(-x) sqrt(x) I0(x) } = 1/sqrt(2pi). + */ + const T B[] = { + -7.23318048787475395456E-18, -4.83050448594418207126E-18, + 4.46562142029675999901E-17, 3.46122286769746109310E-17, + -2.82762398051658348494E-16, -3.42548561967721913462E-16, + 1.77256013305652638360E-15, 3.81168066935262242075E-15, + -9.55484669882830764870E-15, -4.15056934728722208663E-14, + 1.54008621752140982691E-14, 3.85277838274214270114E-13, + 7.18012445138366623367E-13, -1.79417853150680611778E-12, + -1.32158118404477131188E-11, -3.14991652796324136454E-11, + 1.18891471078464383424E-11, 4.94060238822496958910E-10, + 3.39623202570838634515E-9, 2.26666899049817806459E-8, + 2.04891858946906374183E-7, 2.89137052083475648297E-6, + 6.88975834691682398426E-5, 3.36911647825569408990E-3, + 8.04490411014108831608E-1}; + + return (exp(x) * chbevl(T{32.0} / x - T{2.0}, B, int{25})) / sqrt(x); + } +); // i0_string + +const auto i1_string = jiterator_stringify( + template + T chbevl(const T x, const T array[], const int len) { + T b0, b1, b2; + + b0 = array[0]; + b1 = 0; + + for (int i = 1; i < len; ++i) { + b2 = b1; + b1 = b0; + b0 = x * b1 - b2 + array[i]; + } + + return T{0.5} * (b0 - b2); + } + + template + T i1(T _x) { + const T x = fabs(_x); + + if (x <= T{8.0}) { + // Chebyshev coefficients for exp(-x) i1(x) in the internal [0, 8] + // lim(x->0){ exp(-x) i1(x) / x } = 1/2 + static const T coefficients[] = { + 2.77791411276104639959E-18, -2.11142121435816608115E-17, + 1.55363195773620046921E-16, -1.10559694773538630805E-15, + 7.60068429473540693410E-15, -5.04218550472791168711E-14, + 3.22379336594557470981E-13, -1.98397439776494371520E-12, + 1.17361862988909016308E-11, -6.66348972350202774223E-11, + 3.62559028155211703701E-10, -1.88724975172282928790E-9, + 9.38153738649577178388E-9, -4.44505912879632808065E-8, + 2.00329475355213526229E-7, -8.56872026469545474066E-7, + 3.47025130813767847674E-6, -1.32731636560394358279E-5, + 4.78156510755005422638E-5, -1.61760815825896745588E-4, + 5.12285956168575772895E-4, -1.51357245063125314899E-3, + 4.15642294431288815669E-3, -1.05640848946261981558E-2, + 2.47264490306265168283E-2, -5.29459812080949914269E-2, + 1.02643658689847095384E-1, -1.76416518357834055153E-1, + 2.52587186443633654823E-1}; + const T y = x / T{2.0} - T{2.0}; + const T out = exp(x) * x * chbevl(y, coefficients, int{29}); + return (_x < T{0.0}) ? -out : out; + } + + // Chebyshev coefficients for exp(-x) sqrt(x) i1(x) + // in the inverted interval [8, infinity] + // lim(x->inf){ exp(-x) sqrt(x) i1(x) } = 1/sqrt(2pi) + static const T coefficients[] = { + 7.51729631084210481353E-18, 4.41434832307170791151E-18, + -4.65030536848935832153E-17, -3.20952592199342395980E-17, + 2.96262899764595013876E-16, 3.30820231092092828324E-16, + -1.88035477551078244854E-15, -3.81440307243700780478E-15, + 1.04202769841288027642E-14, 4.27244001671195135429E-14, + -2.10154184277266431302E-14, -4.08355111109219731823E-13, + -7.19855177624590851209E-13, 2.03562854414708950722E-12, + 1.41258074366137813316E-11, 3.25260358301548823856E-11, + -1.89749581235054123450E-11, -5.58974346219658380687E-10, + -3.83538038596423702205E-9, -2.63146884688951950684E-8, + -2.51223623787020892529E-7, -3.88256480887769039346E-6, + -1.10588938762623716291E-4, -9.76109749136146840777E-3, + 7.78576235018280120474E-1}; + const T out = (exp(x) * chbevl(T{32.} / x - T{2.}, coefficients, int{25})) / sqrt(x); + return (_x < T{0.}) ? -out : out; + } +); // i1_string + +const auto i1e_string = jiterator_stringify( + template + T chbevl(const T x, const T array[], const int len) { + T b0, b1, b2; + + b0 = array[0]; + b1 = 0; + + for (int i = 1; i < len; ++i) { + b2 = b1; + b1 = b0; + b0 = x * b1 - b2 + array[i]; + } + + return T{0.5} * (b0 - b2); + } + + // See double and float instantiations below + template + T i1e(T _x) { } + + // Double specialization (uses different coefficients than the float version) + template<> + double i1e(double _x) { + const double x = fabs(_x); + if (x <= double{8.}) { + // Chebyshev double coefficients for exp(-x) i1(x) in the interval [0,8]. + // Note: lim(x->0){ exp(-x) i1(x) / x } = 1/2. + static const double coefficients[] = { + 2.77791411276104639959E-18, -2.11142121435816608115E-17, + 1.55363195773620046921E-16, -1.10559694773538630805E-15, + 7.60068429473540693410E-15, -5.04218550472791168711E-14, + 3.22379336594557470981E-13, -1.98397439776494371520E-12, + 1.17361862988909016308E-11, -6.66348972350202774223E-11, + 3.62559028155211703701E-10, -1.88724975172282928790E-9, + 9.38153738649577178388E-9, -4.44505912879632808065E-8, + 2.00329475355213526229E-7, -8.56872026469545474066E-7, + 3.47025130813767847674E-6, -1.32731636560394358279E-5, + 4.78156510755005422638E-5, -1.61760815825896745588E-4, + 5.12285956168575772895E-4, -1.51357245063125314899E-3, + 4.15642294431288815669E-3, -1.05640848946261981558E-2, + 2.47264490306265168283E-2, -5.29459812080949914269E-2, + 1.02643658689847095384E-1, -1.76416518357834055153E-1, + 2.52587186443633654823E-1}; + const double y = x / double{2.} - double{2.}; + const double out = chbevl(y, coefficients, int{29}) * x; + return (_x < 0.) ? -out : out; + } + + // Chebyshev coefficients for exp(-x) sqrt(x) i1(x) + // in the inverted interval (8, infinity]. + // Note: lim(x->inf){ exp(-x) sqrt(x) i1(x) } = 1/sqrt(2pi). + // TODO: what's an "inverted interval"? Open on the left + // and closed on the right? + static const double coefficients[] = { + 7.51729631084210481353E-18, 4.41434832307170791151E-18, + -4.65030536848935832153E-17, -3.20952592199342395980E-17, + 2.96262899764595013876E-16, 3.30820231092092828324E-16, + -1.88035477551078244854E-15, -3.81440307243700780478E-15, + 1.04202769841288027642E-14, 4.27244001671195135429E-14, + -2.10154184277266431302E-14, -4.08355111109219731823E-13, + -7.19855177624590851209E-13, 2.03562854414708950722E-12, + 1.41258074366137813316E-11, 3.25260358301548823856E-11, + -1.89749581235054123450E-11, -5.58974346219658380687E-10, + -3.83538038596423702205E-9, -2.63146884688951950684E-8, + -2.51223623787020892529E-7, -3.88256480887769039346E-6, + -1.10588938762623716291E-4, -9.76109749136146840777E-3, + 7.78576235018280120474E-1}; + + const double out = chbevl(double{32.} / x - double{2.}, coefficients, int{25}) / sqrt(x); + return (_x < double{0.}) ? -out : out; + } + + // Float specialization (uses different coefficients than the double version) + template<> + float i1e(float _x) { + const float x = fabsf(_x); + if (x <= float{8.}) { + // Chebyshev double coefficients for exp(-x) i1(x) in the interval [0,8]. + // Note: lim(x->0){ exp(-x) i1(x) / x } = 1/2. + static const float coefficients[] = { + 9.38153738649577178388E-9f, + -4.44505912879632808065E-8f, + 2.00329475355213526229E-7f, + -8.56872026469545474066E-7f, + 3.47025130813767847674E-6f, + -1.32731636560394358279E-5f, + 4.78156510755005422638E-5f, + -1.61760815825896745588E-4f, + 5.12285956168575772895E-4f, + -1.51357245063125314899E-3f, + 4.15642294431288815669E-3f, + -1.05640848946261981558E-2f, + 2.47264490306265168283E-2f, + -5.29459812080949914269E-2f, + 1.02643658689847095384E-1f, + -1.76416518357834055153E-1f, + 2.52587186443633654823E-1f}; + const float y = x / float{2.} - float{2.}; + const float out = chbevl(y, coefficients, int{17}) * x; + return (_x < 0.) ? -out : out; + } + + // Chebyshev coefficients for exp(-x) sqrt(x) i1(x) + // in the inverted interval (8, infinity]. + // Note: lim(x->inf){ exp(-x) sqrt(x) i1(x) } = 1/sqrt(2pi). + // TODO: what's an "inverted interval"? Open on the left + // and closed on the right? + static const float coefficients[] = { + -3.83538038596423702205E-9f, + -2.63146884688951950684E-8f, + -2.51223623787020892529E-7f, + -3.88256480887769039346E-6f, + -1.10588938762623716291E-4f, + -9.76109749136146840777E-3f, + 7.78576235018280120474E-1f}; + + const float out = chbevl(float{32.} / x - float{2.}, coefficients, int{7}) / sqrt(x); + return (_x < float{0.}) ? -out : out; + } +); // i1e_string + +const auto kaiser_window_string = i0_string + jiterator_stringify( + template + T kaiser_window(T a, T inv_alpha, T beta, T inv_i0_beta) { + T x = a * inv_alpha - T{1}; + T y = max(T{0}, T{1} - x * x); + return i0(beta * sqrt(y)) * inv_i0_beta; + } +); // kaiser_window_string + +const auto sinc_string = jiterator_stringify( + template + T sinc(T a) { + if (a == T(0)) { + return T(1); + } else { + constexpr T pi = T(3.14159265358979323846L); + T product = pi * a; + return std::sin(product) / product; + } + } +); // sinc_string + +const auto erfcx_string = jiterator_stringify( + /* The next function is taken from http://ab-initio.mit.edu/Faddeev */ + + /* Copyright (c) 2012 Massachusetts Institute of Technology + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + + /* erfcx(x) = exp(x^2) erfc(x) function, for real x, written by + Steven G. Johnson, October 2012. + + This function combines a few different ideas. + + First, for x > 50, it uses a continued-fraction expansion (same as + for the Faddeeva function, but with algebraic simplifications for z=i*x). + + Second, for 0 <= x <= 50, it uses Chebyshev polynomial approximations, + but with two twists: + + a) It maps x to y = 4 / (4+x) in [0,1]. This simple transformation, + inspired by a similar transformation in the octave-forge/specfun + erfcx by Soren Hauberg, results in much faster Chebyshev convergence + than other simple transformations I have examined. + + b) Instead of using a single Chebyshev polynomial for the entire + [0,1] y interval, we break the interval up into 100 equal + subintervals, with a switch/lookup table, and use much lower + degree Chebyshev polynomials in each subinterval. This greatly + improves performance in my tests. + + For x < 0, we use the relationship erfcx(-x) = 2 exp(x^2) - erfc(x), + with the usual checks for overflow etcetera. + + Performance-wise, it seems to be substantially faster than either + the SLATEC DERFC function [or an erfcx function derived therefrom] + or Cody's CALERF function (from netlib.org/specfun), while + retaining near machine precision in accuracy. + */ + + /* Given y100 = 100 * y, where y = 4 / (4 + x) for x >= 0, compute erfc(x). + + Uses a look-up table of 100 different Chebyshev polynomials + for y intervals [0,0.01], [0.01,0.02], ...., [0.99,1], generated + with the help of Maple and a little shell script. This allows + the Chebyshev polynomials to be of significantly lower degree (about 1/4) + compared to fitting the whole [0,1] interval with a single polynomial. + */ + + // TODO: review if this is computing in double when given a float input + template + T erfcx_y100(T y100) { + switch (static_cast(y100)) { + case 0: { + T t = 2*y100 - 1; + return 0.70878032454106438663e-3 + (0.71234091047026302958e-3 + (0.35779077297597742384e-5 + (0.17403143962587937815e-7 + (0.81710660047307788845e-10 + (0.36885022360434957634e-12 + 0.15917038551111111111e-14 * t) * t) * t) * t) * t) * t; + } + case 1: { + T t = 2*y100 - 3; + return 0.21479143208285144230e-2 + (0.72686402367379996033e-3 + (0.36843175430938995552e-5 + (0.18071841272149201685e-7 + (0.85496449296040325555e-10 + (0.38852037518534291510e-12 + 0.16868473576888888889e-14 * t) * t) * t) * t) * t) * t; + } + case 2: { + T t = 2*y100 - 5; + return 0.36165255935630175090e-2 + (0.74182092323555510862e-3 + (0.37948319957528242260e-5 + (0.18771627021793087350e-7 + (0.89484715122415089123e-10 + (0.40935858517772440862e-12 + 0.17872061464888888889e-14 * t) * t) * t) * t) * t) * t; + } + case 3: { + T t = 2*y100 - 7; + return 0.51154983860031979264e-2 + (0.75722840734791660540e-3 + (0.39096425726735703941e-5 + (0.19504168704300468210e-7 + (0.93687503063178993915e-10 + (0.43143925959079664747e-12 + 0.18939926435555555556e-14 * t) * t) * t) * t) * t) * t; + } + case 4: { + T t = 2*y100 - 9; + return 0.66457513172673049824e-2 + (0.77310406054447454920e-3 + (0.40289510589399439385e-5 + (0.20271233238288381092e-7 + (0.98117631321709100264e-10 + (0.45484207406017752971e-12 + 0.20076352213333333333e-14 * t) * t) * t) * t) * t) * t; + } + case 5: { + T t = 2*y100 - 11; + return 0.82082389970241207883e-2 + (0.78946629611881710721e-3 + (0.41529701552622656574e-5 + (0.21074693344544655714e-7 + (0.10278874108587317989e-9 + (0.47965201390613339638e-12 + 0.21285907413333333333e-14 * t) * t) * t) * t) * t) * t; + } + case 6: { + T t = 2*y100 - 13; + return 0.98039537275352193165e-2 + (0.80633440108342840956e-3 + (0.42819241329736982942e-5 + (0.21916534346907168612e-7 + (0.10771535136565470914e-9 + (0.50595972623692822410e-12 + 0.22573462684444444444e-14 * t) * t) * t) * t) * t) * t; + } + case 7: { + T t = 2*y100 - 15; + return 0.11433927298290302370e-1 + (0.82372858383196561209e-3 + (0.44160495311765438816e-5 + (0.22798861426211986056e-7 + (0.11291291745879239736e-9 + (0.53386189365816880454e-12 + 0.23944209546666666667e-14 * t) * t) * t) * t) * t) * t; + } + case 8: { + T t = 2*y100 - 17; + return 0.13099232878814653979e-1 + (0.84167002467906968214e-3 + (0.45555958988457506002e-5 + (0.23723907357214175198e-7 + (0.11839789326602695603e-9 + (0.56346163067550237877e-12 + 0.25403679644444444444e-14 * t) * t) * t) * t) * t) * t; + } + case 9: { + T t = 2*y100 - 19; + return 0.14800987015587535621e-1 + (0.86018092946345943214e-3 + (0.47008265848816866105e-5 + (0.24694040760197315333e-7 + (0.12418779768752299093e-9 + (0.59486890370320261949e-12 + 0.26957764568888888889e-14 * t) * t) * t) * t) * t) * t; + } + case 10: { + T t = 2*y100 - 21; + return 0.16540351739394069380e-1 + (0.87928458641241463952e-3 + (0.48520195793001753903e-5 + (0.25711774900881709176e-7 + (0.13030128534230822419e-9 + (0.62820097586874779402e-12 + 0.28612737351111111111e-14 * t) * t) * t) * t) * t) * t; + } + case 11: { + T t = 2*y100 - 23; + return 0.18318536789842392647e-1 + (0.89900542647891721692e-3 + (0.50094684089553365810e-5 + (0.26779777074218070482e-7 + (0.13675822186304615566e-9 + (0.66358287745352705725e-12 + 0.30375273884444444444e-14 * t) * t) * t) * t) * t) * t; + } + case 12: { + T t = 2*y100 - 25; + return 0.20136801964214276775e-1 + (0.91936908737673676012e-3 + (0.51734830914104276820e-5 + (0.27900878609710432673e-7 + (0.14357976402809042257e-9 + (0.70114790311043728387e-12 + 0.32252476000000000000e-14 * t) * t) * t) * t) * t) * t; + } + case 13: { + T t = 2*y100 - 27; + return 0.21996459598282740954e-1 + (0.94040248155366777784e-3 + (0.53443911508041164739e-5 + (0.29078085538049374673e-7 + (0.15078844500329731137e-9 + (0.74103813647499204269e-12 + 0.34251892320000000000e-14 * t) * t) * t) * t) * t) * t; + } + case 14: { + T t = 2*y100 - 29; + return 0.23898877187226319502e-1 + (0.96213386835900177540e-3 + (0.55225386998049012752e-5 + (0.30314589961047687059e-7 + (0.15840826497296335264e-9 + (0.78340500472414454395e-12 + 0.36381553564444444445e-14 * t) * t) * t) * t) * t) * t; + } + case 15: { + T t = 2*y100 - 31; + return 0.25845480155298518485e-1 + (0.98459293067820123389e-3 + (0.57082915920051843672e-5 + (0.31613782169164830118e-7 + (0.16646478745529630813e-9 + (0.82840985928785407942e-12 + 0.38649975768888888890e-14 * t) * t) * t) * t) * t) * t; + } + case 16: { + T t = 2*y100 - 33; + return 0.27837754783474696598e-1 + (0.10078108563256892757e-2 + (0.59020366493792212221e-5 + (0.32979263553246520417e-7 + (0.17498524159268458073e-9 + (0.87622459124842525110e-12 + 0.41066206488888888890e-14 * t) * t) * t) * t) * t) * t; + } + case 17: { + T t = 2*y100 - 35; + return 0.29877251304899307550e-1 + (0.10318204245057349310e-2 + (0.61041829697162055093e-5 + (0.34414860359542720579e-7 + (0.18399863072934089607e-9 + (0.92703227366365046533e-12 + 0.43639844053333333334e-14 * t) * t) * t) * t) * t) * t; + } + case 18: { + T t = 2*y100 - 37; + return 0.31965587178596443475e-1 + (0.10566560976716574401e-2 + (0.63151633192414586770e-5 + (0.35924638339521924242e-7 + (0.19353584758781174038e-9 + (0.98102783859889264382e-12 + 0.46381060817777777779e-14 * t) * t) * t) * t) * t) * t; + } + case 19: { + T t = 2*y100 - 39; + return 0.34104450552588334840e-1 + (0.10823541191350532574e-2 + (0.65354356159553934436e-5 + (0.37512918348533521149e-7 + (0.20362979635817883229e-9 + (0.10384187833037282363e-11 + 0.49300625262222222221e-14 * t) * t) * t) * t) * t) * t; + } + case 20: { + T t = 2*y100 - 41; + return 0.36295603928292425716e-1 + (0.11089526167995268200e-2 + (0.67654845095518363577e-5 + (0.39184292949913591646e-7 + (0.21431552202133775150e-9 + (0.10994259106646731797e-11 + 0.52409949102222222221e-14 * t) * t) * t) * t) * t) * t; + } + case 21: { + T t = 2*y100 - 43; + return 0.38540888038840509795e-1 + (0.11364917134175420009e-2 + (0.70058230641246312003e-5 + (0.40943644083718586939e-7 + (0.22563034723692881631e-9 + (0.11642841011361992885e-11 + 0.55721092871111111110e-14 * t) * t) * t) * t) * t) * t; + } + case 22: { + T t = 2*y100 - 45; + return 0.40842225954785960651e-1 + (0.11650136437945673891e-2 + (0.72569945502343006619e-5 + (0.42796161861855042273e-7 + (0.23761401711005024162e-9 + (0.12332431172381557035e-11 + 0.59246802364444444445e-14 * t) * t) * t) * t) * t) * t; + } + case 23: { + T t = 2*y100 - 47; + return 0.43201627431540222422e-1 + (0.11945628793917272199e-2 + (0.75195743532849206263e-5 + (0.44747364553960993492e-7 + (0.25030885216472953674e-9 + (0.13065684400300476484e-11 + 0.63000532853333333334e-14 * t) * t) * t) * t) * t) * t; + } + case 24: { + T t = 2*y100 - 49; + return 0.45621193513810471438e-1 + (0.12251862608067529503e-2 + (0.77941720055551920319e-5 + (0.46803119830954460212e-7 + (0.26375990983978426273e-9 + (0.13845421370977119765e-11 + 0.66996477404444444445e-14 * t) * t) * t) * t) * t) * t; + } + case 25: { + T t = 2*y100 - 51; + return 0.48103121413299865517e-1 + (0.12569331386432195113e-2 + (0.80814333496367673980e-5 + (0.48969667335682018324e-7 + (0.27801515481905748484e-9 + (0.14674637611609884208e-11 + 0.71249589351111111110e-14 * t) * t) * t) * t) * t) * t; + } + case 26: { + T t = 2*y100 - 53; + return 0.50649709676983338501e-1 + (0.12898555233099055810e-2 + (0.83820428414568799654e-5 + (0.51253642652551838659e-7 + (0.29312563849675507232e-9 + (0.15556512782814827846e-11 + 0.75775607822222222221e-14 * t) * t) * t) * t) * t) * t; + } + case 27: { + T t = 2*y100 - 55; + return 0.53263363664388864181e-1 + (0.13240082443256975769e-2 + (0.86967260015007658418e-5 + (0.53662102750396795566e-7 + (0.30914568786634796807e-9 + (0.16494420240828493176e-11 + 0.80591079644444444445e-14 * t) * t) * t) * t) * t) * t; + } + case 28: { + T t = 2*y100 - 57; + return 0.55946601353500013794e-1 + (0.13594491197408190706e-2 + (0.90262520233016380987e-5 + (0.56202552975056695376e-7 + (0.32613310410503135996e-9 + (0.17491936862246367398e-11 + 0.85713381688888888890e-14 * t) * t) * t) * t) * t) * t; + } + case 29: { + T t = 2*y100 - 59; + return 0.58702059496154081813e-1 + (0.13962391363223647892e-2 + (0.93714365487312784270e-5 + (0.58882975670265286526e-7 + (0.34414937110591753387e-9 + (0.18552853109751857859e-11 + 0.91160736711111111110e-14 * t) * t) * t) * t) * t) * t; + } + case 30: { + T t = 2*y100 - 61; + return 0.61532500145144778048e-1 + (0.14344426411912015247e-2 + (0.97331446201016809696e-5 + (0.61711860507347175097e-7 + (0.36325987418295300221e-9 + (0.19681183310134518232e-11 + 0.96952238400000000000e-14 * t) * t) * t) * t) * t) * t; + } + case 31: { + T t = 2*y100 - 63; + return 0.64440817576653297993e-1 + (0.14741275456383131151e-2 + (0.10112293819576437838e-4 + (0.64698236605933246196e-7 + (0.38353412915303665586e-9 + (0.20881176114385120186e-11 + 0.10310784480000000000e-13 * t) * t) * t) * t) * t) * t; + } + case 32: { + T t = 2*y100 - 65; + return 0.67430045633130393282e-1 + (0.15153655418916540370e-2 + (0.10509857606888328667e-4 + (0.67851706529363332855e-7 + (0.40504602194811140006e-9 + (0.22157325110542534469e-11 + 0.10964842115555555556e-13 * t) * t) * t) * t) * t) * t; + } + case 33: { + T t = 2*y100 - 67; + return 0.70503365513338850709e-1 + (0.15582323336495709827e-2 + (0.10926868866865231089e-4 + (0.71182482239613507542e-7 + (0.42787405890153386710e-9 + (0.23514379522274416437e-11 + 0.11659571751111111111e-13 * t) * t) * t) * t) * t) * t; + } + case 34: { + T t = 2*y100 - 69; + return 0.73664114037944596353e-1 + (0.16028078812438820413e-2 + (0.11364423678778207991e-4 + (0.74701423097423182009e-7 + (0.45210162777476488324e-9 + (0.24957355004088569134e-11 + 0.12397238257777777778e-13 * t) * t) * t) * t) * t) * t; + } + case 35: { + T t = 2*y100 - 71; + return 0.76915792420819562379e-1 + (0.16491766623447889354e-2 + (0.11823685320041302169e-4 + (0.78420075993781544386e-7 + (0.47781726956916478925e-9 + (0.26491544403815724749e-11 + 0.13180196462222222222e-13 * t) * t) * t) * t) * t) * t; + } + case 36: { + T t = 2*y100 - 73; + return 0.80262075578094612819e-1 + (0.16974279491709504117e-2 + (0.12305888517309891674e-4 + (0.82350717698979042290e-7 + (0.50511496109857113929e-9 + (0.28122528497626897696e-11 + 0.14010889635555555556e-13 * t) * t) * t) * t) * t) * t; + } + case 37: { + T t = 2*y100 - 75; + return 0.83706822008980357446e-1 + (0.17476561032212656962e-2 + (0.12812343958540763368e-4 + (0.86506399515036435592e-7 + (0.53409440823869467453e-9 + (0.29856186620887555043e-11 + 0.14891851591111111111e-13 * t) * t) * t) * t) * t) * t; + } + case 38: { + T t = 2*y100 - 77; + return 0.87254084284461718231e-1 + (0.17999608886001962327e-2 + (0.13344443080089492218e-4 + (0.90900994316429008631e-7 + (0.56486134972616465316e-9 + (0.31698707080033956934e-11 + 0.15825697795555555556e-13 * t) * t) * t) * t) * t) * t; + } + case 39: { + T t = 2*y100 - 79; + return 0.90908120182172748487e-1 + (0.18544478050657699758e-2 + (0.13903663143426120077e-4 + (0.95549246062549906177e-7 + (0.59752787125242054315e-9 + (0.33656597366099099413e-11 + 0.16815130613333333333e-13 * t) * t) * t) * t) * t) * t; + } + case 40: { + T t = 2*y100 - 81; + return 0.94673404508075481121e-1 + (0.19112284419887303347e-2 + (0.14491572616545004930e-4 + (0.10046682186333613697e-6 + (0.63221272959791000515e-9 + (0.35736693975589130818e-11 + 0.17862931591111111111e-13 * t) * t) * t) * t) * t) * t; + } + case 41: { + T t = 2*y100 - 83; + return 0.98554641648004456555e-1 + (0.19704208544725622126e-2 + (0.15109836875625443935e-4 + (0.10567036667675984067e-6 + (0.66904168640019354565e-9 + (0.37946171850824333014e-11 + 0.18971959040000000000e-13 * t) * t) * t) * t) * t) * t; + } + case 42: { + T t = 2*y100 - 85; + return 0.10255677889470089531e0 + (0.20321499629472857418e-2 + (0.15760224242962179564e-4 + (0.11117756071353507391e-6 + (0.70814785110097658502e-9 + (0.40292553276632563925e-11 + 0.20145143075555555556e-13 * t) * t) * t) * t) * t) * t; + } + case 43: { + T t = 2*y100 - 87; + return 0.10668502059865093318e0 + (0.20965479776148731610e-2 + (0.16444612377624983565e-4 + (0.11700717962026152749e-6 + (0.74967203250938418991e-9 + (0.42783716186085922176e-11 + 0.21385479360000000000e-13 * t) * t) * t) * t) * t) * t; + } + case 44: { + T t = 2*y100 - 89; + return 0.11094484319386444474e0 + (0.21637548491908170841e-2 + (0.17164995035719657111e-4 + (0.12317915750735938089e-6 + (0.79376309831499633734e-9 + (0.45427901763106353914e-11 + 0.22696025653333333333e-13 * t) * t) * t) * t) * t) * t; + } + case 45: { + T t = 2*y100 - 91; + return 0.11534201115268804714e0 + (0.22339187474546420375e-2 + (0.17923489217504226813e-4 + (0.12971465288245997681e-6 + (0.84057834180389073587e-9 + (0.48233721206418027227e-11 + 0.24079890062222222222e-13 * t) * t) * t) * t) * t) * t; + } + case 46: { + T t = 2*y100 - 93; + return 0.11988259392684094740e0 + (0.23071965691918689601e-2 + (0.18722342718958935446e-4 + (0.13663611754337957520e-6 + (0.89028385488493287005e-9 + (0.51210161569225846701e-11 + 0.25540227111111111111e-13 * t) * t) * t) * t) * t) * t; + } + case 47: { + T t = 2*y100 - 95; + return 0.12457298393509812907e0 + (0.23837544771809575380e-2 + (0.19563942105711612475e-4 + (0.14396736847739470782e-6 + (0.94305490646459247016e-9 + (0.54366590583134218096e-11 + 0.27080225920000000000e-13 * t) * t) * t) * t) * t) * t; + } + case 48: { + T t = 2*y100 - 97; + return 0.12941991566142438816e0 + (0.24637684719508859484e-2 + (0.20450821127475879816e-4 + (0.15173366280523906622e-6 + (0.99907632506389027739e-9 + (0.57712760311351625221e-11 + 0.28703099555555555556e-13 * t) * t) * t) * t) * t) * t; + } + case 49: { + T t = 2*y100 - 99; + return 0.13443048593088696613e0 + (0.25474249981080823877e-2 + (0.21385669591362915223e-4 + (0.15996177579900443030e-6 + (0.10585428844575134013e-8 + (0.61258809536787882989e-11 + 0.30412080142222222222e-13 * t) * t) * t) * t) * t) * t; + } + case 50: { + T t = 2*y100 - 101; + return 0.13961217543434561353e0 + (0.26349215871051761416e-2 + (0.22371342712572567744e-4 + (0.16868008199296822247e-6 + (0.11216596910444996246e-8 + (0.65015264753090890662e-11 + 0.32210394506666666666e-13 * t) * t) * t) * t) * t) * t; + } + case 51: { + T t = 2*y100 - 103; + return 0.14497287157673800690e0 + (0.27264675383982439814e-2 + (0.23410870961050950197e-4 + (0.17791863939526376477e-6 + (0.11886425714330958106e-8 + (0.68993039665054288034e-11 + 0.34101266222222222221e-13 * t) * t) * t) * t) * t) * t; + } + case 52: { + T t = 2*y100 - 105; + return 0.15052089272774618151e0 + (0.28222846410136238008e-2 + (0.24507470422713397006e-4 + (0.18770927679626136909e-6 + (0.12597184587583370712e-8 + (0.73203433049229821618e-11 + 0.36087889048888888890e-13 * t) * t) * t) * t) * t) * t; + } + case 53: { + T t = 2*y100 - 107; + return 0.15626501395774612325e0 + (0.29226079376196624949e-2 + (0.25664553693768450545e-4 + (0.19808568415654461964e-6 + (0.13351257759815557897e-8 + (0.77658124891046760667e-11 + 0.38173420035555555555e-13 * t) * t) * t) * t) * t) * t; + } + case 54: { + T t = 2*y100 - 109; + return 0.16221449434620737567e0 + (0.30276865332726475672e-2 + (0.26885741326534564336e-4 + (0.20908350604346384143e-6 + (0.14151148144240728728e-8 + (0.82369170665974313027e-11 + 0.40360957457777777779e-13 * t) * t) * t) * t) * t) * t; + } + case 55: { + T t = 2*y100 - 111; + return 0.16837910595412130659e0 + (0.31377844510793082301e-2 + (0.28174873844911175026e-4 + (0.22074043807045782387e-6 + (0.14999481055996090039e-8 + (0.87348993661930809254e-11 + 0.42653528977777777779e-13 * t) * t) * t) * t) * t) * t; + } + case 56: { + T t = 2*y100 - 113; + return 0.17476916455659369953e0 + (0.32531815370903068316e-2 + (0.29536024347344364074e-4 + (0.23309632627767074202e-6 + (0.15899007843582444846e-8 + (0.92610375235427359475e-11 + 0.45054073102222222221e-13 * t) * t) * t) * t) * t) * t; + } + case 57: { + T t = 2*y100 - 115; + return 0.18139556223643701364e0 + (0.33741744168096996041e-2 + (0.30973511714709500836e-4 + (0.24619326937592290996e-6 + (0.16852609412267750744e-8 + (0.98166442942854895573e-11 + 0.47565418097777777779e-13 * t) * t) * t) * t) * t) * t; + } + case 58: { + T t = 2*y100 - 117; + return 0.18826980194443664549e0 + (0.35010775057740317997e-2 + (0.32491914440014267480e-4 + (0.26007572375886319028e-6 + (0.17863299617388376116e-8 + (0.10403065638343878679e-10 + 0.50190265831111111110e-13 * t) * t) * t) * t) * t) * t; + } + case 59: { + T t = 2*y100 - 119; + return 0.19540403413693967350e0 + (0.36342240767211326315e-2 + (0.34096085096200907289e-4 + (0.27479061117017637474e-6 + (0.18934228504790032826e-8 + (0.11021679075323598664e-10 + 0.52931171733333333334e-13 * t) * t) * t) * t) * t) * t; + } + case 60: { + T t = 2*y100 - 121; + return 0.20281109560651886959e0 + (0.37739673859323597060e-2 + (0.35791165457592409054e-4 + (0.29038742889416172404e-6 + (0.20068685374849001770e-8 + (0.11673891799578381999e-10 + 0.55790523093333333334e-13 * t) * t) * t) * t) * t) * t; + } + case 61: { + T t = 2*y100 - 123; + return 0.21050455062669334978e0 + (0.39206818613925652425e-2 + (0.37582602289680101704e-4 + (0.30691836231886877385e-6 + (0.21270101645763677824e-8 + (0.12361138551062899455e-10 + 0.58770520160000000000e-13 * t) * t) * t) * t) * t) * t; + } + case 62: { + T t = 2*y100 - 125; + return 0.21849873453703332479e0 + (0.40747643554689586041e-2 + (0.39476163820986711501e-4 + (0.32443839970139918836e-6 + (0.22542053491518680200e-8 + (0.13084879235290858490e-10 + 0.61873153262222222221e-13 * t) * t) * t) * t) * t) * t; + } + case 63: { + T t = 2*y100 - 127; + return 0.22680879990043229327e0 + (0.42366354648628516935e-2 + (0.41477956909656896779e-4 + (0.34300544894502810002e-6 + (0.23888264229264067658e-8 + (0.13846596292818514601e-10 + 0.65100183751111111110e-13 * t) * t) * t) * t) * t) * t; + } + case 64: { + T t = 2*y100 - 129; + return 0.23545076536988703937e0 + (0.44067409206365170888e-2 + (0.43594444916224700881e-4 + (0.36268045617760415178e-6 + (0.25312606430853202748e-8 + (0.14647791812837903061e-10 + 0.68453122631111111110e-13 * t) * t) * t) * t) * t) * t; + } + case 65: { + T t = 2*y100 - 131; + return 0.24444156740777432838e0 + (0.45855530511605787178e-2 + (0.45832466292683085475e-4 + (0.38352752590033030472e-6 + (0.26819103733055603460e-8 + (0.15489984390884756993e-10 + 0.71933206364444444445e-13 * t) * t) * t) * t) * t) * t; + } + case 66: { + T t = 2*y100 - 133; + return 0.25379911500634264643e0 + (0.47735723208650032167e-2 + (0.48199253896534185372e-4 + (0.40561404245564732314e-6 + (0.28411932320871165585e-8 + (0.16374705736458320149e-10 + 0.75541379822222222221e-13 * t) * t) * t) * t) * t) * t; + } + case 67: { + T t = 2*y100 - 135; + return 0.26354234756393613032e0 + (0.49713289477083781266e-2 + (0.50702455036930367504e-4 + (0.42901079254268185722e-6 + (0.30095422058900481753e-8 + (0.17303497025347342498e-10 + 0.79278273368888888890e-13 * t) * t) * t) * t) * t) * t; + } + case 68: { + T t = 2*y100 - 137; + return 0.27369129607732343398e0 + (0.51793846023052643767e-2 + (0.53350152258326602629e-4 + (0.45379208848865015485e-6 + (0.31874057245814381257e-8 + (0.18277905010245111046e-10 + 0.83144182364444444445e-13 * t) * t) * t) * t) * t) * t; + } + case 69: { + T t = 2*y100 - 139; + return 0.28426714781640316172e0 + (0.53983341916695141966e-2 + (0.56150884865255810638e-4 + (0.48003589196494734238e-6 + (0.33752476967570796349e-8 + (0.19299477888083469086e-10 + 0.87139049137777777779e-13 * t) * t) * t) * t) * t) * t; + } + case 70: { + T t = 2*y100 - 141; + return 0.29529231465348519920e0 + (0.56288077305420795663e-2 + (0.59113671189913307427e-4 + (0.50782393781744840482e-6 + (0.35735475025851713168e-8 + (0.20369760937017070382e-10 + 0.91262442613333333334e-13 * t) * t) * t) * t) * t) * t; + } + case 71: { + T t = 2*y100 - 143; + return 0.30679050522528838613e0 + (0.58714723032745403331e-2 + (0.62248031602197686791e-4 + (0.53724185766200945789e-6 + (0.37827999418960232678e-8 + (0.21490291930444538307e-10 + 0.95513539182222222221e-13 * t) * t) * t) * t) * t) * t; + } + case 72: { + T t = 2*y100 - 145; + return 0.31878680111173319425e0 + (0.61270341192339103514e-2 + (0.65564012259707640976e-4 + (0.56837930287837738996e-6 + (0.40035151353392378882e-8 + (0.22662596341239294792e-10 + 0.99891109760000000000e-13 * t) * t) * t) * t) * t) * t; + } + case 73: { + T t = 2*y100 - 147; + return 0.33130773722152622027e0 + (0.63962406646798080903e-2 + (0.69072209592942396666e-4 + (0.60133006661885941812e-6 + (0.42362183765883466691e-8 + (0.23888182347073698382e-10 + 0.10439349811555555556e-12 * t) * t) * t) * t) * t) * t; + } + case 74: { + T t = 2*y100 - 149; + return 0.34438138658041336523e0 + (0.66798829540414007258e-2 + (0.72783795518603561144e-4 + (0.63619220443228800680e-6 + (0.44814499336514453364e-8 + (0.25168535651285475274e-10 + 0.10901861383111111111e-12 * t) * t) * t) * t) * t) * t; + } + case 75: { + T t = 2*y100 - 151; + return 0.35803744972380175583e0 + (0.69787978834882685031e-2 + (0.76710543371454822497e-4 + (0.67306815308917386747e-6 + (0.47397647975845228205e-8 + (0.26505114141143050509e-10 + 0.11376390933333333333e-12 * t) * t) * t) * t) * t) * t; + } + case 76: { + T t = 2*y100 - 153; + return 0.37230734890119724188e0 + (0.72938706896461381003e-2 + (0.80864854542670714092e-4 + (0.71206484718062688779e-6 + (0.50117323769745883805e-8 + (0.27899342394100074165e-10 + 0.11862637614222222222e-12 * t) * t) * t) * t) * t) * t; + } + case 77: { + T t = 2*y100 - 155; + return 0.38722432730555448223e0 + (0.76260375162549802745e-2 + (0.85259785810004603848e-4 + (0.75329383305171327677e-6 + (0.52979361368388119355e-8 + (0.29352606054164086709e-10 + 0.12360253370666666667e-12 * t) * t) * t) * t) * t) * t; + } + case 78: { + T t = 2*y100 - 157; + return 0.40282355354616940667e0 + (0.79762880915029728079e-2 + (0.89909077342438246452e-4 + (0.79687137961956194579e-6 + (0.55989731807360403195e-8 + (0.30866246101464869050e-10 + 0.12868841946666666667e-12 * t) * t) * t) * t) * t) * t; + } + case 79: { + T t = 2*y100 - 159; + return 0.41914223158913787649e0 + (0.83456685186950463538e-2 + (0.94827181359250161335e-4 + (0.84291858561783141014e-6 + (0.59154537751083485684e-8 + (0.32441553034347469291e-10 + 0.13387957943111111111e-12 * t) * t) * t) * t) * t) * t; + } + case 80: { + T t = 2*y100 - 161; + return 0.43621971639463786896e0 + (0.87352841828289495773e-2 + (0.10002929142066799966e-3 + (0.89156148280219880024e-6 + (0.62480008150788597147e-8 + (0.34079760983458878910e-10 + 0.13917107176888888889e-12 * t) * t) * t) * t) * t) * t; + } + case 81: { + T t = 2*y100 - 163; + return 0.45409763548534330981e0 + (0.91463027755548240654e-2 + (0.10553137232446167258e-3 + (0.94293113464638623798e-6 + (0.65972492312219959885e-8 + (0.35782041795476563662e-10 + 0.14455745872000000000e-12 * t) * t) * t) * t) * t) * t; + } + case 82: { + T t = 2*y100 - 165; + return 0.47282001668512331468e0 + (0.95799574408860463394e-2 + (0.11135019058000067469e-3 + (0.99716373005509038080e-6 + (0.69638453369956970347e-8 + (0.37549499088161345850e-10 + 0.15003280712888888889e-12 * t) * t) * t) * t) * t) * t; + } + case 83: { + T t = 2*y100 - 167; + return 0.49243342227179841649e0 + (0.10037550043909497071e-1 + (0.11750334542845234952e-3 + (0.10544006716188967172e-5 + (0.73484461168242224872e-8 + (0.39383162326435752965e-10 + 0.15559069118222222222e-12 * t) * t) * t) * t) * t) * t; + } + case 84: { + T t = 2*y100 - 169; + return 0.51298708979209258326e0 + (0.10520454564612427224e-1 + (0.12400930037494996655e-3 + (0.11147886579371265246e-5 + (0.77517184550568711454e-8 + (0.41283980931872622611e-10 + 0.16122419680000000000e-12 * t) * t) * t) * t) * t) * t; + } + case 85: { + T t = 2*y100 - 171; + return 0.53453307979101369843e0 + (0.11030120618800726938e-1 + (0.13088741519572269581e-3 + (0.11784797595374515432e-5 + (0.81743383063044825400e-8 + (0.43252818449517081051e-10 + 0.16692592640000000000e-12 * t) * t) * t) * t) * t) * t; + } + case 86: { + T t = 2*y100 - 173; + return 0.55712643071169299478e0 + (0.11568077107929735233e-1 + (0.13815797838036651289e-3 + (0.12456314879260904558e-5 + (0.86169898078969313597e-8 + (0.45290446811539652525e-10 + 0.17268801084444444444e-12 * t) * t) * t) * t) * t) * t; + } + case 87: { + T t = 2*y100 - 175; + return 0.58082532122519320968e0 + (0.12135935999503877077e-1 + (0.14584223996665838559e-3 + (0.13164068573095710742e-5 + (0.90803643355106020163e-8 + (0.47397540713124619155e-10 + 0.17850211608888888889e-12 * t) * t) * t) * t) * t) * t; + } + case 88: { + T t = 2*y100 - 177; + return 0.60569124025293375554e0 + (0.12735396239525550361e-1 + (0.15396244472258863344e-3 + (0.13909744385382818253e-5 + (0.95651595032306228245e-8 + (0.49574672127669041550e-10 + 0.18435945564444444444e-12 * t) * t) * t) * t) * t) * t; + } + case 89: { + T t = 2*y100 - 179; + return 0.63178916494715716894e0 + (0.13368247798287030927e-1 + (0.16254186562762076141e-3 + (0.14695084048334056083e-5 + (0.10072078109604152350e-7 + (0.51822304995680707483e-10 + 0.19025081422222222222e-12 * t) * t) * t) * t) * t) * t; + } + case 90: { + T t = 2*y100 - 181; + return 0.65918774689725319200e0 + (0.14036375850601992063e-1 + (0.17160483760259706354e-3 + (0.15521885688723188371e-5 + (0.10601827031535280590e-7 + (0.54140790105837520499e-10 + 0.19616655146666666667e-12 * t) * t) * t) * t) * t) * t; + } + case 91: { + T t = 2*y100 - 183; + return 0.68795950683174433822e0 + (0.14741765091365869084e-1 + (0.18117679143520433835e-3 + (0.16392004108230585213e-5 + (0.11155116068018043001e-7 + (0.56530360194925690374e-10 + 0.20209663662222222222e-12 * t) * t) * t) * t) * t) * t; + } + case 92: { + T t = 2*y100 - 185; + return 0.71818103808729967036e0 + (0.15486504187117112279e-1 + (0.19128428784550923217e-3 + (0.17307350969359975848e-5 + (0.11732656736113607751e-7 + (0.58991125287563833603e-10 + 0.20803065333333333333e-12 * t) * t) * t) * t) * t) * t; + } + case 93: { + T t = 2*y100 - 187; + return 0.74993321911726254661e0 + (0.16272790364044783382e-1 + (0.20195505163377912645e-3 + (0.18269894883203346953e-5 + (0.12335161021630225535e-7 + (0.61523068312169087227e-10 + 0.21395783431111111111e-12 * t) * t) * t) * t) * t) * t; + } + case 94: { + T t = 2*y100 - 189; + return 0.78330143531283492729e0 + (0.17102934132652429240e-1 + (0.21321800585063327041e-3 + (0.19281661395543913713e-5 + (0.12963340087354341574e-7 + (0.64126040998066348872e-10 + 0.21986708942222222222e-12 * t) * t) * t) * t) * t) * t; + } + case 95: { + T t = 2*y100 - 191; + return 0.81837581041023811832e0 + (0.17979364149044223802e-1 + (0.22510330592753129006e-3 + (0.20344732868018175389e-5 + (0.13617902941839949718e-7 + (0.66799760083972474642e-10 + 0.22574701262222222222e-12 * t) * t) * t) * t) * t) * t; + } + case 96: { + T t = 2*y100 - 193; + return 0.85525144775685126237e0 + (0.18904632212547561026e-1 + (0.23764237370371255638e-3 + (0.21461248251306387979e-5 + (0.14299555071870523786e-7 + (0.69543803864694171934e-10 + 0.23158593688888888889e-12 * t) * t) * t) * t) * t) * t; + } + case 97: { + T t = 2*y100 - 195; + return 0.89402868170849933734e0 + (0.19881418399127202569e-1 + (0.25086793128395995798e-3 + (0.22633402747585233180e-5 + (0.15008997042116532283e-7 + (0.72357609075043941261e-10 + 0.23737194737777777778e-12 * t) * t) * t) * t) * t) * t; + } + case 98: { + T t = 2*y100 - 197; + return 0.93481333942870796363e0 + (0.20912536329780368893e-1 + (0.26481403465998477969e-3 + (0.23863447359754921676e-5 + (0.15746923065472184451e-7 + (0.75240468141720143653e-10 + 0.24309291271111111111e-12 * t) * t) * t) * t) * t) * t; + } + case 99: { + T t = 2*y100 - 199; + return 0.97771701335885035464e0 + (0.22000938572830479551e-1 + (0.27951610702682383001e-3 + (0.25153688325245314530e-5 + (0.16514019547822821453e-7 + (0.78191526829368231251e-10 + 0.24873652355555555556e-12 * t) * t) * t) * t) * t) * t; + } + } + + // we only get here if y = 1, i.e. |x| < 4*eps, in which case + // erfcx is within 1e-15 of 1.. + return 1.; + } + + template + T erfcx(T x) { + // Short-circuits on NaN (returning NaN) + if (x != x) { + return x; + } + + if (x >= 0) { + if (x > T{50}) { // continued-fraction expansion is faster + const T ispi = 0.56418958354775628694807945156; // 1 / sqrt(pi) + + if (x > T{5e7}) { // 1-term expansion, important to avoid overflow + return ispi / x; + } + + /* 5-term expansion (rely on compiler for CSE), simplified from: + ispi / (x+0.5/(x+1/(x+1.5/(x+2/x)))) */ + return ispi * ((x*x) * (x*x+T{4.5}) + T{2}) / (x * ((x*x) * (x*x+T{5}) + T{3.75})); + } + + // x >= 0 x <= 50 + return erfcx_y100(T{400} / (T{4} + x)); + } + + // x < 0 + if (x < T{-26.7}) { + return POS_INFINITY; + } else if (x < T{-6.1}) { + return T{2} * exp(x * x); + } + + // x < 0 and x >= -6.1 + return T{2} * exp(x * x) - erfcx_y100(T{400} / (T{4} - x)); + } +); // erfcx_string + +const auto airy_ai_string = jiterator_stringify( + template + T airy_ai_forward(T x) { + static const T AN[] = { + +3.46538101525629032477e-01, + +1.20075952739645805542e+01, + +7.62796053615234516538e+01, + +1.68089224934630576269e+02, + +1.59756391350164413639e+02, + +7.05360906840444183113e+01, + +1.40264691163389668864e+01, + +9.99999999999999995305e-01, + }; + + static const T AD[] = { + +5.67594532638770212846e-01, + +1.47562562584847203173e+01, + +8.45138970141474626562e+01, + +1.77318088145400459522e+02, + +1.64234692871529701831e+02, + +7.14778400825575695274e+01, + +1.40959135607834029598e+01, + +1.00000000000000000470e+00, + }; + + static const T AFN[] = { + -1.31696323418331795333e-01, + -6.26456544431912369773e-01, + -6.93158036036933542233e-01, + -2.79779981545119124951e-01, + -4.91900132609500318020e-02, + -4.06265923594885404393e-03, + -1.59276496239262096340e-04, + -2.77649108155232920844e-06, + -1.67787698489114633780e-08, + }; + + static const T AFD[] = { + +1.33560420706553243746e+01, + +3.26825032795224613948e+01, + +2.67367040941499554804e+01, + +9.18707402907259625840e+00, + +1.47529146771666414581e+00, + +1.15687173795188044134e-01, + +4.40291641615211203805e-03, + +7.54720348287414296618e-05, + +4.51850092970580378464e-07, + }; + + static const T AGN[] = { + +1.97339932091685679179e-02, + +3.91103029615688277255e-01, + +1.06579897599595591108e+00, + +9.39169229816650230044e-01, + +3.51465656105547619242e-01, + +6.33888919628925490927e-02, + +5.85804113048388458567e-03, + +2.82851600836737019778e-04, + +6.98793669997260967291e-06, + +8.11789239554389293311e-08, + +3.41551784765923618484e-10, + }; + + static const T AGD[] = { + +9.30892908077441974853e+00, + +1.98352928718312140417e+01, + +1.55646628932864612953e+01, + +5.47686069422975497931e+00, + +9.54293611618961883998e-01, + +8.64580826352392193095e-02, + +4.12656523824222607191e-03, + +1.01259085116509135510e-04, + +1.17166733214413521882e-06, + +4.91834570062930015649e-09, + }; + + int domain_flag = 0; + + T ai; + + if (isinf(x)) { + return NAN; + } + + if (x > T(103.892)) { + return T(0.0); + } + + T f; + T g; + T k; + + if (x < T(-2.09)) { + T z = T(1.0) / (T(-2.0) * x * sqrt(-x) / T(3.0)); + + T afn = 0.0; + + for (uint8_t index = 0; index <= 8; index++) { + afn = afn * (z * z) + AFN[index]; + } + + T afd = 0.0; + + for (uint8_t index = 0; index <= 8; index++) { + afd = afd * (z * z) + AFD[index]; + } + + T agn = 0.0; + + for (uint8_t index = 0; index <= 10 + 0; index++) { + agn = agn * (z * z) + AGN[index]; + } + + T agd = 0.0; + + for (uint8_t index = 0; index <= 10 - 1; index++) { + agd = agd * (z * z) + AGD[index]; + } + + T t = T(-2.0) * x * sqrt(-x) / T(3.0) + T(0.25) * T(3.14159265358979323846); + + return T(5.64189583547756286948e-01) / sqrt(sqrt(-x)) * (sin(t) * (T(1.0) + z * z * afn / afd) - cos(t) * (z * agn / agd)); + } + + if (x >= T(2.09)) { + domain_flag = 5; + + T zeta = T(2.0) * x * sqrt(x) / T(3.0); + + T an = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + an = an * (T(1.0) / zeta) + AN[index]; + } + + T ad = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + ad = ad * (T(1.0) / zeta) + AD[index]; + } + + ai = T(5.64189583547756286948e-01) * (an / ad) / (T(2.0) * sqrt(sqrt(x)) * exp(zeta)); + + if (x > T(8.3203353)) { + return ai; + } + } + + f = 1.0; + g = x; + k = 1.0; + + T m = 1.0; + T n = x; + T t = 1.0; + T z = x * x * x; + + while (t > T(1.11022302462515654042e-16)) { + m *= z; + k += T(1.0); + m /= k; + n *= z; + k += T(1.0); + n /= k; + m /= k; + f += m; + k += T(1.0); + n /= k; + g += n; + + t = abs(m / f); + } + + if ((domain_flag & 1) == 0) { + return T(0.355028053887817239260) * f - T(0.258819403792806798405) * g; + } + + return ai; + } // T airy_ai(T x) +); // airy_ai_string + +const auto bessel_j0_string = jiterator_stringify( + template + T bessel_j0_forward(T x) { + static const T PP[] = { + +7.96936729297347051624e-04, + +8.28352392107440799803e-02, + +1.23953371646414299388e+00, + +5.44725003058768775090e+00, + +8.74716500199817011941e+00, + +5.30324038235394892183e+00, + +9.99999999999999997821e-01, + }; + + static const T PQ[] = { + +9.24408810558863637013e-04, + +8.56288474354474431428e-02, + +1.25352743901058953537e+00, + +5.47097740330417105182e+00, + +8.76190883237069594232e+00, + +5.30605288235394617618e+00, + +1.00000000000000000218e+00, + }; + + static const T QP[] = { + -1.13663838898469149931e-02, + -1.28252718670509318512e+00, + -1.95539544257735972385e+01, + -9.32060152123768231369e+01, + -1.77681167980488050595e+02, + -1.47077505154951170175e+02, + -5.14105326766599330220e+01, + -6.05014350600728481186e+00, + }; + + static const T QQ[] = { + +6.43178256118178023184e+01, + +8.56430025976980587198e+02, + +3.88240183605401609683e+03, + +7.24046774195652478189e+03, + +5.93072701187316984827e+03, + +2.06209331660327847417e+03, + +2.42005740240291393179e+02, + }; + + static const T RP[] = { + -4.79443220978201773821e+09, + +1.95617491946556577543e+12, + -2.49248344360967716204e+14, + +9.70862251047306323952e+15, + }; + + static const T RQ[] = { + +4.99563147152651017219e+02, + +1.73785401676374683123e+05, + +4.84409658339962045305e+07, + +1.11855537045356834862e+10, + +2.11277520115489217587e+12, + +3.10518229857422583814e+14, + +3.18121955943204943306e+16, + +1.71086294081043136091e+18, + }; + + if (x < T(0)) { + x = -x; + } + + if (x <= T(5.0)) { + if (x < T(0.00001)) { + return T(1.0) - x * x / T(4.0); + } + + T rp = 0.0; + + for (uint8_t index = 0; index <= 3; index++) { + rp = rp * (x * x) + RP[index]; + } + + T rq = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + rq = rq * (x * x) + RQ[index]; + } + + return (x * x - T(5.78318596294678452118e+00)) * (x * x - T(3.04712623436620863991e+01)) * rp / rq; + } + + T pp = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + pp = pp * (T(25.0) / (x * x)) + PP[index]; + } + + T pq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + pq = pq * (T(25.0) / (x * x)) + PQ[index]; + } + + T qp = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + qp = qp * (T(25.0) / (x * x)) + QP[index]; + } + + T qq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + qq = qq * (T(25.0) / (x * x)) + QQ[index]; + } + + return (pp / pq * cos(x - T(0.785398163397448309615660845819875721)) - T(5.0) / x * (qp / qq) * sin(x - T(0.785398163397448309615660845819875721))) * T(0.797884560802865355879892119868763737) / sqrt(x); + } // bessel_j0_forward(T x) +); // bessel_j0_string + +const auto bessel_y0_string = bessel_j0_string + jiterator_stringify( + template + T bessel_y0_forward(T x) { + static const T PP[] = { + +7.96936729297347051624e-04, + +8.28352392107440799803e-02, + +1.23953371646414299388e+00, + +5.44725003058768775090e+00, + +8.74716500199817011941e+00, + +5.30324038235394892183e+00, + +9.99999999999999997821e-01, + }; + + static const T PQ[] = { + +9.24408810558863637013e-04, + +8.56288474354474431428e-02, + +1.25352743901058953537e+00, + +5.47097740330417105182e+00, + +8.76190883237069594232e+00, + +5.30605288235394617618e+00, + +1.00000000000000000218e+00, + }; + + static const T QP[] = { + -1.13663838898469149931e-02, + -1.28252718670509318512e+00, + -1.95539544257735972385e+01, + -9.32060152123768231369e+01, + -1.77681167980488050595e+02, + -1.47077505154951170175e+02, + -5.14105326766599330220e+01, + -6.05014350600728481186e+00, + }; + + static const T QQ[] = { + +6.43178256118178023184e+01, + +8.56430025976980587198e+02, + +3.88240183605401609683e+03, + +7.24046774195652478189e+03, + +5.93072701187316984827e+03, + +2.06209331660327847417e+03, + +2.42005740240291393179e+02, + }; + + static const T YP[] = { + +1.55924367855235737965e+04, + -1.46639295903971606143e+07, + +5.43526477051876500413e+09, + -9.82136065717911466409e+11, + +8.75906394395366999549e+13, + -3.46628303384729719441e+15, + +4.42733268572569800351e+16, + -1.84950800436986690637e+16, + }; + + static const T YQ[] = { + +1.04128353664259848412e+03, + +6.26107330137134956842e+05, + +2.68919633393814121987e+08, + +8.64002487103935000337e+10, + +2.02979612750105546709e+13, + +3.17157752842975028269e+15, + +2.50596256172653059228e+17, + }; + + if (x <= T(5.0)) { + if (x == T(0.0)) { + return NEG_INFINITY; + } + + if (x < T(0.0)) { + NAN; + } + + T yp = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + yp = yp * (x * x) + YP[index]; + } + + T yq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + yq = yq * (x * x) + YQ[index]; + } + + return yp / yq + (T(0.636619772367581343075535053490057448) * log(x) * bessel_j0_forward(x)); + } + + T pp = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + pp = pp * (T(25.0) / (x * x)) + PP[index]; + } + + T pq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + pq = pq * (T(25.0) / (x * x)) + PQ[index]; + } + + T qp = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + qp = qp * (T(25.0) / (x * x)) + QP[index]; + } + + T qq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + qq = qq * (T(25.0) / (x * x)) + QQ[index]; + } + + return (pp / pq * sin(x - T(0.785398163397448309615660845819875721)) + T(5.0) / x * (qp / qq) * cos(x - T(0.785398163397448309615660845819875721))) * T(0.797884560802865355879892119868763737) / sqrt(x); + } // bessel_y0_forward(T x) +); // bessel_y0_string + +const auto bessel_j1_string = jiterator_stringify( + template + T bessel_j1_forward(T x) { + static const T PP[] = { + +7.62125616208173112003e-04, + +7.31397056940917570436e-02, + +1.12719608129684925192e+00, + +5.11207951146807644818e+00, + +8.42404590141772420927e+00, + +5.21451598682361504063e+00, + +1.00000000000000000254e+00, + }; + + static const T PQ[] = { + +5.71323128072548699714e-04, + +6.88455908754495404082e-02, + +1.10514232634061696926e+00, + +5.07386386128601488557e+00, + +8.39985554327604159757e+00, + +5.20982848682361821619e+00, + +9.99999999999999997461e-01, + }; + + static const T QP[] = { + +5.10862594750176621635e-02, + +4.98213872951233449420e+00, + +7.58238284132545283818e+01, + +3.66779609360150777800e+02, + +7.10856304998926107277e+02, + +5.97489612400613639965e+02, + +2.11688757100572135698e+02, + +2.52070205858023719784e+01, + }; + + static const T QQ[] = { + +7.42373277035675149943e+01, + +1.05644886038262816351e+03, + +4.98641058337653607651e+03, + +9.56231892404756170795e+03, + +7.99704160447350683650e+03, + +2.82619278517639096600e+03, + +3.36093607810698293419e+02, + }; + + static const T RP[] = { + -8.99971225705559398224e+08, + +4.52228297998194034323e+11, + -7.27494245221818276015e+13, + +3.68295732863852883286e+15, + }; + + static const T RQ[] = { + +6.20836478118054335476e+02, + +2.56987256757748830383e+05, + +8.35146791431949253037e+07, + +2.21511595479792499675e+10, + +4.74914122079991414898e+12, + +7.84369607876235854894e+14, + +8.95222336184627338078e+16, + +5.32278620332680085395e+18, + }; + + if (x < T(0.0)) { + return -bessel_j1_forward(-x); + } + + if (x <= T(5.0)) { + T rp = 0.0; + + for (uint8_t index = 0; index <= 3; index++) { + rp = rp * (x * x) + RP[index]; + } + + T rq = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + rq = rq * (x * x) + RQ[index]; + } + + return rp / rq * x * (x * x - T(1.46819706421238932572e+01)) * (x * x - T(4.92184563216946036703e+01)); + } + + T pp = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + pp = pp * (T(5.0) / x * (T(5.0) / x)) + PP[index]; + } + + T pq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + pq = pq * (T(5.0) / x * (T(5.0) / x)) + PQ[index]; + } + + T qp = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + qp = qp * (T(5.0) / x * (T(5.0) / x)) + QP[index]; + } + + T qq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + qq = qq * (T(5.0) / x * (T(5.0) / x)) + QQ[index]; + } + + return (pp / pq * cos(x - T(2.356194490192344928846982537459627163)) - T(5.0) / x * (qp / qq) * sin(x - T(2.356194490192344928846982537459627163))) * T(0.797884560802865355879892119868763737) / sqrt(x); + } // bessel_j1_forward(T x) +); // bessel_j1_string + +const auto bessel_y1_string = bessel_j1_string + jiterator_stringify( + template + T bessel_y1_forward(T x) { + static const T PP[] = { + +7.62125616208173112003e-04, + +7.31397056940917570436e-02, + +1.12719608129684925192e+00, + +5.11207951146807644818e+00, + +8.42404590141772420927e+00, + +5.21451598682361504063e+00, + +1.00000000000000000254e+00, + }; + + static const T PQ[] = { + +5.71323128072548699714e-04, + +6.88455908754495404082e-02, + +1.10514232634061696926e+00, + +5.07386386128601488557e+00, + +8.39985554327604159757e+00, + +5.20982848682361821619e+00, + +9.99999999999999997461e-01, + }; + + static const T QP[] = { + +5.10862594750176621635e-02, + +4.98213872951233449420e+00, + +7.58238284132545283818e+01, + +3.66779609360150777800e+02, + +7.10856304998926107277e+02, + +5.97489612400613639965e+02, + +2.11688757100572135698e+02, + +2.52070205858023719784e+01, + }; + + static const T QQ[] = { + +7.42373277035675149943e+01, + +1.05644886038262816351e+03, + +4.98641058337653607651e+03, + +9.56231892404756170795e+03, + +7.99704160447350683650e+03, + +2.82619278517639096600e+03, + +3.36093607810698293419e+02, + }; + + static const T YP[] = { + +1.26320474790178026440e+09, + -6.47355876379160291031e+11, + +1.14509511541823727583e+14, + -8.12770255501325109621e+15, + +2.02439475713594898196e+17, + -7.78877196265950026825e+17, + }; + + static const T YQ[] = { + +5.94301592346128195359e+02, + +2.35564092943068577943e+05, + +7.34811944459721705660e+07, + +1.87601316108706159478e+10, + +3.88231277496238566008e+12, + +6.20557727146953693363e+14, + +6.87141087355300489866e+16, + +3.97270608116560655612e+18, + }; + + if (x <= T(5.0)) { + if (x == T(0.0)) { + return NEG_INFINITY; + } + + if (x <= T(0.0)) { + return NAN; + } + + T yp = 0.0; + + for (uint8_t index = 0; index <= 5; index++) { + yp = yp * (x * x) + YP[index]; + } + + T yq = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + yq = yq * (x * x) + YQ[index]; + } + + return x * (yp / yq) + (T(0.636619772367581343075535053490057448) * (bessel_j1_forward(x) * log(x) - T(1.0) / x)); + } + + T pp = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + pp = pp * (T(5.0) / x * (T(5.0) / x)) + PP[index]; + } + + T pq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + pq = pq * (T(5.0) / x * (T(5.0) / x)) + PQ[index]; + } + + T qp = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + qp = qp * (T(5.0) / x * (T(5.0) / x)) + QP[index]; + } + + T qq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + qq = qq * (T(5.0) / x * (T(5.0) / x)) + QQ[index]; + } + + return (pp / pq * sin(x - T(2.356194490192344928846982537459627163)) + T(5.0) / x * (qp / qq) * cos(x - T(2.356194490192344928846982537459627163))) * T(0.797884560802865355879892119868763737) / sqrt(x); + } // bessel_y1_forward(T x) +); // bessel_y1_string + +const auto chebyshev_polynomial_t_string = jiterator_stringify( + template + T chebyshev_polynomial_t_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (abs(x) == T(1.0)) { + if (x > T(0.0) || n % 2 == 0) { + return T(1.0); + } + + return T(-1.0); + } + + if ((n > 6) && (abs(x) < T(1.0))) { + return cos(n * acos(x)); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x; + } + + T p = T(1.0); + T q = x; + T r; + + for (int64_t k = 2; k <= n; k++) { + r = (x + x) * q - p; + p = q; + q = r; + } + + return r; + } // chebyshev_polynomial_t_forward(T x, int64_t n) + + template + T chebyshev_polynomial_t_forward(T x, T n) { + return chebyshev_polynomial_t_forward(x, static_cast(n)); + } // chebyshev_polynomial_t_forward(T x, T n) +); // chebyshev_polynomial_t_string + +const auto chebyshev_polynomial_u_string = jiterator_stringify( + template + T chebyshev_polynomial_u_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (abs(x) == T(1.0)) { + if (x > T(0.0) || n % 2 == 0) { + return n + 1; + } + + return -(n + 1); + } + + if ((n > 8) && (abs(x) < T(1.0))) { + if (sin(acos(x)) != T(0.0)) { + return sin((n + 1) * acos(x)) / sin(acos(x)); + } + + return (n + 1) * cos((n + 1) * acos(x)) / x; + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x + x; + } + + T p = T(1.0); + T q = x + x; + T r; + + for (int64_t k = 2; k <= n; k++) { + r = (x + x) * q - p; + p = q; + q = r; + } + + return r; + } // chebyshev_polynomial_u_forward(T x, int64_t n) + + template + T chebyshev_polynomial_u_forward(T x, T n) { + return chebyshev_polynomial_u_forward(x, static_cast(n)); + } // chebyshev_polynomial_u_forward(T x, T n) +); // chebyshev_polynomial_u_string + +const auto chebyshev_polynomial_v_string = jiterator_stringify( + template + T chebyshev_polynomial_v_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (abs(x) == T(1.0)) { + if (x > T(0.0)) { + return T(1.0); + } + + if (n % 2 == 0) { + return n + n + 1; + } + + return -(n + n + 1); + } + + if ((n > 8) && (abs(x) < T(1.0))) { + if (sin(acos(x) / T(2.0)) != T(1.0)) { + return cos((n + T(0.5)) * acos(x)) / cos(acos(x) / T(2.0)); + } + + if (n % 2 == 0) { + return n + n + 1; + } + + return -(n + n + 1); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x + x - T(1.0); + } + + T p = T(1.0); + T q = x + x - T(1.0); + T r; + + for (int64_t k = 2; k <= n; k++) { + r = (x + x) * q - p; + p = q; + q = r; + } + + return r; + } // chebyshev_polynomial_v_forward(T x, int64_t n) + + template + T chebyshev_polynomial_v_forward(T x, T n) { + return chebyshev_polynomial_v_forward(x, static_cast(n)); + } // chebyshev_polynomial_v_forward(T x, T n) +); // chebyshev_polynomial_v_string + +const auto chebyshev_polynomial_w_string = jiterator_stringify( + template + T chebyshev_polynomial_w_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (abs(x) == T(1.0)) { + if (x > T(0.0)) { + return n + n + 1; + } + + if (n % 2 == 0) { + return T(1.0); + } + + return T(-1.0); + } + + if ((n > 8) && (abs(x) < T(1.0))) { + if (cos(acos(x) / T(2.0)) != T(1.0)) { + return sin((n + T(0.5)) * acos(x)) / sin(acos(x) / T(2.0)); + } + + if (x > T(0.0)) { + return n + n + 1; + } + + if (n % 2 == 0) { + return T(1.0); + } + + return T(-1.0); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x + x + T(1.0); + } + + T p = T(1.0); + T q = x + x + T(1.0); + T r; + + for (int64_t k = 2; k <= n; k++) { + r = (x + x) * q - p; + p = q; + q = r; + } + + return r; + } // chebyshev_polynomial_w_forward(T x, int64_t n) + + template + T chebyshev_polynomial_w_forward(T x, T n) { + return chebyshev_polynomial_w_forward(x, static_cast(n)); + } // chebyshev_polynomial_w_forward(T x, T n) +); // chebyshev_polynomial_w_string + +const auto hermite_polynomial_h_string = jiterator_stringify( + template + T hermite_polynomial_h_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x + x; + } + + T p = T(1.0); + T q = x + x; + T r = T(0.0); + + for (int64_t k = 2; k < n + n; k += 2) { + r = (x + x) * q - k * p; + p = q; + q = r; + } + + return r; + } // hermite_polynomial_h_forward(T x, int64_t n) + + template + T hermite_polynomial_h_forward(T x, T n) { + return hermite_polynomial_h_forward(x, static_cast(n)); + } // hermite_polynomial_h_forward(T x, T n) +); // hermite_polynomial_h_string + +const auto hermite_polynomial_he_string = jiterator_stringify( + template + T hermite_polynomial_he_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x; + } + + T p = T(1.0); + T q = x; + T r; + + for (int64_t k = 1; k < n; k++) { + r = x * q - k * p; + p = q; + q = r; + } + + return r; + } // hermite_polynomial_he_forward(T x, int64_t n) + + template + T hermite_polynomial_he_forward(T x, T n) { + return hermite_polynomial_he_forward(x, static_cast(n)); + } // hermite_polynomial_he_forward(T x, T n) +); // hermite_polynomial_he_string + +const auto laguerre_polynomial_l_string = jiterator_stringify( + template + T laguerre_polynomial_l_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (abs(x) == T(0.0)) { + return T(1.0); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return T(1.0) - x; + } + + T p = T(1.0); + T q = T(1.0) - x; + T r; + + for (int64_t k = 1; k < n; k++) { + r = (((k + k) + (T(1.0) - x)) * q - k * p) / (k + 1); + p = q; + q = r; + } + + return r; + } // laguerre_polynomial_l_forward(T x, int64_t n) + + template + T laguerre_polynomial_l_forward(T x, T n) { + return laguerre_polynomial_l_forward(x, static_cast(n)); + } // laguerre_polynomial_l_forward(T x, T n) +); // laguerre_polynomial_l_string + +const auto legendre_polynomial_p_string = jiterator_stringify( + template + T legendre_polynomial_p_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (abs(x) == T(1.0)) { + if (x > T(0.0) || n % 2 == 0) { + return T(1.0); + } + + return T(-1.0); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x; + } + + T p = T(1.0); + T q = x; + T r; + + for (int64_t k = 1; k < n; k++) { + r = ((k + k + 1) * x * q - k * p) / (k + 1); + p = q; + q = r; + } + + return r; + } // legendre_polynomial_p_forward(T x, int64_t n) + + template + T legendre_polynomial_p_forward(T x, T n) { + return legendre_polynomial_p_forward(x, static_cast(n)); + } // legendre_polynomial_p_forward(T x, T n) +); // legendre_polynomial_p_string + +const auto modified_bessel_i0_string = jiterator_stringify( + template + T modified_bessel_i0_forward(T x) { + static const T A[] = { + -4.41534164647933937950e-18, + +3.33079451882223809783e-17, + -2.43127984654795469359e-16, + +1.71539128555513303061e-15, + -1.16853328779934516808e-14, + +7.67618549860493561688e-14, + -4.85644678311192946090e-13, + +2.95505266312963983461e-12, + -1.72682629144155570723e-11, + +9.67580903537323691224e-11, + -5.18979560163526290666e-10, + +2.65982372468238665035e-09, + -1.30002500998624804212e-08, + +6.04699502254191894932e-08, + -2.67079385394061173391e-07, + +1.11738753912010371815e-06, + -4.41673835845875056359e-06, + +1.64484480707288970893e-05, + -5.75419501008210370398e-05, + +1.88502885095841655729e-04, + -5.76375574538582365885e-04, + +1.63947561694133579842e-03, + -4.32430999505057594430e-03, + +1.05464603945949983183e-02, + -2.37374148058994688156e-02, + +4.93052842396707084878e-02, + -9.49010970480476444210e-02, + +1.71620901522208775349e-01, + -3.04682672343198398683e-01, + +6.76795274409476084995e-01, + }; + + static const T B[] = { + -7.23318048787475395456e-18, + -4.83050448594418207126e-18, + +4.46562142029675999901e-17, + +3.46122286769746109310e-17, + -2.82762398051658348494e-16, + -3.42548561967721913462e-16, + +1.77256013305652638360e-15, + +3.81168066935262242075e-15, + -9.55484669882830764870e-15, + -4.15056934728722208663e-14, + +1.54008621752140982691e-14, + +3.85277838274214270114e-13, + +7.18012445138366623367e-13, + -1.79417853150680611778e-12, + -1.32158118404477131188e-11, + -3.14991652796324136454e-11, + +1.18891471078464383424e-11, + +4.94060238822496958910e-10, + +3.39623202570838634515e-09, + +2.26666899049817806459e-08, + +2.04891858946906374183e-07, + +2.89137052083475648297e-06, + +6.88975834691682398426e-05, + +3.36911647825569408990e-03, + +8.04490411014108831608e-01, + }; + + T p; + T q = 0.0; + + if (abs(x) <= T(8.0)) { + T a = A[0]; + + for (uint8_t index = 1; index < 30; index++) { + p = q; + q = a; + a = ((abs(x) / T(2.0)) - T(2.0)) * q - p + A[index]; + } + + return exp(abs(x)) * (T(0.5) * (a - p)); + } + + T b = B[0]; + + for (uint8_t index = 1; index < 25; index++) { + p = q; + q = b; + b = (T(32.0) / abs(x) - T(2.0)) * q - p + B[index]; + } + + return exp(abs(x)) * (T(0.5) * (b - p)) / sqrt(abs(x)); + } // modified_bessel_i0_forward(T x) +); // modified_bessel_i0_string + +const auto modified_bessel_i1_string = jiterator_stringify( + template + T modified_bessel_i1_forward(T x) { + static const T A[] = { + +2.77791411276104639959e-18, + -2.11142121435816608115e-17, + +1.55363195773620046921e-16, + -1.10559694773538630805e-15, + +7.60068429473540693410e-15, + -5.04218550472791168711e-14, + +3.22379336594557470981e-13, + -1.98397439776494371520e-12, + +1.17361862988909016308e-11, + -6.66348972350202774223e-11, + +3.62559028155211703701e-10, + -1.88724975172282928790e-09, + +9.38153738649577178388e-09, + -4.44505912879632808065e-08, + +2.00329475355213526229e-07, + -8.56872026469545474066e-07, + +3.47025130813767847674e-06, + -1.32731636560394358279e-05, + +4.78156510755005422638e-05, + -1.61760815825896745588e-04, + +5.12285956168575772895e-04, + -1.51357245063125314899e-03, + +4.15642294431288815669e-03, + -1.05640848946261981558e-02, + +2.47264490306265168283e-02, + -5.29459812080949914269e-02, + +1.02643658689847095384e-01, + -1.76416518357834055153e-01, + +2.52587186443633654823e-01, + }; + + static const T B[] = { + +7.51729631084210481353e-18, + +4.41434832307170791151e-18, + -4.65030536848935832153e-17, + -3.20952592199342395980e-17, + +2.96262899764595013876e-16, + +3.30820231092092828324e-16, + -1.88035477551078244854e-15, + -3.81440307243700780478e-15, + +1.04202769841288027642e-14, + +4.27244001671195135429e-14, + -2.10154184277266431302e-14, + -4.08355111109219731823e-13, + -7.19855177624590851209e-13, + +2.03562854414708950722e-12, + +1.41258074366137813316e-11, + +3.25260358301548823856e-11, + -1.89749581235054123450e-11, + -5.58974346219658380687e-10, + -3.83538038596423702205e-09, + -2.63146884688951950684e-08, + -2.51223623787020892529e-07, + -3.88256480887769039346e-06, + -1.10588938762623716291e-04, + -9.76109749136146840777e-03, + +7.78576235018280120474e-01, + }; + + T p; + T q = 0.0; + + if (abs(x) <= T(8.0)) { + T a = A[0]; + + for (uint8_t index = 1; index < 29; index++) { + p = q; + q = a; + a = ((abs(x) / T(2.0)) - T(2.0)) * q - p + A[index]; + } + + if (x < T(0.0)) { + return -(T(0.5) * (a - p) * abs(x) * exp(abs(x))); + } + + return T(0.5) * (a - p) * abs(x) * exp(abs(x)); + } + + T b = B[0]; + + for (uint8_t index = 1; index < 25; index++) { + p = q; + q = b; + b = (T(32.0) / abs(x) - T(2.0)) * q - p + B[index]; + } + + if (x < T(0.0)) { + return -(exp(abs(x)) * (T(0.5) * (b - p)) / sqrt(abs(x))); + } + + return exp(abs(x)) * (T(0.5) * (b - p)) / sqrt(abs(x)); + } // modified_bessel_i1_forward(T x) +); // modified_bessel_i1_string + +const auto modified_bessel_k0_string = modified_bessel_i0_string + jiterator_stringify( + template + T modified_bessel_k0_forward(T x) { + static const T A[] = { + +1.37446543561352307156e-16, + +4.25981614279661018399e-14, + +1.03496952576338420167e-11, + +1.90451637722020886025e-09, + +2.53479107902614945675e-07, + +2.28621210311945178607e-05, + +1.26461541144692592338e-03, + +3.59799365153615016266e-02, + +3.44289899924628486886e-01, + -5.35327393233902768720e-01, + }; + + static const T B[] = { + +5.30043377268626276149e-18, + -1.64758043015242134646e-17, + +5.21039150503902756861e-17, + -1.67823109680541210385e-16, + +5.51205597852431940784e-16, + -1.84859337734377901440e-15, + +6.34007647740507060557e-15, + -2.22751332699166985548e-14, + +8.03289077536357521100e-14, + -2.98009692317273043925e-13, + +1.14034058820847496303e-12, + -4.51459788337394416547e-12, + +1.85594911495471785253e-11, + -7.95748924447710747776e-11, + +3.57739728140030116597e-10, + -1.69753450938905987466e-09, + +8.57403401741422608519e-09, + -4.66048989768794782956e-08, + +2.76681363944501510342e-07, + -1.83175552271911948767e-06, + +1.39498137188764993662e-05, + -1.28495495816278026384e-04, + +1.56988388573005337491e-03, + -3.14481013119645005427e-02, + +2.44030308206595545468e+00, + }; + + if (x == T(0.0)) { + return INFINITY; + } + + if (x < T(0.0)) { + return NAN; + } + + T p; + T q = 0.0; + + if (x <= T(2.0)) { + T a = A[0]; + + for (uint8_t index = 1; index < 10; index++) { + p = q; + q = a; + a = (x * x - T(2.0)) * q - p + A[index]; + } + + return T(0.5) * (a - p) - log(0.5 * x) * modified_bessel_i0_forward(x); + } + + T b = B[0]; + + for (uint8_t index = 1; index < 25; index++) { + p = q; + q = b; + b = (T(8.0) / x - T(2.0)) * q - p + B[index]; + } + + return exp(-x) * (T(0.5) * (b - p)) / sqrt(x); + } // modified_bessel_k0_forward(T x) +); // modified_bessel_k0_string + +const auto scaled_modified_bessel_k0_string = modified_bessel_i0_string + jiterator_stringify( + template + T scaled_modified_bessel_k0_forward(T x) { + static const T A[] = { + +1.37446543561352307156e-16, + +4.25981614279661018399e-14, + +1.03496952576338420167e-11, + +1.90451637722020886025e-09, + +2.53479107902614945675e-07, + +2.28621210311945178607e-05, + +1.26461541144692592338e-03, + +3.59799365153615016266e-02, + +3.44289899924628486886e-01, + -5.35327393233902768720e-01, + }; + + static const T B[] = { + +5.30043377268626276149e-18, + -1.64758043015242134646e-17, + +5.21039150503902756861e-17, + -1.67823109680541210385e-16, + +5.51205597852431940784e-16, + -1.84859337734377901440e-15, + +6.34007647740507060557e-15, + -2.22751332699166985548e-14, + +8.03289077536357521100e-14, + -2.98009692317273043925e-13, + +1.14034058820847496303e-12, + -4.51459788337394416547e-12, + +1.85594911495471785253e-11, + -7.95748924447710747776e-11, + +3.57739728140030116597e-10, + -1.69753450938905987466e-09, + +8.57403401741422608519e-09, + -4.66048989768794782956e-08, + +2.76681363944501510342e-07, + -1.83175552271911948767e-06, + +1.39498137188764993662e-05, + -1.28495495816278026384e-04, + +1.56988388573005337491e-03, + -3.14481013119645005427e-02, + +2.44030308206595545468e+00, + }; + + if (x == T(0.0)) { + return INFINITY; + } + + if (x < T(0.0)) { + return NAN; + } + + T p; + T q = 0.0; + + if (x <= T(2.0)) { + T a = A[0]; + + for (uint8_t index = 1; index < 10; index++) { + p = q; + q = a; + a = (x * x - T(2.0)) * q - p + A[index]; + } + + return (T(0.5) * (a - p) - log(T(0.5) * x) * modified_bessel_i0_forward(x)) * exp(x); + } + + T b = B[0]; + + for (uint8_t index = 1; index < 25; index++) { + p = q; + q = b; + b = (T(8.0) / x - T(2.0)) * q - p + B[index]; + } + + return T(0.5) * (b - p) / sqrt(x); + } // T scaled_modified_bessel_k0_forward(T x) +); // scaled_modified_bessel_k0_string + +const auto modified_bessel_k1_string = modified_bessel_i1_string + jiterator_stringify( + template + T modified_bessel_k1_forward(T x) { + static const T A[] = { + -7.02386347938628759343e-18, + -2.42744985051936593393e-15, + -6.66690169419932900609e-13, + -1.41148839263352776110e-10, + -2.21338763073472585583e-08, + -2.43340614156596823496e-06, + -1.73028895751305206302e-04, + -6.97572385963986435018e-03, + -1.22611180822657148235e-01, + -3.53155960776544875667e-01, + +1.52530022733894777053e+00, + }; + + static const T B[] = { + -5.75674448366501715755e-18, + +1.79405087314755922667e-17, + -5.68946255844285935196e-17, + +1.83809354436663880070e-16, + -6.05704724837331885336e-16, + +2.03870316562433424052e-15, + -7.01983709041831346144e-15, + +2.47715442448130437068e-14, + -8.97670518232499435011e-14, + +3.34841966607842919884e-13, + -1.28917396095102890680e-12, + +5.13963967348173025100e-12, + -2.12996783842756842877e-11, + +9.21831518760500529508e-11, + -4.19035475934189648750e-10, + +2.01504975519703286596e-09, + -1.03457624656780970260e-08, + +5.74108412545004946722e-08, + -3.50196060308781257119e-07, + +2.40648494783721712015e-06, + -1.93619797416608296024e-05, + +1.95215518471351631108e-04, + -2.85781685962277938680e-03, + +1.03923736576817238437e-01, + +2.72062619048444266945e+00, + }; + + if (x == T(0.0)) { + return INFINITY; + } + + if (x < T(0.0)) { + return NAN; + } + + T p; + T q = 0.0; + + if (x <= T(2.0)) { + T a = A[0]; + + for (uint8_t index = 1; index < 11; index++) { + p = q; + q = a; + a = (x * x - T(2.0)) * q - p + A[index]; + } + + return log(T(0.5) * x) * modified_bessel_i1_forward(x) + T(0.5) * (a - p) / x; + } + + T b = B[0]; + + for (uint8_t index = 1; index < 25; index++) { + p = q; + q = b; + b = (T(8.0) / x - T(2.0)) * q - p + B[index]; + } + + return exp(-x) * (T(0.5) * (b - p)) / sqrt(x); + } // modified_bessel_k1_forward(T x) +); // modified_bessel_k1_string + +const auto scaled_modified_bessel_k1_string = modified_bessel_i1_string + jiterator_stringify( + template + T scaled_modified_bessel_k1_forward(T x) { + static const T A[] = { + -7.02386347938628759343e-18, + -2.42744985051936593393e-15, + -6.66690169419932900609e-13, + -1.41148839263352776110e-10, + -2.21338763073472585583e-08, + -2.43340614156596823496e-06, + -1.73028895751305206302e-04, + -6.97572385963986435018e-03, + -1.22611180822657148235e-01, + -3.53155960776544875667e-01, + +1.52530022733894777053e+00, + }; + + static const T B[] = { + -5.75674448366501715755e-18, + +1.79405087314755922667e-17, + -5.68946255844285935196e-17, + +1.83809354436663880070e-16, + -6.05704724837331885336e-16, + +2.03870316562433424052e-15, + -7.01983709041831346144e-15, + +2.47715442448130437068e-14, + -8.97670518232499435011e-14, + +3.34841966607842919884e-13, + -1.28917396095102890680e-12, + +5.13963967348173025100e-12, + -2.12996783842756842877e-11, + +9.21831518760500529508e-11, + -4.19035475934189648750e-10, + +2.01504975519703286596e-09, + -1.03457624656780970260e-08, + +5.74108412545004946722e-08, + -3.50196060308781257119e-07, + +2.40648494783721712015e-06, + -1.93619797416608296024e-05, + +1.95215518471351631108e-04, + -2.85781685962277938680e-03, + +1.03923736576817238437e-01, + +2.72062619048444266945e+00, + }; + + if (x == T(0.0)) { + return INFINITY; + } + + if (x < T(0.0)) { + return NAN; + } + + T p; + T q = 0.0; + + if (x <= T(2.0)) { + T a = A[0]; + + for (uint8_t index = 1; index < 11; index++) { + p = q; + q = a; + a = (x * x - T(2.0)) * q - p + A[index]; + } + + return (log(T(0.5) * x) * modified_bessel_i1_forward(x) + T(0.5) * (a - p) / x) * exp(x); + } + + T b = B[0]; + + for (uint8_t index = 1; index < 25; index++) { + p = q; + q = b; + b = (T(8.0) / x - T(2.0)) * q - p + B[index]; + } + + return (T(0.5) * (b - p) / sqrt(x)); + } // T scaled_modified_bessel_k1_forward(T x) +); // scaled_modified_bessel_k1_string + +const auto shifted_chebyshev_polynomial_t_string = jiterator_stringify( + template + T shifted_chebyshev_polynomial_t_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (x == T(1.0)) { + return T(1.0); + } + + if (x == T(0.0)) { + if (n % 2 == 0) { + return T(1.0); + } + + return T(-1.0); + } + + if ((n > 6) && (abs(x + x - T(1.0)) < T(1.0))) { + return cos(n * acos(x + x - T(1.0))); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x + x - T(1.0); + } + + T p = T(1.0); + T q = x + x - T(1.0); + T r; + + for (int64_t k = 2; k <= n; k++) { + r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p; + p = q; + q = r; + } + + return r; + } // shifted_chebyshev_polynomial_t_forward(T x, int64_t n) + + template + T shifted_chebyshev_polynomial_t_forward(T x, T n) { + return shifted_chebyshev_polynomial_t_forward(x, static_cast(n)); + } // shifted_chebyshev_polynomial_t_forward(T x, T n) +); // shifted_chebyshev_polynomial_t_string + +const auto shifted_chebyshev_polynomial_u_string = jiterator_stringify( + template + T shifted_chebyshev_polynomial_u_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (x == T(1.0)) { + return n + 1; + } + + if (x == T(0.0)) { + if (n % 2 == 0) { + return n + 1; + } + + return -(n + 1); + } + + if ((n > 6) && (abs(x + x - T(1.0)) < T(1.0))) { + if (sin(acos(x + x - T(1.0))) != T(0.0)) { + return sin((n + 1) * acos(x + x - T(1.0))) / sin(acos(x + x - T(1.0))); + } + + return (n + 1) * cos((n + 1) * acos(x + x - T(1.0))) / (x + x - T(1.0)); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x + x - T(1.0) + (x + x - T(1.0)); + } + + T p = T(1.0); + T q = x + x - T(1.0) + (x + x - T(1.0)); + T r; + + for (int64_t k = 2; k <= n; k++) { + r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p; + p = q; + q = r; + } + + return r; + } // shifted_chebyshev_polynomial_u_forward(T x, int64_t n) + + template + T shifted_chebyshev_polynomial_u_forward(T x, T n) { + return shifted_chebyshev_polynomial_u_forward(x, static_cast(n)); + } // shifted_chebyshev_polynomial_u_forward(T x, T n) +); // shifted_chebyshev_polynomial_u_string + +const auto shifted_chebyshev_polynomial_v_string = jiterator_stringify( + template + T shifted_chebyshev_polynomial_v_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (x == T(1.0)) { + return T(1.0); + } + + if (x == T(0.0)) { + if (n % 2 == 0) { + return (n + n + 1); + } + + return -(n + n + 1); + } + + if ((n > 6) && (abs(x + x - T(1.0)) < T(1.0))) { + if (sin(acos(x + x - T(1.0)) / T(2.0)) != T(1.0)) { + return cos(((n) + T(0.5)) * acos(x + x - T(1.0))) / cos(acos(x + x - T(1.0)) / T(2.0)); + } + + if (n % 2 == 0) { + return n + n + 1; + } + + return -(n + n + 1); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x + x - T(1.0) + (x + x - T(1.0)) - T(1.0); + } + + T p = T(1.0); + T q = x + x - T(1.0) + (x + x - T(1.0)) - T(1.0); + T r; + + for (int64_t k = 2; k <= n; k++) { + r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p; + p = q; + q = r; + } + + return r; + } // shifted_chebyshev_polynomial_v_forward(T x, int64_t n) + + template + T shifted_chebyshev_polynomial_v_forward(T x, T n) { + return shifted_chebyshev_polynomial_v_forward(x, static_cast(n)); + } // shifted_chebyshev_polynomial_v_forward(T x, T n) +); // shifted_chebyshev_polynomial_v_string + +const auto shifted_chebyshev_polynomial_w_string = jiterator_stringify( + template + T shifted_chebyshev_polynomial_w_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (x == T(1.0)) { + return n + n + 1; + } + + if (x == T(0.0)) { + if (n % 2 == 0) { + return T(1.0); + } + + return T(-1.0); + } + + if ((n > 4) && (abs(x + x - T(1.0)) < T(1.0))) { + if (cos(acos(x + x - T(1.0)) / T(2.0)) != T(1.0)) { + return sin((n + T(0.5)) * acos(x + x - T(1.0))) / sin(acos(x + x - T(1.0)) / T(2.0)); + } + + if (n % 2 == 0) { + return T(1.0); + } + + return T(-1.0); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x + x - T(1.0) + (x + x - T(1.0)) + T(1.0); + } + + T p = T(1.0); + T q = x + x - T(1.0) + (x + x - T(1.0)) + T(1.0); + T r; + + for (int64_t k = 2; k <= n; k++) { + r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p; + p = q; + q = r; + } + + return r; + } // shifted_chebyshev_polynomial_w_forward(T x, int64_t n) + + template + T shifted_chebyshev_polynomial_w_forward(T x, T n) { + return shifted_chebyshev_polynomial_w_forward(x, static_cast(n)); + } // shifted_chebyshev_polynomial_w_forward(T x, T n) +); // shifted_chebyshev_polynomial_w_string + +const auto spherical_bessel_j0_string = jiterator_stringify( + template + T spherical_bessel_j0_forward(T x) { + if (isinf(x)) { + return T(0.0); + } + + if (abs(x) < T(0.5)) { + return T(1.0) + x * x * (T(-1.0) / T(6.0) + x * x * (T(1.0) / T(120.0) + x * x * (T(-1.0) / T(5040.0) + x * x * (T(1.0) / T(362880.0) + x * x * (T(-1.0) / T(39916800.0) + x * x * (T(1.0) / T(6227020800.0))))))); + } + + return sin(x) / x; + } // T spherical_bessel_j0_forward(T x) +); // spherical_bessel_j0_string + +#else // !AT_USE_JITERATOR() -- kernels must be precompiled + +template +static inline C10_HOST_DEVICE scalar_t calc_gcd(scalar_t a_in, scalar_t b_in) { + scalar_t a = ::abs(a_in); + scalar_t b = ::abs(b_in); + while (a != 0) { + scalar_t c = a; + a = b % a; + b = c; + } + return b; +} + +/* + * For licensing information, please refer to the cpu implementation located in "ATen/native/Math.h". + */ +template +static inline C10_HOST_DEVICE scalar_t calc_digamma(scalar_t in) { + // [C++ Standard Reference: Gamma Function] https://en.cppreference.com/w/cpp/numeric/math/tgamma + using accscalar_t = at::acc_type; + static const double PI_f64 = 3.14159265358979323846; + const accscalar_t PSI_10 = 2.25175258906672110764; + const accscalar_t A[] = { + 8.33333333333333333333E-2, + -2.10927960927960927961E-2, + 7.57575757575757575758E-3, + -4.16666666666666666667E-3, + 3.96825396825396825397E-3, + -8.33333333333333333333E-3, + 8.33333333333333333333E-2, + }; + + accscalar_t x = static_cast(in); + if (x == 0) { + // As per C++ standard for gamma related functions and SciPy, + // If the argument is ±0, ±∞ is returned + return std::copysign(static_cast(INFINITY), -x); + } + + bool x_is_integer = x == ::trunc(x); + accscalar_t result = 0; + if (x < 0) { + if (x_is_integer) { + // As per C++ standard for gamma related functions and SciPy, + // If the argument is a negative integer, NaN is returned + return static_cast(NAN); + } + // Extracts the fractional part of x as r, since tan(pi * r) is more numerically + // accurate than tan(pi * x). While these operations are mathematically equivalent + // since both x and r are in radians and tan() has a periodicity of pi, in practice + // the computation of pi * x is a source of error (when |x| > 1). + double q, r; + r = ::modf(static_cast(x), &q); + result = static_cast(- PI_f64 / ::tan(PI_f64 * r)); + x = 1 - x; + } + + while (x < 10) { + result -= 1 / x; + x += 1; + } + if (x == 10) { + return static_cast(result + PSI_10); + } + + accscalar_t y = 0; + if (x < 1.0e17) { + accscalar_t z = 1 / (x * x); + + accscalar_t polevl_result = 0; + for (int i = 0; i <= 6; i++) { + polevl_result = polevl_result * z + A[i]; + } + y = z * polevl_result; + } + + return static_cast(::log(x) - (static_cast(0.5) / x) - y + result); +} + +template +static inline C10_HOST_DEVICE scalar_t calc_trigamma(scalar_t in) { + using accscalar_t = at::acc_type; + const accscalar_t PI = 3.14159265358979323846; + accscalar_t x = static_cast(in); + accscalar_t sign = +1; + accscalar_t result = 0; + if (x < 0.5f) { + sign = -1; + accscalar_t sin_pi_x = ::sin(PI * x); + result -= (PI * PI) / (sin_pi_x * sin_pi_x); + x = 1 - x; + } + for (int i = 0; i < 6; ++i) { + result += 1 / (x * x); + x += 1; + } + const accscalar_t one = static_cast(1); + const accscalar_t ixx = 1 / (x*x); + result += (1 + 1 / (2*x) + ixx * (one/6 - ixx * (one/30 - ixx * (one/42)))) / x; + return static_cast(sign * result); +} + +/* + * For licensing information and documentation, please refer to the cpu implementation located in "ATen/native/Math.h". + */ +template +static inline C10_HOST_DEVICE scalar_t +chbevl(scalar_t _x, const scalar_t array[], size_t len) { + static_assert(!std::is_same() && !std::is_same(), "don't instantiate with low precision type"); + + scalar_t b0, b1, b2; + + b0 = array[0]; + b1 = 0; + + for (size_t i = 1; i < len; ++i) { + b2 = b1; + b1 = b0; + b0 = _x * b1 - b2 + array[i]; + } + + return (0.5 * (b0 - b2)); +} + +/* + * For licensing information and documentation, please refer to the cpu implementation located in "ATen/native/Math.h". + */ +template +C10_HOST_DEVICE inline std::tuple chebyshev_coefficients_i0e_A() { + /* Chebyshev coefficients for exp(-x) I0(x) + * in the interval [0,8]. + * + * lim(x->0){ exp(-x) I0(x) } = 1. + */ + static const T coefficients[] = { + -4.41534164647933937950E-18, 3.33079451882223809783E-17, + -2.43127984654795469359E-16, 1.71539128555513303061E-15, + -1.16853328779934516808E-14, 7.67618549860493561688E-14, + -4.85644678311192946090E-13, 2.95505266312963983461E-12, + -1.72682629144155570723E-11, 9.67580903537323691224E-11, + -5.18979560163526290666E-10, 2.65982372468238665035E-9, + -1.30002500998624804212E-8, 6.04699502254191894932E-8, + -2.67079385394061173391E-7, 1.11738753912010371815E-6, + -4.41673835845875056359E-6, 1.64484480707288970893E-5, + -5.75419501008210370398E-5, 1.88502885095841655729E-4, + -5.76375574538582365885E-4, 1.63947561694133579842E-3, + -4.32430999505057594430E-3, 1.05464603945949983183E-2, + -2.37374148058994688156E-2, 4.93052842396707084878E-2, + -9.49010970480476444210E-2, 1.71620901522208775349E-1, + -3.04682672343198398683E-1, 6.76795274409476084995E-1}; + + return std::make_tuple(coefficients, 30); +} + +template +C10_HOST_DEVICE inline std::tuple chebyshev_coefficients_i0e_B() { + /* Chebyshev coefficients for exp(-x) sqrt(x) I0(x) + * in the inverted interval [8,infinity]. + * + * lim(x->inf){ exp(-x) sqrt(x) I0(x) } = 1/sqrt(2pi). + */ + static const T coefficients[] = { + -7.23318048787475395456E-18, -4.83050448594418207126E-18, + 4.46562142029675999901E-17, 3.46122286769746109310E-17, + -2.82762398051658348494E-16, -3.42548561967721913462E-16, + 1.77256013305652638360E-15, 3.81168066935262242075E-15, + -9.55484669882830764870E-15, -4.15056934728722208663E-14, + 1.54008621752140982691E-14, 3.85277838274214270114E-13, + 7.18012445138366623367E-13, -1.79417853150680611778E-12, + -1.32158118404477131188E-11, -3.14991652796324136454E-11, + 1.18891471078464383424E-11, 4.94060238822496958910E-10, + 3.39623202570838634515E-9, 2.26666899049817806459E-8, + 2.04891858946906374183E-7, 2.89137052083475648297E-6, + 6.88975834691682398426E-5, 3.36911647825569408990E-3, + 8.04490411014108831608E-1}; + + return std::make_tuple(coefficients, 25); +} + +template +static inline C10_HOST_DEVICE scalar_t calc_i0(scalar_t _x) { + static_assert(!std::is_same() && !std::is_same(), "don't instantiate with low precision type"); + // Upcast input for numerical accuracy purposes + // Needed for accurate results if input is bfloat16 or float16 + scalar_t x = ::abs(_x); + + if (x <= scalar_t{8.0}) { + auto coeff_pair = chebyshev_coefficients_i0e_A(); + auto A = std::get<0>(coeff_pair); + auto len = std::get<1>(coeff_pair); + scalar_t y = (x / scalar_t{2.0}) - scalar_t{2.0}; + return (::exp(x) * chbevl(y, A, len)); + } + + auto coeff_pair = chebyshev_coefficients_i0e_B(); + auto B = std::get<0>(coeff_pair); + auto len = std::get<1>(coeff_pair); + return (::exp(x) * chbevl(scalar_t{32.0} / x - scalar_t{2.0}, B, len) / ::sqrt(x)); +} + +template +C10_HOST_DEVICE inline + typename std::enable_if::value, std::tuple>::type + chebyshev_coefficients_i1e_A() { + /* Chebyshev coefficients for exp(-x) I1(x) + * in the interval [0,8]. + * + * lim(x->0){ exp(-x) I1(x) / x } = 1/2. + */ + static const T coefficients[] = { + 2.77791411276104639959E-18, -2.11142121435816608115E-17, + 1.55363195773620046921E-16, -1.10559694773538630805E-15, + 7.60068429473540693410E-15, -5.04218550472791168711E-14, + 3.22379336594557470981E-13, -1.98397439776494371520E-12, + 1.17361862988909016308E-11, -6.66348972350202774223E-11, + 3.62559028155211703701E-10, -1.88724975172282928790E-9, + 9.38153738649577178388E-9, -4.44505912879632808065E-8, + 2.00329475355213526229E-7, -8.56872026469545474066E-7, + 3.47025130813767847674E-6, -1.32731636560394358279E-5, + 4.78156510755005422638E-5, -1.61760815825896745588E-4, + 5.12285956168575772895E-4, -1.51357245063125314899E-3, + 4.15642294431288815669E-3, -1.05640848946261981558E-2, + 2.47264490306265168283E-2, -5.29459812080949914269E-2, + 1.02643658689847095384E-1, -1.76416518357834055153E-1, + 2.52587186443633654823E-1}; + + return std::make_tuple(coefficients, 29); +} + +template +C10_HOST_DEVICE inline + typename std::enable_if::value, std::tuple>::type + chebyshev_coefficients_i1e_A() { + /* Chebyshev coefficients for exp(-x) I1(x) + * in the interval [0,8]. + * + * lim(x->0){ exp(-x) I1(x) / x } = 1/2. + */ + static const T coeff[] = { + 9.38153738649577178388E-9f, + -4.44505912879632808065E-8f, + 2.00329475355213526229E-7f, + -8.56872026469545474066E-7f, + 3.47025130813767847674E-6f, + -1.32731636560394358279E-5f, + 4.78156510755005422638E-5f, + -1.61760815825896745588E-4f, + 5.12285956168575772895E-4f, + -1.51357245063125314899E-3f, + 4.15642294431288815669E-3f, + -1.05640848946261981558E-2f, + 2.47264490306265168283E-2f, + -5.29459812080949914269E-2f, + 1.02643658689847095384E-1f, + -1.76416518357834055153E-1f, + 2.52587186443633654823E-1f}; + return std::make_tuple(coeff, 17); +}; + +template +C10_HOST_DEVICE inline + typename std::enable_if::value, std::tuple>::type + chebyshev_coefficients_i1e_B() { + /* Chebyshev coefficients for exp(-x) sqrt(x) I1(x) + * in the inverted interval [8,infinity]. + * + * lim(x->inf){ exp(-x) sqrt(x) I1(x) } = 1/sqrt(2pi). + */ + static const T coefficients[] = { + 7.51729631084210481353E-18, 4.41434832307170791151E-18, + -4.65030536848935832153E-17, -3.20952592199342395980E-17, + 2.96262899764595013876E-16, 3.30820231092092828324E-16, + -1.88035477551078244854E-15, -3.81440307243700780478E-15, + 1.04202769841288027642E-14, 4.27244001671195135429E-14, + -2.10154184277266431302E-14, -4.08355111109219731823E-13, + -7.19855177624590851209E-13, 2.03562854414708950722E-12, + 1.41258074366137813316E-11, 3.25260358301548823856E-11, + -1.89749581235054123450E-11, -5.58974346219658380687E-10, + -3.83538038596423702205E-9, -2.63146884688951950684E-8, + -2.51223623787020892529E-7, -3.88256480887769039346E-6, + -1.10588938762623716291E-4, -9.76109749136146840777E-3, + 7.78576235018280120474E-1}; + + return std::make_tuple(coefficients, 25); +} + +template +C10_HOST_DEVICE inline + typename std::enable_if::value, std::tuple>::type + chebyshev_coefficients_i1e_B() { + /* Chebyshev coefficients for exp(-x) sqrt(x) I1(x) + * in the inverted interval [8,infinity]. + * + * lim(x->inf){ exp(-x) sqrt(x) I1(x) } = 1/sqrt(2pi). + */ + static const T coeff[] = { + -3.83538038596423702205E-9f, + -2.63146884688951950684E-8f, + -2.51223623787020892529E-7f, + -3.88256480887769039346E-6f, + -1.10588938762623716291E-4f, + -9.76109749136146840777E-3f, + 7.78576235018280120474E-1f}; + + return std::make_tuple(coeff, 7); +}; + +template +static inline C10_HOST_DEVICE scalar_t calc_i1(scalar_t _x) { + const auto x = ::abs(_x); + if (x <= scalar_t{8.0}) { + auto coeff_pair = chebyshev_coefficients_i1e_A(); + auto A = std::get<0>(coeff_pair); + auto len = std::get<1>(coeff_pair); + scalar_t y = x / scalar_t{2.0} - scalar_t{2.0}; + const scalar_t out = ::exp(x) * x * chbevl(y, A, len); + return (_x < scalar_t{0.0}) ? -out : out; + } + + auto coeff_pair = chebyshev_coefficients_i1e_B(); + auto B = std::get<0>(coeff_pair); + auto len = std::get<1>(coeff_pair); + const scalar_t out = (::exp(x) * chbevl(scalar_t{32.0} / x - scalar_t{2.0}, B, len)) / ::sqrt(x); + return (_x < scalar_t{0.0}) ? -out : out; +} + +template +static inline C10_HOST_DEVICE scalar_t calc_i1e(scalar_t _x) { + const auto x = ::abs(_x); + if (x <= scalar_t{8.0}) { + auto coeff_pair = chebyshev_coefficients_i1e_A(); + auto A = std::get<0>(coeff_pair); + auto len = std::get<1>(coeff_pair); + const scalar_t y = x / scalar_t{2.0} - scalar_t{2.0}; + const scalar_t out = chbevl(y, A, len) * x; + return (_x < scalar_t{0.0}) ? -out : out; + } + + auto coeff_pair = chebyshev_coefficients_i1e_B(); + auto B = std::get<0>(coeff_pair); + auto len = std::get<1>(coeff_pair); + const scalar_t out = chbevl(scalar_t{32.0} / x - scalar_t{2.0}, B, len) / ::sqrt(x); + return (_x < scalar_t{0.0}) ? -out : out; +} + +#endif // AT_USE_JITERATOR() (this closes the "else" branch of a if/else preprocessor directive) + +} // namespace native +} // namespace at diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/PersistentSoftmax.cuh b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/PersistentSoftmax.cuh new file mode 100644 index 0000000000000000000000000000000000000000..4553276bab684eca9d858bc81ae8f34c9445afa5 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/PersistentSoftmax.cuh @@ -0,0 +1,401 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include + +namespace { + +int log2_ceil(int value) { + int log2_value = 0; + while ((1 << log2_value) < value) ++log2_value; + return log2_value; +} + +template +struct Add { + __device__ __forceinline__ T operator()(T a, T b) const { + return a + b; + } +}; + +template +struct Max { + __device__ __forceinline__ T operator()(T a, T b) const { + return a < b ? b : a; + } +}; + +template class ReduceOp> +__device__ __forceinline__ void warp_reduce(acc_t* sum) { + ReduceOp r; + #pragma unroll + for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) { + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + acc_t b = WARP_SHFL_XOR(sum[i], offset, WARP_SIZE); + sum[i] = r(sum[i], b); + } + } +} + +// The softmax_warp_* methods perform softmax forward and backward propagation on samples spanning the fast dimension. +// Each sample contains element_count scalar elements. element_count can be any integer value <= 1024. +// The template arguments have the following meaning: +// One "WARP" works on one "BATCH". One "BATCH" contains "WARP_BATCH" samples. +// WARP_BATCH is equal to 1 when element_count is large, and > 1 when element_count is small. +// A "WARP" contains "C10_WARPS_SIZE" threads, these treads are guaranteed to belong to the same warp. +// This is important because it means only __shfl_ instructions are required for reductions. +// Note that this means WARP_SIZE must be a power of two and <= architecture warp size. +// CUDA warp size is 32 for all existing GPU architectures, but there is no guarantee this will not change for future arch. +// ROCm warp size is 64 for all currently ROCm-supported GPU architectures, but this may change for future archs. +// is_log_softmax is a flag indicating whether SoftMax or LogSoftMax should be computed. +// is_masked is a flag indicating whether SoftMax or MaskedSoftMax should be computed. +// The template can be instantiated with any floating point type for the type arguments input_t, output_t and acc_t. +// This allows SoftMax to be fused with a cast immediately following the SoftMax. +// The mask should have the same shape as input, with a boolean indicate if the value is masked. +// The head_chunk_size is only used for transformer mask softmax, equals to H * D * D. +// For instance: +// input_t=half, acc_t=float, output_t=half => read half tensor, float accumulators, write half tensor. +// input_t=half, acc_t=float, output_t=float => read half tensor, float accumulators, write float tensor. +// input_t_float, acc_t=float, output_t=half => read float tensor, float accumulators, write half tensor. + +template +__global__ void softmax_warp_forward(output_t *dst, const input_t *src, int batch_size, int stride, int element_count, const bool *mask = nullptr, const int head_chunk_size = -1, bool is_transformer_mask = false) +{ + // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and warp_size of method warp_softmax_forward_kernel. + constexpr int next_power_of_two = 1 << log2_elements; + constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; + constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE; + constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1; + + int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * WARP_BATCH; + + // batch_size might not be a multiple of WARP_BATCH. Check how + // many batches have to computed within this WARP. + int local_batches = batch_size - first_batch; + if (local_batches > WARP_BATCH) + local_batches = WARP_BATCH; + + // there might be multiple batches per warp. compute the index within the batch + int local_idx = threadIdx.x; + int idx_offset = first_batch * stride + local_idx; + + src += idx_offset; + dst += idx_offset; + + if (is_transformer_mask) { + mask += ((first_batch * stride) / head_chunk_size) * stride + local_idx; + } else { + mask += idx_offset; + } + // The nested loops over WARP_BATCH and then WARP_ITERATIONS can be simplified to one loop, + // but I think doing so would obfuscate the logic of the algorithm, thus I chose to keep + // the nested loops. + // This should have no impact on performance because the loops are unrolled anyway. + + // load data from global memory + acc_t elements[WARP_BATCH][WARP_ITERATIONS]; + for (int i = 0; i < WARP_BATCH; ++i) { + int batch_element_count = (i >= local_batches) ? 0 : element_count; + for (int it = 0; it < WARP_ITERATIONS; ++it) { + int element_index = local_idx + it * WARP_SIZE; + if (element_index < batch_element_count) { + elements[i][it] = src[i*element_count+it*WARP_SIZE]; + } else { + elements[i][it] = -std::numeric_limits::infinity(); + } + } + } + + // compute max_value + acc_t max_value[WARP_BATCH]; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + int batch_element_count = (i >= local_batches) ? 0 : element_count; + bool is_meaningful_max = false; + max_value[i] = elements[i][0]; + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; ++it) { + if (is_masked) { + int idx = it*WARP_SIZE; + if ((idx + local_idx) < batch_element_count) { + if (!is_transformer_mask) { + idx += i*element_count; + } + if (!mask[idx]) { + max_value[i] = (is_meaningful_max && max_value[i] > elements[i][it]) ? max_value[i] : elements[i][it]; + is_meaningful_max = true; + } + } + } else { + max_value[i] = max_value[i] > elements[i][it] ? max_value[i] : elements[i][it]; + } + } + if (is_masked) { + if (!is_meaningful_max) { + max_value[i] = -std::numeric_limits::infinity(); + } + } + } + warp_reduce(max_value); + + acc_t sum[WARP_BATCH] { 0.0f }; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + int batch_element_count = (i >= local_batches) ? 0 : element_count; + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; ++it) { + if (!is_masked) { + if (is_log_softmax) { + sum[i] += std::exp(elements[i][it] - max_value[i]); + } else { + elements[i][it] = std::exp(elements[i][it] - max_value[i]); + sum[i] += elements[i][it]; + } + } else { + int idx = it*WARP_SIZE; + bool valid = (idx + local_idx) < batch_element_count; + if (!is_transformer_mask) { + idx += i*element_count; + } + if (valid) { + if (!mask[idx]) { + if (is_log_softmax) { + sum[i] += std::exp(elements[i][it] - max_value[i]); + } else { + elements[i][it] = std::exp(elements[i][it] - max_value[i]); + sum[i] += elements[i][it]; + } + } else { + if (!is_log_softmax) { + // Masked values are treated as -infinity, and std::exp(-infinity) is 0. + elements[i][it] = 0; + } + } + } else { + if (!is_log_softmax) { + elements[i][it] = 0.; + } + } + } + } + } + warp_reduce(sum); + + // store result + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + if (i >= local_batches) + break; + if (is_log_softmax) sum[i] = std::log(sum[i]); + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; ++it) { + int element_index = local_idx + it * WARP_SIZE; + if (element_index < element_count) { + if (is_log_softmax) { + dst[i*element_count+it*WARP_SIZE] = elements[i][it] - max_value[i] - sum[i]; + } else if (sum[i] == 0) { + dst[i*element_count+it*WARP_SIZE] = std::numeric_limits::quiet_NaN(); + } else { + dst[i*element_count+it*WARP_SIZE] = elements[i][it] / sum[i]; + } + } else { + break; + } + } + } +} + +template +__global__ void softmax_warp_backward(output_t *gradInput, const input_t *grad, const input_t *output, int batch_size, int stride, int element_count, const bool *mask = nullptr) +{ + // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and warp_size of method warp_softmax_backward_kernel. + constexpr int next_power_of_two = 1 << log2_elements; + constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; + constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE; + constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1; + + int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * WARP_BATCH; + + // batch_size might not be a multiple of WARP_BATCH. Check how + // many batches have to computed within this WARP. + int local_batches = batch_size - first_batch; + if (local_batches > WARP_BATCH) + local_batches = WARP_BATCH; + + // there might be multiple batches per warp. compute the index within the batch + int local_idx = threadIdx.x % WARP_SIZE; + + // the first element to process by the current thread + int thread_offset = first_batch * stride + local_idx; + grad += thread_offset; + output += thread_offset; + gradInput += thread_offset; + if (is_masked) { + mask += thread_offset; + } + + // The nested loops over WARP_BATCH and then WARP_ITERATIONS can be simplified to one loop, + // but I think doing so would obfuscate the logic of the algorithm, thus I chose to keep + // the nested loops. + // This should have no impact on performance because the loops are unrolled anyway. + + // load data from global memory + acc_t grad_reg[WARP_BATCH][WARP_ITERATIONS]; + acc_t output_reg[WARP_BATCH][WARP_ITERATIONS]; + for (int i = 0; i < WARP_BATCH; ++i) { + int batch_element_count = (i >= local_batches) ? 0 : element_count; + for (int it = 0; it < WARP_ITERATIONS; ++it) { + int element_index = local_idx + it * WARP_SIZE; + if (element_index < batch_element_count) { + grad_reg[i][it] = grad[i*element_count+it*WARP_SIZE]; + output_reg[i][it] = output[i*element_count+it*WARP_SIZE]; + } else { + grad_reg[i][it] = acc_t(0); + output_reg[i][it] = acc_t(0); + } + } + } + + acc_t sum[WARP_BATCH] { 0.0f }; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; ++it) { + if (!is_masked || !mask[i*element_count+it*WARP_SIZE]) { + sum[i] += grad_reg[i][it]; + } + } + } + warp_reduce(sum); + + // store result + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + if (i >= local_batches) + break; + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; ++it) { + int element_index = local_idx + it * WARP_SIZE; + if (element_index < element_count) { + if (is_masked && mask[i*element_count+it*WARP_SIZE]) { + gradInput[i*element_count+it*WARP_SIZE] = 0; + } + // compute gradients + else if (is_log_softmax) { + gradInput[i*element_count+it*WARP_SIZE] = (grad_reg[i][it] - std::exp(output_reg[i][it]) * sum[i]); + } else { + gradInput[i*element_count+it*WARP_SIZE] = (grad_reg[i][it] - output_reg[i][it] * sum[i]); + } + } + } + } +} + +} // end of anonymous namespace + +template +void dispatch_softmax_forward(output_t *dst, const input_t *src, int softmax_elements, int softmax_elements_stride, int batch_count, const bool *mask = nullptr, int chunk_size = -1, bool is_transformer_mask = false) +{ + TORCH_INTERNAL_ASSERT( softmax_elements >= 0 && softmax_elements <= 1024 ); + if (softmax_elements == 0) { + return; + } else { + int log2_elements = log2_ceil(softmax_elements); + const int next_power_of_two = 1 << log2_elements; + + // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_forward. + int warp_size = at::cuda::warp_size(); + warp_size = (next_power_of_two < warp_size) ? next_power_of_two : warp_size; + + // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_forward. + int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1; + + // use 128 threads per block to maximize gpu utilization + constexpr int threads_per_block = 128; + + int warps_per_block = (threads_per_block / warp_size); + int batches_per_block = warps_per_block * batches_per_warp; + int blocks = (batch_count + batches_per_block - 1) / batches_per_block; + dim3 threads(warp_size, warps_per_block, 1); + // Launch code would be more elegant if C++ supported FOR CONSTEXPR + switch (log2_elements) { + #define LAUNCH_SOFTMAX_WARP_FORWARD(L2E) case L2E: \ + softmax_warp_forward \ + <<>>(dst, \ + src, batch_count, softmax_elements_stride, softmax_elements, mask, chunk_size, is_transformer_mask); \ + C10_CUDA_KERNEL_LAUNCH_CHECK(); \ + break; + + LAUNCH_SOFTMAX_WARP_FORWARD(0); // 1 + LAUNCH_SOFTMAX_WARP_FORWARD(1); // 2 + LAUNCH_SOFTMAX_WARP_FORWARD(2); // 4 + LAUNCH_SOFTMAX_WARP_FORWARD(3); // 8 + LAUNCH_SOFTMAX_WARP_FORWARD(4); // 16 + LAUNCH_SOFTMAX_WARP_FORWARD(5); // 32 + LAUNCH_SOFTMAX_WARP_FORWARD(6); // 64 + LAUNCH_SOFTMAX_WARP_FORWARD(7); // 128 + LAUNCH_SOFTMAX_WARP_FORWARD(8); // 256 + LAUNCH_SOFTMAX_WARP_FORWARD(9); // 512 + LAUNCH_SOFTMAX_WARP_FORWARD(10); ; // 1024 + default: + break; + } + } +} + +template +void dispatch_softmax_backward(output_t *grad_input, const input_t *grad, const input_t *output, int softmax_elements, int softmax_elements_stride, int batch_count, const bool *mask = nullptr) +{ + TORCH_INTERNAL_ASSERT( softmax_elements >= 0 && softmax_elements <= 1024 ); + if (softmax_elements == 0) { + return; + } else { + int log2_elements = log2_ceil(softmax_elements); + const int next_power_of_two = 1 << log2_elements; + + // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_backward. + int warp_size = at::cuda::warp_size(); + warp_size = (next_power_of_two < warp_size) ? next_power_of_two : warp_size; + + // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_backward. + int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1; + + // use 128 threads per block to maximize gpu utilization + constexpr int threads_per_block = 128; + + int warps_per_block = (threads_per_block / warp_size); + int batches_per_block = warps_per_block * batches_per_warp; + int blocks = (batch_count + batches_per_block - 1) / batches_per_block; + dim3 threads(warp_size, warps_per_block, 1); + // Launch code would be more elegant if C++ supported FOR CONSTEXPR + switch (log2_elements) { + #define LAUNCH_SOFTMAX_WARP_BACKWARD(L2E) case L2E: \ + softmax_warp_backward \ + <<>> \ + (grad_input, grad, output, batch_count, softmax_elements_stride, \ + softmax_elements, mask); \ + C10_CUDA_KERNEL_LAUNCH_CHECK(); \ + break; + + LAUNCH_SOFTMAX_WARP_BACKWARD(0); // 1 + LAUNCH_SOFTMAX_WARP_BACKWARD(1); // 2 + LAUNCH_SOFTMAX_WARP_BACKWARD(2); // 4 + LAUNCH_SOFTMAX_WARP_BACKWARD(3); // 8 + LAUNCH_SOFTMAX_WARP_BACKWARD(4); // 16 + LAUNCH_SOFTMAX_WARP_BACKWARD(5); // 32 + LAUNCH_SOFTMAX_WARP_BACKWARD(6); // 64 + LAUNCH_SOFTMAX_WARP_BACKWARD(7); // 128 + LAUNCH_SOFTMAX_WARP_BACKWARD(8); // 256 + LAUNCH_SOFTMAX_WARP_BACKWARD(9); // 512 + LAUNCH_SOFTMAX_WARP_BACKWARD(10); // 1024 + default: + break; + } + } +} diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Reduce.cuh b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Reduce.cuh new file mode 100644 index 0000000000000000000000000000000000000000..79b23ea8afd4fc4e4d505141b43bbce06275a881 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Reduce.cuh @@ -0,0 +1,1355 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace at { namespace native { + +using at::detail::Array; + +static inline int64_t div_up(int64_t a, int64_t b) { + return (a + b - 1) / b; +} + +// returns floor(log2(n)) +static inline int last_pow2(int n) { + n |= (n >> 1); + n |= (n >> 2); + n |= (n >> 4); + n |= (n >> 8); + n |= (n >> 16); + return std::max(1, n - (n >> 1)); +} + +// returns reduced fraction numerator & denominator +C10_HOST_DEVICE static void reduce_fraction(size_t &numerator, size_t &denominator) { + // get GCD of num and denom using Euclid's algorithm. + // Can replace this with std::gcd if we ever support c++17. + size_t a = denominator; + size_t b = numerator; + while (b != 0) { + a %= b; + // swap(a,b) + size_t tmp = a; + a = b; + b = tmp; + } + + // a is now the GCD + numerator /= a; + denominator /= a; +} + +//template for changing MAX_NUM_THREADS based on op dtype +template +struct mnt_wrapper { + static constexpr int MAX_NUM_THREADS = 512; +}; + +template <> +struct mnt_wrapper >{ + static constexpr int MAX_NUM_THREADS = 256; +}; + +constexpr int max_reduce_threads(c10::ScalarType type) { + return type == kComplexDouble ? 256 : 512; +} + +struct ReduceConfig { + static constexpr int BLOCK_X = 0; + static constexpr int BLOCK_Y = 1; + static constexpr int CTA = 2; + + static constexpr int input_vec_size = 4; + + ReduceConfig(int element_size_bytes, int num_outputs, int num_inputs) + : element_size_bytes(element_size_bytes) + , num_inputs(num_inputs) + , num_outputs(num_outputs) {} + int element_size_bytes; + int num_inputs; + int num_outputs; + int step_input = 1; + int step_output = 1; + int ctas_per_output = 1; + int input_mult[3] = {0, 0, 0}; + int output_mult[2] = {0, 0}; + + int block_width; + int block_height; + int num_threads; + + bool vectorize_input = false; + int output_vec_size = 1; + + template + void set_block_dimension(int64_t dim0, int64_t dim1) { + const int max_num_threads = mnt_wrapper::MAX_NUM_THREADS / output_vec_size; + int dim0_pow2 = dim0 < max_num_threads ? static_cast(last_pow2(dim0)) : max_num_threads; + int dim1_pow2 = dim1 < max_num_threads ? static_cast(last_pow2(dim1)) : max_num_threads; + block_width = std::min(dim0_pow2, int(at::cuda::warp_size())); + block_height = std::min(dim1_pow2, int(max_num_threads / block_width)); + block_width = std::min(dim0_pow2, int(max_num_threads / block_height)); + num_threads = block_width * block_height; + } + + int split_input(int parallelism) { + int step = step_input; + step_input *= parallelism; + return step; + } + + int split_output(int parallelism) { + int step = step_output; + step_output *= parallelism; + return step; + } + + dim3 block() const { + return dim3(block_width, block_height); + } + + dim3 grid() const { + return dim3(div_up(num_outputs / output_vec_size, step_output), ctas_per_output); + } + + C10_HOST_DEVICE bool should_block_x_reduce() const { + return input_mult[BLOCK_X] != 0; + } + + C10_HOST_DEVICE bool should_block_y_reduce() const { + return input_mult[BLOCK_Y] != 0; + } + + C10_HOST_DEVICE bool should_global_reduce() const { + return input_mult[CTA] != 0; + } + + C10_DEVICE bool should_store(int output_idx) const { + return output_idx < num_outputs && + (!should_block_x_reduce() || threadIdx.x == 0) && + (!should_block_y_reduce() || threadIdx.y == 0); + } + + C10_DEVICE bool should_reduce_tail() const { + return (!should_block_y_reduce() || threadIdx.y == 0) && + (!should_global_reduce() || blockIdx.y == 0); + } + + C10_HOST_DEVICE int input_idx() const { + int lane = threadIdx.x; + int warp = threadIdx.y; + int cta2 = blockIdx.y; + return (lane * input_mult[BLOCK_X] + + warp * input_mult[BLOCK_Y] + + cta2 * input_mult[CTA]); + } + + template + C10_HOST_DEVICE int output_idx() const { + int lane = threadIdx.x; + int warp = threadIdx.y; + int cta1 = blockIdx.x; + return (lane * output_mult[BLOCK_X] + + warp * output_mult[BLOCK_Y] + + cta1 * step_output) * output_vec_size; + } + + C10_DEVICE int shared_memory_offset(int offset) const { + return threadIdx.x + (threadIdx.y + offset) * blockDim.x; + } + + C10_DEVICE int staging_memory_offset(int cta2) const { + int offset = cta2 + blockIdx.x * gridDim.y; + if (!should_block_x_reduce()) { + offset = threadIdx.x + offset * blockDim.x; + } + return offset; + } + + int shared_memory_size() const { + if (!should_block_y_reduce() && + (!should_block_x_reduce() || + block_width <= at::cuda::warp_size())) { + return 0; + } + return element_size_bytes * num_threads * output_vec_size; + } + + int64_t global_memory_size() const { + if (!should_global_reduce()) { + return 0; + } + auto size = (int64_t)element_size_bytes * num_outputs * ctas_per_output; + if (!should_block_x_reduce()) { + size *= block().x * output_vec_size; + } + return size; + } + + int semaphore_size() const { + if (!should_global_reduce()) { + return 0; + } + return sizeof(int) * grid().x; + } + + int values_per_thread() const { + return div_up(num_inputs, step_input); + } +}; + +std::ostream& operator<<(std::ostream& out, const ReduceConfig& config); + +template +C10_LAUNCH_BOUNDS_2(nt, 4) +__global__ void reduce_kernel(R reduction) { + reduction.template run(); +} + +template +static OffsetCalculator<2, index_t> make_output_calculator(const TensorIterator& iter) { + int num_reduce_dims = iter.num_reduce_dims(); + int num_output_dims = iter.ndim() - num_reduce_dims; + int input_index = iter.ntensors() - 1; + int output_index = 0; + std::array strides = { + iter.strides(output_index).data() + num_reduce_dims, + iter.strides(input_index).data() + num_reduce_dims, + }; + auto shape = iter.shape().data() + num_reduce_dims; + return OffsetCalculator<2, index_t>(num_output_dims, shape, strides.data()); +} + +template +static OffsetCalculator<1, index_t> make_input_calculator(const TensorIterator& iter) { + int num_reduce_dims = iter.num_reduce_dims(); + int input_index = iter.ntensors() - 1; + std::array strides = { + iter.strides(input_index).data(), + }; + return OffsetCalculator<1, index_t>(num_reduce_dims, iter.shape().data(), strides.data()); +} + +template +struct func_wrapper_t { + using arg_t = typename binary_function_traits::arg1_t; + using scalar_t = typename binary_function_traits::arg2_t; + + func_t combine; + static inline __device__ out_scalar_t project(arg_t arg) { + return (out_scalar_t) arg; + } + static inline __device__ arg_t warp_shfl_down(arg_t arg, int offset) { + return WARP_SHFL_DOWN(arg, offset); + } + + static __device__ arg_t translate_idx(arg_t acc, int64_t /*idx*/) { + return acc; + } + + func_wrapper_t(const func_t& op) : combine(op) { + } + + // wrap a normal reduction that ignores the index + __device__ arg_t reduce(arg_t acc, scalar_t val, int64_t idx) const { + return combine(acc, val); + } +}; + +template +func_wrapper_t func_wrapper(const func_t& op) { + return func_wrapper_t { op }; +} + +template +struct ReduceJitOp { +//ReduceJitOp is almost like ReduceOp, but it doesn't have ops functor that specifies reduction operations +//Maybe we can find a way to unify ReduceOp and ReduceJitOp + using InputCalculator = OffsetCalculator<1, uint32_t>; + using OutputCalculator = OffsetCalculator<2, uint32_t>; + //TODO for now arg_t is always opmath_t of the input, later we'll need to change it + using arg_t = at::opmath_type; + + static constexpr int input_vec_size = ReduceConfig::input_vec_size; + //TODO - ReduceJitOp will probably need to be changed for reductions that need full functor, + //not just wrapper + arg_t ident; + ReduceConfig config; + InputCalculator input_calc; + OutputCalculator output_calc; + const void* src; + const char* dst[2]; //it accepts at most two destinations + // acc_buf used for accumulation among sub Tensor Iterator when accumulation on + // output is not permissible + void* acc_buf; + // cta_buf used for accumulation between blocks during global reduction + void* cta_buf; + int* semaphores; + int64_t base_idx; + bool accumulate; + bool final_output; + int noutputs; + + ReduceJitOp( + ReduceConfig config, + InputCalculator input_calc, + OutputCalculator output_calc, + const void* src, + char* dst0, + std::optional dst1, + void* acc_buf, + void* cta_buf, + int* semaphores, + arg_t ident, + int noutputs, + int64_t base_idx) + : ident(ident), + config(config), + input_calc(input_calc), + output_calc(output_calc), + src(src), + acc_buf(acc_buf), + cta_buf(cta_buf), + semaphores(semaphores), + base_idx(base_idx), + noutputs(noutputs) { + dst[0] = dst0; + if (dst1.has_value()) { + dst[1] = dst1.value(); + } + } +}; + +template +struct ReduceOp { + using traits = function_traits; + using arg_t = typename std::decay::type>::type; + + using InputCalculator = OffsetCalculator<1, index_t>; + using OutputCalculator = OffsetCalculator<2, index_t>; + + static constexpr bool can_accumulate_in_output = + std::is_convertible::value + && std::is_convertible::value; + + static constexpr int input_vec_size = ReduceConfig::input_vec_size; + + ops_t ops; + arg_t ident; + ReduceConfig config; + InputCalculator input_calc; + OutputCalculator output_calc; + const void* src; + const char* dst[2]; //it accepts at most two destinations + // acc_buf used for accumulation among sub Tensor Iterator when accumulation on + // output is not permissible + void* acc_buf; + // cta_buf used for accumulation between blocks during global reduction + void* cta_buf; + int* semaphores; + int64_t base_idx; + bool accumulate; + bool final_output; + int noutputs; + + ReduceOp( + ops_t ops, + ReduceConfig config, + InputCalculator input_calc, + OutputCalculator output_calc, + const void* src, + char* dst0, + std::optional dst1, + void* acc_buf, + void* cta_buf, + int* semaphores, + arg_t ident, + int noutputs, + int64_t base_idx) + : ops(ops), + ident(ident), + config(config), + input_calc(input_calc), + output_calc(output_calc), + src(src), + acc_buf(acc_buf), + cta_buf(cta_buf), + semaphores(semaphores), + base_idx(base_idx), + noutputs(noutputs) { + dst[0] = dst0; + if (dst1.has_value()) { + dst[1] = dst1.value(); + } + } + + template + C10_DEVICE void run() const { + extern __shared__ char shared_memory[]; + index_t output_idx = config.output_idx(); + index_t input_idx = config.input_idx(); + auto base_offsets1 = output_calc.get(output_idx)[1]; + + using arg_vec_t = at::detail::Array; + arg_vec_t value; + + if (output_idx < config.num_outputs && input_idx < config.num_inputs) { + const scalar_t* input_slice = (const scalar_t*)((const char*)src + base_offsets1); + value = thread_reduce(input_slice); + } + + if (config.should_block_y_reduce()) { + value = block_y_reduce(value, shared_memory); + } + if (config.should_block_x_reduce()) { + value = block_x_reduce(value, shared_memory); + } + + using out_ptr_vec_t = at::detail::Array; + using offset_vec_t = at::detail::Array; + offset_vec_t base_offsets; + out_ptr_vec_t out; + + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + base_offsets[i] = output_calc.get(output_idx + i)[0]; + out[i] = (out_scalar_t*)((char*)dst[0] + base_offsets[i]); + } + + arg_vec_t* acc = nullptr; + if (acc_buf != nullptr) { + size_t numerator = sizeof(arg_t); + size_t denominator = sizeof(out_scalar_t); + reduce_fraction(numerator, denominator); + acc = (arg_vec_t*)((char*)acc_buf + (base_offsets[0] * numerator / denominator)); + } + + if (config.should_global_reduce()) { + value = global_reduce(value, acc, shared_memory); + } else if (config.should_store(output_idx)) { + if (accumulate) { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = ops.translate_idx(value[i], base_idx); + } + } + + if (acc == nullptr) { + if (accumulate) { + value = accumulate_in_output(out, value); + } + if (final_output) { + set_results_to_output(value, base_offsets); + } else { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + *(out[i]) = get_accumulated_output(out[i], value[i]); + } + } + } else { + if (accumulate) { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = ops.combine((*acc)[i], value[i]); + } + } + if (final_output) { + set_results_to_output(value, base_offsets); + } else { + *acc = value; + } + } + } + } + + template + C10_DEVICE at::detail::Array thread_reduce(const scalar_t* data) const { + if (config.vectorize_input) { + CUDA_KERNEL_ASSERT(output_vec_size == 1); + // reduce at the header of input_slice where memory is not aligned, + // so that thread_reduce will have an aligned memory to work on. + return {input_vectorized_thread_reduce_impl(data)}; + } else { + index_t element_stride = input_calc.strides_[0][0] / sizeof(scalar_t); + bool is_contiguous = (input_calc.dims == 1 && element_stride == 1); + if (is_contiguous) { + return thread_reduce_impl(data, [](index_t idx) { return idx; }); + } else if (input_calc.dims == 1) { + return thread_reduce_impl(data, [&](index_t idx) { return idx * element_stride; }); + } else { + return thread_reduce_impl(data, [&](index_t idx) { return input_calc.get(idx)[0] / sizeof(scalar_t); }); + } + } + } + + C10_DEVICE arg_t input_vectorized_thread_reduce_impl(const scalar_t* data) const { + index_t end = config.num_inputs; + + // Handle the head of input slice where data is not aligned + arg_t value = ident; + constexpr int align_bytes = alignof(at::native::memory::aligned_vector); + constexpr int align_elements = align_bytes / sizeof(scalar_t); + int shift = ((uint64_t)data) % align_bytes / sizeof(scalar_t); + if (shift > 0) { + data -= shift; + end += shift; + if(threadIdx.x >= shift && threadIdx.x < align_elements && config.should_reduce_tail()){ + value = ops.reduce(value, c10::load(data + threadIdx.x), threadIdx.x - shift); + } + end -= align_elements; + data += align_elements; + shift = align_elements - shift; + } + + // Do the vectorized reduction + using load_t = at::native::memory::aligned_vector; + + index_t idx = config.input_idx(); + const index_t stride = config.step_input; + + // Multiple accumulators to remove dependency between unrolled loops. + arg_t value_list[input_vec_size]; + value_list[0] = value; + + #pragma unroll + for (int i = 1; i < input_vec_size; i++) { + value_list[i] = ident; + } + + while (idx * input_vec_size + input_vec_size - 1 < end) { + const auto values_vec = memory::load_vector(data, idx); + #pragma unroll + for (index_t i = 0; i < input_vec_size; i++) { + value_list[i] = ops.reduce(value_list[i], values_vec.val[i], shift + idx * input_vec_size + i); + } + idx += stride; + } + + // tail + index_t tail_start = end - end % input_vec_size; + if (config.should_reduce_tail()) { + int idx = tail_start + threadIdx.x; + if (idx < end) { + const auto value = c10::load(data + idx); + value_list[0] = ops.reduce(value_list[0], value, idx + shift); + } + } + + // combine accumulators + #pragma unroll + for (int i = 1; i < input_vec_size; i++) { + value_list[0] = ops.combine(value_list[0], value_list[i]); + } + return value_list[0]; + } + + template + C10_DEVICE at::detail::Array thread_reduce_impl(const scalar_t* data_, offset_calc_t calc) const { + index_t idx = config.input_idx(); + const index_t end = config.num_inputs; + const index_t stride = config.step_input; + + using arg_vec_t = at::detail::Array; + using load_t = at::native::memory::aligned_vector; + + // Multiple accumulators to remove dependency between unrolled loops. + arg_vec_t value_list[vt0]; + + #pragma unroll + for (int i = 0; i < vt0; i++) { + #pragma unroll + for (int j = 0; j < output_vec_size; j++) { + value_list[i][j] = ident; + } + } + + load_t values[vt0]; + + while (idx + (vt0 - 1) * stride < end) { + #pragma unroll + for (index_t i = 0; i < vt0; i++) { + const auto offset = calc(idx + i * stride) / output_vec_size; + values[i] = memory::load_vector(data_, offset); + } + #pragma unroll + for (index_t i = 0; i < vt0; i++) { + #pragma unroll + for (index_t j = 0; j < output_vec_size; j++) { + value_list[i][j] = ops.reduce(value_list[i][j], values[i].val[j], idx + i * stride); + } + } + idx += stride * vt0; + } + + // tail + int idx_ = idx; + #pragma unroll + for (index_t i = 0; i < vt0; i++) { + if (idx >= end) { + break; + } + const auto offset = calc(idx) / output_vec_size; + values[i] = memory::load_vector(data_, offset); + idx += stride; + } + idx = idx_; + #pragma unroll + for (index_t i = 0; i < vt0; i++) { + if (idx >= end) { + break; + } + #pragma unroll + for (index_t j = 0; j < output_vec_size; j++) { + value_list[i][j] = ops.reduce(value_list[i][j], values[i].val[j], idx); + } + idx += stride; + } + + // combine accumulators + #pragma unroll + for (int i = 1; i < vt0; i++) { + #pragma unroll + for (index_t j = 0; j < output_vec_size; j++) { + value_list[0][j] = ops.combine(value_list[0][j], value_list[i][j]); + } + } + return value_list[0]; + } + + template + C10_DEVICE at::detail::Array block_x_reduce(at::detail::Array value, char* shared_memory) const { + using args_vec_t = at::detail::Array; + int dim_x = blockDim.x; + args_vec_t* shared = (args_vec_t*)shared_memory; + if (dim_x > warpSize) { + int address_base = threadIdx.x + threadIdx.y*blockDim.x; + shared[address_base] = value; + for (int offset = dim_x/2; offset >= warpSize; offset >>= 1) { + __syncthreads(); + if (threadIdx.x < offset && threadIdx.x + offset < blockDim.x) { + args_vec_t other = shared[address_base + offset]; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = ops.combine(value[i], other[i]); + } + shared[address_base] = value; + } + } + dim_x = warpSize; + } + + __syncthreads(); + + for (int offset = 1; offset < dim_x; offset <<= 1) { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + arg_t other = ops.warp_shfl_down(value[i], offset); + value[i] = ops.combine(value[i], other); + } + } + return value; + } + + template + C10_DEVICE at::detail::Array block_y_reduce(at::detail::Array value, char* shared_memory) const { + using args_vec_t = at::detail::Array; + args_vec_t* shared = (args_vec_t*)shared_memory; + shared[config.shared_memory_offset(0)] = value; + for (int offset = blockDim.y / 2; offset > 0; offset >>= 1) { + __syncthreads(); + if (threadIdx.y < offset && threadIdx.y + offset < blockDim.y) { + args_vec_t other = shared[config.shared_memory_offset(offset)]; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = ops.combine(value[i], other[i]); + } + shared[config.shared_memory_offset(0)] = value; + } + } + return value; + } + + C10_DEVICE bool mark_block_finished() const { + __shared__ bool is_last_block_done_shared; + + __syncthreads(); + if (threadIdx.x == 0 && threadIdx.y == 0) { + int prev_blocks_finished = atomicAdd(&semaphores[blockIdx.x], 1); + is_last_block_done_shared = (prev_blocks_finished == gridDim.y - 1); + } + + __syncthreads(); + + return is_last_block_done_shared; + } + + template + C10_DEVICE at::detail::Array accumulate_in_output( + at::detail::Array out, + at::detail::Array value, + typename std::enable_if::type* = nullptr + ) const { + at::detail::Array ret; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + ret[i] = ops.combine(*(out[i]), value[i]); + } + return ret; + } + + template + C10_DEVICE out_scalar_t get_accumulated_output( + out_scalar_t* out, arg_t value, + typename std::enable_if::type* = nullptr + ) const { + CUDA_KERNEL_ASSERT(!final_output); + return (out_scalar_t)value; + } + + // This function should never be called -- + // it's the version of `accumulate_in_output` + // when accumulation in the output is not possible. + template + C10_DEVICE at::detail::Array accumulate_in_output( + at::detail::Array, + at::detail::Array, + typename std::enable_if::type* = nullptr + ) const { + CUDA_KERNEL_ASSERT(false); + return arg_t {}; + } + + // This function should never be called -- + // it's the version of `get_accumulated_output` + // when accumulation in the output is not possible. + template + C10_DEVICE out_scalar_t get_accumulated_output( + out_scalar_t* out, arg_t value, + typename std::enable_if::type* = nullptr + ) const { + CUDA_KERNEL_ASSERT(false); + return *out; + } + + template + C10_DEVICE void set_results(const T x, const index_t base_offset) const { + CUDA_KERNEL_ASSERT(noutputs == 1); + auto res = (out_scalar_t*)((char*)dst[0] + base_offset); + *res = x; + } + + //Currently implemented for max of two outputs + template + C10_DEVICE void set_results(const thrust::pair x, const index_t base_offset) const { + if (noutputs >= 1) { + auto res0 = (T1*)((char*)dst[0] + base_offset); + *res0 = x.first; + } + if (noutputs >= 2) { + // base offset is computed assuming element size being sizeof(T1), so we need to make a + // correction to obtain the correct base offset + auto res1 = (T2*) ((char *) dst[1] + base_offset / sizeof(T1) * sizeof(T2)); + *res1 = x.second; + } + } + + template + C10_DEVICE void set_results_to_output(at::detail::Array value, at::detail::Array base_offset) const { + CUDA_KERNEL_ASSERT(final_output); + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + set_results(ops.project(value[i]), base_offset[i]); + } + } + + template + C10_DEVICE at::detail::Array global_reduce(at::detail::Array value, at::detail::Array *acc, char* shared_memory) const { + using arg_vec_t = at::detail::Array; + using out_ptr_vec_t = at::detail::Array; + using offset_vec_t = at::detail::Array; + + arg_vec_t* reduce_buffer = (arg_vec_t*)cta_buf; + index_t output_idx = config.output_idx(); + offset_vec_t base_offsets; + out_ptr_vec_t out; + + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + base_offsets[i] = output_calc.get(output_idx + i)[0]; + out[i] = (out_scalar_t*)((char*)dst[0] + base_offsets[i]); + } + + bool should_store = config.should_store(output_idx); + if (should_store) { + index_t offset = config.staging_memory_offset(blockIdx.y); + reduce_buffer[offset] = value; + } + + __threadfence(); // make sure writes are globally visible + __syncthreads(); // if multiple warps in this block wrote to staging, make sure they're all done + bool is_last_block_done = mark_block_finished(); + + if (is_last_block_done) { + __threadfence(); // complete the acquire pattern after atomic + value = ident; + if (config.should_block_x_reduce()) { + index_t input_offset = threadIdx.x + threadIdx.y * blockDim.x; + index_t step = blockDim.x * blockDim.y; + for (; input_offset < config.ctas_per_output; input_offset += step) { + index_t idx = config.staging_memory_offset(input_offset); + arg_vec_t next = reduce_buffer[idx]; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = ops.combine(value[i], next[i]); + } + } + } else { + index_t input_offset = threadIdx.y; + index_t step = blockDim.y; + for (; input_offset < config.ctas_per_output; input_offset += step) { + index_t idx = config.staging_memory_offset(input_offset); + arg_vec_t next = reduce_buffer[idx]; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = ops.combine(value[i], next[i]); + } + } + } + value = block_y_reduce(value, shared_memory); + if (config.should_block_x_reduce()) { + value = block_x_reduce(value, shared_memory); + } + if (should_store) { + if (accumulate) { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = ops.translate_idx(value[i], base_idx); + } + } + + if (acc == nullptr) { + if (accumulate) { + value = accumulate_in_output(out, value); + } + if (final_output) { + set_results_to_output(value, base_offsets); + } else { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + *(out[i]) = get_accumulated_output(out[i], value[i]); + } + } + } else { + if (accumulate) { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = ops.combine((*acc)[i], value[i]); + } + } + if (final_output) { + set_results_to_output(value, base_offsets); + } else { + *acc = value; + } + } + } + } + + return value; + } +}; + +template +static void launch_reduce_kernel(const ReduceConfig& config, const R& reduction) { + dim3 block = config.block(); + dim3 grid = config.grid(); + + auto stream = at::cuda::getCurrentCUDAStream(); + int shared_memory = config.shared_memory_size(); + + switch(config.output_vec_size) { + case 4: + reduce_kernel<<>>(reduction); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + break; + case 2: + reduce_kernel<<>>(reduction); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + break; + default: + reduce_kernel<<>>(reduction); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } +} + +inline void launch_jitted_reduce_kernel( + std::mutex &jiterator_mutex, + std::array &fn_cache, + const at::cuda::jit::KernelDescriptor &desc, + int vt0, const ReduceConfig& config, void *reduction) { + dim3 block = config.block(); + dim3 grid = config.grid(); + + int shared_memory = config.shared_memory_size(); + at::cuda::jit::NvrtcFunction* fn_ptr; + switch(config.output_vec_size) { + case 4: + fn_ptr = &fn_cache[0]; + break; + case 2: + fn_ptr = &fn_cache[1]; + break; + default: + fn_ptr = &fn_cache[2]; + } + if (!fn_ptr->function) { + int max_threads_codegen = + max_reduce_threads(desc.f_inputs_type) / config.output_vec_size; + auto code = at::cuda::jit::generate_reduction_code( + desc, vt0, true, false, config.output_vec_size, max_threads_codegen); + + *fn_ptr = at::cuda::jit::jit_pwise_function(code, "reduction_" + desc.name); + } + constexpr int kernel_args = 1; + void* args[kernel_args]; + args[0] = reduction; + at::cuda::jit::launch_jitted_pwise_function(*fn_ptr, args, grid, block, shared_memory); +} + + +class AccumulationBuffer { + public: + AccumulationBuffer() {} + + AccumulationBuffer(size_t acc_t_size, size_t out_t_size, char* out_ptr, int64_t size) { + out_ptr_ = (char*)out_ptr; + if (out_t_size >= acc_t_size) { + // reusing output buffer for accumulation. + acc_ptr_ = (char*)out_ptr; + numerator_ = 1; + denominator_ = 1; + } else { + auto& allocator = *c10::cuda::CUDACachingAllocator::get(); + buffer_ = allocator.allocate(size); + acc_ptr_ = (char*)buffer_.get(); + numerator_ = acc_t_size; + denominator_ = out_t_size; + reduce_fraction(numerator_, denominator_); + } + } + + char* get_acc_slice(char* out_ptr) { + if (acc_ptr_ == nullptr) { + return nullptr; + } + return acc_ptr_ + ((out_ptr - out_ptr_) * numerator_ / denominator_); + } + + private: + char* acc_ptr_ = nullptr; + char* out_ptr_ = nullptr; + size_t numerator_; + size_t denominator_; + at::DataPtr buffer_; +}; + +template +int get_output_vec_size(const TensorIterator &iter) { + int vec_size = 4; + auto update_vec_size = [&vec_size](uint64_t n) { + while(n % vec_size != 0) { + vec_size /= 2; + } + }; + + uint64_t base_address = reinterpret_cast(iter.data_ptr(iter.noutputs())) / sizeof(scalar_t); + update_vec_size(base_address); + + const int output_index = iter.num_reduce_dims(); + update_vec_size(iter.shape()[output_index]); + + int j = 0; + for(auto i : iter.strides(iter.noutputs())) { + if (j != output_index) { + update_vec_size(i / sizeof(scalar_t)); + } + j++; + } + return vec_size; +} + +template +ReduceConfig setReduceConfig(const TensorIterator& iter){ + // Start by assuming that each thread handles a single output and all + // the inputs for that output. + int64_t num_outputs = iter.num_output_elements(); + int64_t inputs_per_output = iter.numel() / num_outputs; + int input_index = iter.ntensors() - 1; + + auto config = ReduceConfig(sizeof(arg_t), num_outputs, inputs_per_output); + + int64_t dim0; + int64_t dim1; + int64_t fastest_moving_stride; + bool reduction_on_fastest_striding_dimension; + + if (iter.ndim() > 0) { + // Adjust block size to map block width to fastest changing dimension of input + // tensor. This grants the best possible memory accessing pattern, given that + // for non-contiguous tensor with space in between, we cannot have perfect + // memory coalescing. + reduction_on_fastest_striding_dimension = + (iter.num_reduce_dims() == iter.ndim()) || + (iter.strides(/*arg=*/input_index)[0] < + iter.strides(/*arg=*/input_index)[iter.num_reduce_dims()]); + // Notice that dim0 & dim1 does NOT guarantee any launch configuration here! + // dim0 & dim1 are more like the upper bound of the block dimension. The + // actual launch config and reduction scheme is determined by setting values + // to `config.input_mult` and `config.output_mult`. + // We try to max out dim1 so that we have enough threads per CTA to deliver + // performance for larger problem size. + if (reduction_on_fastest_striding_dimension) { + // Map block.x to the fastest reducing dimension. It implies: + // 1. block_x_reduce is required. + // 2. block.y now max out to num_outputs. + dim0 = inputs_per_output; + dim1 = num_outputs; + fastest_moving_stride = iter.strides(/*arg=*/input_index)[0]; + } else { + // Map block.x to the fastest non reducing dimension. It implies: + // 1. block_x_reduce is turned off. + // 2. block.y now max out to inputs_per_output. + dim0 = num_outputs; + dim1 = inputs_per_output; + fastest_moving_stride = iter.strides(/*arg=*/input_index)[iter.num_reduce_dims()]; + } + } else { + reduction_on_fastest_striding_dimension = true; + fastest_moving_stride = sizeof(scalar_t); + dim0 = 1; + dim1 = 1; + } + + // We do vectorization to gain better memory access, there are two cases which we call + // "vectorize along input" and "vectorize along output". Note that the "input/output" + // here does not mean we are vectorizing load/store instructions. We always only vectorize + // load instructions. + // + // Case 1: "vectorize along input" + // This case happens when we are reducing along fastest moving dimesion. In such case, threads + // with the same threadIdx.y works on the same reduction cooperatively and will produce results + // for the same output. In such case, values in each loaded vector always correspond to the same output. + // + // Case 2: "vectorize along output" + // This case happens when the fastest moving dimesion is not the dimension of reduction. In such case, + // threads with different threadIdx.x are independent and will produce results for different outputs. + // In such case, values in each loaded vector always correspond to different outputs. + if (fastest_moving_stride == sizeof(scalar_t)) { + if (reduction_on_fastest_striding_dimension && dim0 > 128 && iter.num_reduce_dims() == 1 && vt0 >= ReduceConfig::input_vec_size) { + // Case 1: "vectorize along input" + // Note that if vt0 < ReduceConfig::vec_size, then this means the register pressure could be high, in such case, + // we should avoid vectorization. + config.vectorize_input = true; + dim0 /= config.input_vec_size; + } else if (!reduction_on_fastest_striding_dimension) { + // Case 2: "vectorize along output" + config.output_vec_size = get_output_vec_size(iter); + dim0 /= config.output_vec_size; + } + } + + // Adjust block_width and block_height + config.set_block_dimension(dim0, dim1); + + int block_width = config.block_width; + int block_height = config.block_height; + + if (iter.ndim() == 0 || reduction_on_fastest_striding_dimension) { + // Split the input across lanes if the input is contiguous in the reduced + // dimension. This will require reduction between threads using warp + // shuffle instructions and shared memory (if block_width > warpSize). + config.input_mult[0] = config.split_input(block_width); + } else { + // Otherwise split the output across lanes in a warp. + config.output_mult[0] = config.split_output(block_width); + } + + constexpr int min_values_per_thread = 16; + constexpr int max_values_per_thread = 256; + + if (config.values_per_thread() >= block_height * 16 || config.values_per_thread() >= max_values_per_thread) { + // Divide the input across warps in a thread-block, if that leaves at least + // 16 elements to be summed by each thread. This will require inter-warp + // reduction using shared memory. + config.input_mult[1] = config.split_input(block_height); + } else { + // Otherwise, each warp handles a separate output. + config.output_mult[1] = config.split_output(block_height); + } + + const int blocks_per_sm = at::cuda::getCurrentDeviceProperties()->maxThreadsPerMultiProcessor / config.num_threads; + const int num_mp = at::cuda::getCurrentDeviceProperties()->multiProcessorCount; + const int target_grid_size = num_mp * blocks_per_sm; + int grid = config.grid().x; + if (config.input_mult[1] != 0 && config.values_per_thread() >= max_values_per_thread && grid <= target_grid_size) { + // Divide the input across thread-blocks if the amount of work per-thread + // is large enough and the size of the output is small enough. This will + // require a reduction using global memory. + // If we decide to split input across blocks, as long as we can get enough + // number of blocks (`target_grid_size`) to balance SM, we should still + // make the number of values per thread large for best performance. + int ctas_per_output1 = div_up(target_grid_size, grid); + int ctas_per_output2 = div_up(config.values_per_thread(), min_values_per_thread); + int ctas_per_output3 = div_up(config.values_per_thread(), max_values_per_thread); + // We want the minimum of ctas_per_output1 and ctas_per_output2, so that each thread can have + // a large number of values to deal with. But we don't want values_per_thread to be larger than + // max_values_per_thread + config.ctas_per_output = std::max(std::min(ctas_per_output1, ctas_per_output2), ctas_per_output3); + if (config.ctas_per_output > 1) { + config.input_mult[2] = config.split_input(config.ctas_per_output); + } + } + return config; +}; + +template +inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t ident=0, + AccumulationBuffer* acc_buf_ptr=nullptr, int64_t base_idx=0) { + AT_ASSERT(iter.numel() > 0 && iter.ntensors() - iter.noutputs() == 1 && iter.noutputs() >= 1); + + using traits = function_traits; + using arg_t = typename traits::template arg<0>::type; + // at::Half/at::ComplexHalf overflows easily as it's range is very small. + // So when scalar_t and out_scalar_t are at::Half/at::ComplexHalf, we + // set can_accumulate_in_output to False. + static constexpr bool is_inp_out_type_half_or_chalf = + (std::is_same::value && + std::is_same::value) || + (std::is_same, scalar_t>::value && + std::is_same, out_scalar_t>::value); + // at::BFloat16 has lower precision and can lead to rounding errors. + // So when scalar_t and out_scalar_t are at::BFloat16, we + // set can_accumulate_in_output to False. + static constexpr bool is_inp_out_type_bfloat16 = + (std::is_same::value && + std::is_same::value); + static constexpr bool can_accumulate_in_output = + std::is_convertible::value && + !(is_inp_out_type_half_or_chalf || is_inp_out_type_bfloat16); + + bool can_use_32bit_indexing = iter.can_use_32bit_indexing(); + std::unique_ptr owned_buf_ptr; + // The acc_buf_ptr is a shared pointer. It is create at the first entrance and + // reused by all recursive function calls. + if (acc_buf_ptr == NULL) { + // acc_buf_ptr holds buffer used for accumulation among multiple sub_iter + // when accumulation in output is not possible. + if (!can_accumulate_in_output && !can_use_32bit_indexing) { + int64_t output_memory_size = iter.element_size(0); + for (int dim = 0; dim < iter.ndim(); dim++) { + output_memory_size = std::max(output_memory_size, iter.shape()[dim] * iter.strides(0)[dim]); + } + output_memory_size /= iter.element_size(0); //iter.strides is in bytes + owned_buf_ptr.reset(new AccumulationBuffer(sizeof(arg_t), + sizeof(out_scalar_t), + (char*) iter.data_ptr(0), + output_memory_size * sizeof(arg_t))); + } else { + owned_buf_ptr.reset(new AccumulationBuffer()); + } + acc_buf_ptr = owned_buf_ptr.get(); + } + + if (!can_use_32bit_indexing) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + int64_t sub_iter_base_idx = sub_iter.view_offsets()[0]; + + gpu_reduce_kernel(sub_iter, ops, ident, + acc_buf_ptr, sub_iter_base_idx); + } + return; + } + + const char* in_data = (char*)iter.data_ptr(iter.ntensors() - 1); + char* out_data = (char*)iter.data_ptr(0); + const auto noutputs = iter.noutputs(); + std::optional out_data_extra; + if (noutputs > 1) { + out_data_extra = (char*)iter.data_ptr(1); + } else { + out_data_extra = std::nullopt; + } + char* acc_data = acc_buf_ptr->get_acc_slice(out_data); + + ReduceConfig config = setReduceConfig(iter); + at::DataPtr buffer; + at::DataPtr semaphores; + if (config.should_global_reduce()) { + auto& allocator = *c10::cuda::CUDACachingAllocator::get(); + buffer = allocator.allocate(config.global_memory_size()); + semaphores = allocator.allocate(config.semaphore_size()); + + auto stream = at::cuda::getCurrentCUDAStream(); + AT_CUDA_CHECK(cudaMemsetAsync(semaphores.get(), 0, config.semaphore_size(), stream)); + } + + AT_ASSERT(can_use_32bit_indexing); + auto output_calc = make_output_calculator(iter); + auto input_calc = make_input_calculator(iter); + auto reduce = ReduceOp( + ops, + config, + input_calc, + output_calc, + in_data, + out_data, + out_data_extra, + acc_data, + buffer.get(), + (int*)semaphores.get(), + ident, + noutputs, + base_idx); + reduce.accumulate = iter.should_accumulate(); + reduce.final_output = iter.is_final_output(); + + launch_reduce_kernel::MAX_NUM_THREADS>(config, reduce); +} + +//TODO this is 100 lines of almost-copy-paste, because we have to have different template args for this function +//try unifying with gpu_reduce_kernel +template +inline void jitted_gpu_reduce_kernel(TensorIterator& iter, const std::string& func, ident_t ident=0, + AccumulationBuffer* acc_buf_ptr=nullptr, int64_t base_idx=0) { + AT_ASSERT(iter.numel() > 0 && iter.ntensors() - iter.noutputs() == 1 && iter.noutputs() >= 1); + + //TODO - this will be different for more complicated reductions, but for now reductions using + //func_wrapper all have arg_t = opmath + using arg_t = at::opmath_type; + // at::Half/at::ComplexHalf overflows easily as it's range is very small. + // So when scalar_t and out_scalar_t are at::Half/at::ComplexHalf, we + // set can_accumulate_in_output to False. + static constexpr bool is_inp_out_type_half_or_chalf = + (std::is_same::value && + std::is_same::value) || + (std::is_same, scalar_t>::value && + std::is_same, out_scalar_t>::value); + // at::BFloat16 has lower precision and can lead to rounding errors. + // So when scalar_t and out_scalar_t are at::BFloat16, we + // set can_accumulate_in_output to False. + static constexpr bool is_inp_out_type_bfloat16 = + (std::is_same::value && + std::is_same::value); + static constexpr bool can_accumulate_in_output = + std::is_convertible::value && + !(is_inp_out_type_half_or_chalf || is_inp_out_type_bfloat16); + + bool can_use_32bit_indexing = iter.can_use_32bit_indexing(); + std::unique_ptr owned_buf_ptr; + + // The acc_buf_ptr is a shared pointer. It is create at the first entrance and + // reused by all recursive function calls. + if (acc_buf_ptr == NULL) { + // acc_buf_ptr holds buffer used for accumulation among multiple sub_iter + // when accumulation in output is not possible. + if (!can_accumulate_in_output && !can_use_32bit_indexing) { + int64_t output_memory_size = iter.element_size(0); + for (int dim = 0; dim < iter.ndim(); dim++) { + output_memory_size = std::max(output_memory_size, iter.shape()[dim] * iter.strides(0)[dim]); + } + output_memory_size /= iter.element_size(0); //iter.strides is in bytes + owned_buf_ptr.reset(new AccumulationBuffer(sizeof(out_scalar_t), //TODO + sizeof(out_scalar_t), + (char*) iter.data_ptr(0), + output_memory_size * sizeof(out_scalar_t))); //TODO + } else { + owned_buf_ptr.reset(new AccumulationBuffer()); + } + acc_buf_ptr = owned_buf_ptr.get(); + } + + if (!can_use_32bit_indexing) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + int64_t sub_iter_base_idx = sub_iter.view_offsets()[0]; + + jitted_gpu_reduce_kernel(sub_iter, func, ident, + acc_buf_ptr, sub_iter_base_idx); + } + return; + } + + //TODO - for now we support a single input, we may be able to relax this constraint + const char* in_data = (char*)iter.data_ptr(iter.ntensors() - 1); + char* out_data = (char*)iter.data_ptr(0); + const auto noutputs = iter.noutputs(); + std::optional out_data_extra; + if (noutputs > 1) { + out_data_extra = (char*)iter.data_ptr(1); + } else { + out_data_extra = std::nullopt; + } + char* acc_data = acc_buf_ptr->get_acc_slice(out_data); + + ReduceConfig config = setReduceConfig(iter); + + at::DataPtr buffer; + at::DataPtr semaphores; + if (config.should_global_reduce()) { + auto& allocator = *c10::cuda::CUDACachingAllocator::get(); + buffer = allocator.allocate(config.global_memory_size()); + semaphores = allocator.allocate(config.semaphore_size()); + + auto stream = at::cuda::getCurrentCUDAStream(); + AT_CUDA_CHECK(cudaMemsetAsync(semaphores.get(), 0, config.semaphore_size(), stream)); + } + + AT_ASSERT(can_use_32bit_indexing); + auto output_calc = make_output_calculator(iter); + auto input_calc = make_input_calculator(iter); + auto reduce = ReduceJitOp( + config, + input_calc, + output_calc, + in_data, + out_data, + out_data_extra, + acc_data, + buffer.get(), + (int*)semaphores.get(), + ident, + noutputs, + base_idx); + reduce.accumulate = iter.should_accumulate(); + reduce.final_output = iter.is_final_output(); + + constexpr int nInputs = 1; + constexpr int nOutputs = 1; + static auto desc = at::cuda::jit::make_kernel_descriptor< + out_scalar_t, scalar_t>(name, func, nInputs, nOutputs); + + static std::mutex jiterator_mutex; + static std::vector> fn_cache(c10::cuda::device_count()); + auto &cache = fn_cache[iter.device().index()]; + + launch_jitted_reduce_kernel( + jiterator_mutex, cache, desc, vt0, config, &reduce); +} + +}} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/ReduceOps.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/ReduceOps.h new file mode 100644 index 0000000000000000000000000000000000000000..a67a019ae49e2ec73be935aa4cae703417e9bb19 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/ReduceOps.h @@ -0,0 +1,20 @@ + +namespace at { +struct TensorIterator; +} + +namespace c10 { +class Scalar; +} + +namespace at { namespace native { + +void norm_launch_kernel(TensorIterator &iter, double val); +void min_launch_kernel(TensorIterator &iter); +void max_launch_kernel(TensorIterator &iter); +void aminmax_launch_kernel(TensorIterator &iter); +void min_all_launch_kernel(TensorIterator &iter); +void max_all_launch_kernel(TensorIterator &iter); +void aminmax_allreduce_launch_kernel(TensorIterator &iter); + +}} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/ScanKernels.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/ScanKernels.h new file mode 100644 index 0000000000000000000000000000000000000000..28e65372511bc7b50390a134e01554b6fa9ee171 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/ScanKernels.h @@ -0,0 +1,18 @@ +#pragma once +#include + +namespace at { +class TensorBase; + +namespace native { + +// NOTE: these functions require output tensors to be contiguous +void launch_cummax_cuda_kernel(const TensorBase& self, const TensorBase& values, + const TensorBase& indices, int64_t dim); +void launch_cummin_cuda_kernel(const TensorBase& self, const TensorBase& values, + const TensorBase& indices, int64_t dim); +void launch_logcumsumexp_cuda_kernel(const TensorBase& result, const TensorBase& self, int64_t dim); +void launch_cumsum_cuda_kernel(const TensorBase& result, const TensorBase& self, int64_t dim); +void launch_cumprod_cuda_kernel(const TensorBase& result, const TensorBase& self, int64_t dim); + +}} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Sort.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Sort.h new file mode 100644 index 0000000000000000000000000000000000000000..656b4ce2c2bbac167457f31e8f554a5e409a2940 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Sort.h @@ -0,0 +1,17 @@ +#pragma once +#include +#include +#include + +namespace at { +namespace native { + +inline bool should_use_small_sort(const TensorBase &self, int64_t dim) { + return self.size(dim) <= 4096; +} + +void sortKeyValueInplace( + const TensorBase &key, const TensorBase &value, int dim, + bool descending, bool stable=false); + +}} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/TensorModeKernel.cuh b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/TensorModeKernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..1aefc2474fdfc5cab69c89a8fdb1b8be982394e8 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/TensorModeKernel.cuh @@ -0,0 +1,435 @@ +#pragma once + +#include +#include +#include +#include + +namespace at { +namespace native { + +// Used for a segmented reduction +struct ModeUnsignedBoolPair { + unsigned int val; + bool flag; +}; + +// In the kernel below, we have a common pattern of reducing (unsigned int, +// unsigned int) pairs of data +struct ModeUnsignedPair { + unsigned int val; + unsigned int index; +}; + +// Inclusive Scan via an upsweep/downsweep mechanism. Assumes: +// +// 1. Power2ScanSize is a power of 2. This code still works for collections that +// do not exactly contain a power of 2 number of elements, simply round up to +// the nearest power of 2 and then call. +// +// 2. That there are two-elements per thread, i.e. the size of the smem storage +// is 2 * blockDim.x * sizeof(T). +// +// Consider a (+)-Scan on the following elements: +// +// Upsweep: +// +// 0 1 2 3 4 5 6 7 +// 1 5 9 13 +// 6 22 +// 28 +// +// Downsweep: +// 15 +// 3 10 21 +template +__device__ void inclusivePrefixScan(T* smem, BinaryOp binop) { + // Reduce step ("upsweep") +#pragma unroll + for (int stride = 1; stride < Power2ScanSize; stride <<= 1) { + int index = (threadIdx.x + 1) * stride * 2 - 1; + if (index < Power2ScanSize) { + smem[index] = binop(smem[index], smem[index - stride]); + } + __syncthreads(); + } + + // Post-reduce step ("downsweep") +#pragma unroll + for (int stride = Power2ScanSize / 4; stride > 0; stride >>= 1) { + int index = (threadIdx.x + 1) * stride * 2 - 1; + if ((index + stride) < Power2ScanSize) { + smem[index + stride] = binop(smem[index + stride], smem[index]); + } + __syncthreads(); + } +} + +// Block-wide reduction where each thread locally reduces N +// values before letting a single warp take over - assumes +// threadVals is in registers, not shared memory +// +// If smem is not used again, there is no need to __syncthreads before this +// call. However, if smem will be used, e.g., this function is called in a loop, +// then __syncthreads is needed either before or afterwards to prevent non-0 +// threads overriding smem in the next loop before num-0 thread reads from it. +template +__device__ T reduceBlockWithNThreadLocalReductions( + T* smem, + T threadVals[N], + const unsigned int numVals, + ReduceOp reduceOp, + T init) { + int offset = threadIdx.x * N; + T local = offset < numVals ? threadVals[0] : init; + +#pragma unroll + for (int i = 1; i < N; ++i) { + ++offset; + T next = offset < numVals ? threadVals[i] : init; + local = reduceOp.combine(local, next); + } + + return cuda_utils::BlockReduce(local, reduceOp, init, smem); +} + +template +__device__ inline void swapVars(T& t1, T& t2) { + T tmp = t1; + t1 = t2; + t2 = tmp; +} + +template +__device__ inline void bitonicSwap( + K& kA, + V& vA, + bool& validA, + K& kB, + V& vB, + bool& validB, + bool dir, + const Comparator& comp) { + // Invalid entries always sort to the end + bool swap = (comp(kA, kB) && validA) || !validB; + if (swap == dir) { + swapVars(kA, kB); + swapVars(vA, vB); + swapVars(validA, validB); + } +}; + +template +__device__ inline void bitonicSwapKeys( + K& kA, + bool& validA, + K& kB, + bool& validB, + bool dir, + const Comparator& comp) { + bool swap = (comp(kA, kB) && validA) || !validB; + if (swap == dir) { + swapVars(kA, kB); + swapVars(validA, validB); + } +} + +template < + typename K, + typename IndexType, + int Power2SortSize, + typename Comparator> +__device__ inline void bitonicSortKeys( + K keys[Power2SortSize], + bool valid[Power2SortSize], + const Comparator& comp) { +#if !defined(USE_ROCM) +#pragma unroll +#endif + for (unsigned int size = 2; size < Power2SortSize; size *= 2) { + bool flag = ((threadIdx.x & (size / 2)) != 0); + +#if !defined(USE_ROCM) +#pragma unroll +#endif + for (unsigned int stride = size / 2; stride > 0; stride /= 2) { + __syncthreads(); + + unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); + bitonicSwapKeys( + keys[pos], + valid[pos], + keys[pos + stride], + valid[pos + stride], + flag, + comp); + } + } + +#if !defined(USE_ROCM) +#pragma unroll +#endif + for (unsigned int stride = Power2SortSize / 2; stride > 0; stride /= 2) { + __syncthreads(); + + unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); + bitonicSwapKeys( + keys[pos], + valid[pos], + keys[pos + stride], + valid[pos + stride], + false, + comp); + } + + __syncthreads(); +} + +// The mode kernel has the following characteristics: It uses internal shared +// memory buffers of Power2Size, which must be greater than the number of +// elements. Additionally, there is one block for every slice to calculate the +// mode for, and in each block there is one thread for every two elements. +// +// Both sorted and positions are assumed to be contiguous Tensors with the mode +// dimension as the innermost dim, such that we can get the particular slice for +// a Tensor via its linear block dimension * the slice size. +template +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11070 +__launch_bounds__(1024, 1) +#endif +__global__ void compute_mode( + const T* input, + at::cuda::detail::TensorInfo values, + at::cuda::detail::TensorInfo indices, + int64_t sliceSize, + int64_t slices) { + int tidx = threadIdx.x; + int stidx = blockDim.x + threadIdx.x; // Second index this thread responsible for + + // First, we need to calculate the offset into the sorted Tensor that + // represents the start of the slice for this block to calculate the mode for. + // This offset is a combination of the gridIndices, and the number of elements + // in the slice. + unsigned int blockId = getLinearBlockId(); + unsigned int linearOffset = blockId * sliceSize; + + if (blockId >= slices) { + return; + } + + // shmem is a dynamically sized buffer we will use throughout the kernel to + // handle computation efficiently. The size of this shmem must be + // sizeof(T) * Power2Size + (2 * sizeof(unsigned int) * Power2Size) + // + // Initially, the buffer will be organized as follows: + // + // [smem (slice elements) | bmem (valid indices) | ] + extern __shared__ char shmem[]; + + // smem represents a proportion of the shared memory buffer that is used to + // store the elements from the slice: + T* smem = reinterpret_cast(shmem); + + // Each thread loads up to two elements from the Tensor into shared memory + if (tidx < sliceSize) { + smem[tidx] = c10::load(&input[linearOffset + tidx]); + } + if (stidx < sliceSize) { + smem[stidx] = c10::load(&input[linearOffset + stidx]); + } + + // Next, we initialize a boolean region of the buffer, offset by the loaded + // element smem region + bool* bmem = reinterpret_cast(&smem[Power2Size]); + + // The first use of this region stores bmem[i] = i < sliceSize to mark the + // valid components in the smem buffer + bmem[tidx] = tidx < sliceSize; + bmem[stidx] = stidx < sliceSize; + __syncthreads(); // barrier for smem, bmem initialization + + // First, sort the input slice in ascending order. smem contains the input + // elements, and bmem marks the valid indices + bitonicSortKeys( + smem, bmem, [&] GPU_LAMBDA(const auto& a, const auto& b) { + return a < b; + }); + __syncthreads(); // make no assumptions that the sort syncs at end + + // The next step of our algorithm is performing a block-wide comparison of + // neighboring elements. In particular, given an sorted input slice A, we + // produce an output slice B, such that B[i] = 1 if A[i-i] != A[i], otherwise + // 0. + // + // Given the input A = [0, 0, 1, 1, 2, 2, 2, 4, 5, 6, 6, 7, 8] + // B = [1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1] + // + // In particular, we can think of B[i] true indicating the start of a sequence + // of equal values in the sorted list. Similarly, we will also store the + // negation of B, which we'll call C. In particular, we can think of C[i] = + // true iff A[i-1] == A[i] in our original sorted slice. + // + // C = [0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0] + + // We overwrite bmem, and treat the rest of shared memory as a buffer of + // (index, flag) pairs where the index represents values from C, and the flag + // represents values from B. + // + // [smem (sorted slice) | ubpmem (index, flag pairs)] + + struct ModeUnsignedBoolPair* ubpmem = + reinterpret_cast(&smem[Power2Size]); + + if (tidx == 0) { + ubpmem[0].flag = true; + ubpmem[0].val = 0; + } + + // Compares elements (0, 1), (2, 3), ... and sets 1, 3, ... + ubpmem[tidx * 2 + 1].flag = + smem[tidx * 2] != smem[tidx * 2 + 1]; // (0, 1), (1, 2), etc. + ubpmem[tidx * 2 + 1].val = !ubpmem[tidx * 2 + 1].flag; + + // Compares elements (1, 2), (3, 4), ... and sets 2, 4, ... + if (((tidx + 1) * 2) < Power2Size) { + ubpmem[(tidx + 1) * 2].flag = + smem[((tidx + 1) * 2) - 1] != smem[(tidx + 1) * 2]; + ubpmem[(tidx + 1) * 2].val = !ubpmem[(tidx + 1) * 2].flag; + } + __syncthreads(); // barrier for ubpmem initialization + + // Next, we perform a segmented prefix sum on the neighboring elements, where + // the presence of a one indicates the start of a segment. In this case B acts + // as the segment start flags, and C is the buffer to be summed: + // + // Input (C) = [0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0] + // Flag (B) = [1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1] + // Output (C) = [0, 1, 0, 1, 0, 1, 2, 0, 0, 0, 1, 0, 0] + // + // Afterwards, the (index) components of the ubpmem buffer contain the lengths + // of the segments (minus 1), i.e. the counts of each element in the original + // input. + inclusivePrefixScan( + ubpmem, [=] GPU_LAMBDA(const auto& a, const auto& b) { + ModeUnsignedBoolPair c; + c.val = a.flag ? a.val : a.val + b.val; + c.flag = a.flag | b.flag; + return c; + }); + // assumes scan syncs at the end + + // Next, we reinterpret the ubpmem buffer as pairs of unsigned integers (i.e. + // we treat the boolean flag regions as integers). We initialize these to + // represent indices, and we'll call this buffer I + struct ModeUnsignedPair* uupmem = + reinterpret_cast(ubpmem); + + // At this point, we need to find the maximum element in lengths buffer C. + // This element will represent the count (-1) of the mode. Because of the + // way we have set up the problem, the index where this mode occurs will + // also be the location of the mode value in the sorted array, e.g. + // + // smem = [0, 0, 1, 1, 1, 2] + // C = [0, 1, 0, 1, 2, 0] + // I = [0, 1, 2, 3, 4, 5] + // ^ + // maximum value, also aligned with mode = 1 + // + // We perform a block wide max-reduction of the C buffer, but we also need the + // indices to come along with it, so we utilize the uupmem construction. + // + // At the end we need to return the ModeUnsignedPair containing index = 4, val + // = 2, which represents the max + + // In practice, we will make each thread locally reduce 2 values in its + // registers prior to the global block-wide reduction. Note that instead of + // tidx/stidx, we utilize tidx * 2, tidx * 2 + 1, so each thread deals with + // adjacent elements. This is because the reduce code below relies on thread + // elements to be adjacent. + struct ModeUnsignedPair uup[2]; + uup[0].index = tidx * 2; + uup[0].val = ubpmem[tidx * 2].val; + uup[1].index = tidx * 2 + 1; + uup[1].val = ubpmem[tidx * 2 + 1].val; + __syncthreads(); + + struct ModeUnsignedPair max = {0, 0}; + + struct MaxOp { + inline __device__ ModeUnsignedPair combine(ModeUnsignedPair a, ModeUnsignedPair b) const { + return b.val > a.val ? b : a; + } + + inline __device__ ModeUnsignedPair warp_shfl_down(ModeUnsignedPair acc, int offset) const { + ModeUnsignedPair ret; + ret.index = WARP_SHFL_DOWN(acc.index, offset); + ret.val = WARP_SHFL_DOWN(acc.val, offset); + return ret; + } + } max_op; + + max = reduceBlockWithNThreadLocalReductions<2>( + uupmem, + uup, + sliceSize, + max_op, + max); + + // Store the mode in shared memory for use in finding the mode in the input + // slice + __shared__ T mode; + + // Given the above constraints, the mode is the value at the reduced index in + // the original sorted element buffer + if (tidx == 0) { + mode = smem[max.index]; + } + __syncthreads(); // broadcast mode + + // Finally, we need to find "an" index of the mode in the input + // Tensor. The API does not constrain which index we pick, but here + // we always pick the largest index. We store the index if the value + // is the mode, or 0 otherwise. Then find the maximum value. + // + // Again we reduce 2 elements in the thread's registers prior to the + // block-wide reduction + unsigned mode_index[2] = {0u, 0u}; + if (tidx * 2 < sliceSize) { + const unsigned idx = tidx * 2; + mode_index[0] = c10::load(&input[linearOffset + idx]) == mode ? idx : 0u; + } + if (tidx * 2 + 1 < sliceSize) { + const unsigned idx = tidx * 2 + 1; + mode_index[1] = c10::load(&input[linearOffset + idx]) == mode ? idx : 0u; + } + + struct MaxIndexOp { + inline __device__ unsigned combine(unsigned a, unsigned b) const { + return b > a ? b : a; + } + + inline __device__ unsigned warp_shfl_down(unsigned acc, int offset) const { + return WARP_SHFL_DOWN(acc, offset); + } + } max_index_op; + + int64_t index = reduceBlockWithNThreadLocalReductions<2>( + reinterpret_cast(&shmem[0]), + mode_index, + sliceSize, + max_index_op, + 0u); + + // Finally, we have the mode, and an index where it occurs. We use a single + // thread to place this in the appropriate output position + if (tidx == 0) { + unsigned int outputOffset = + at::cuda::detail::IndexToOffset::get( + blockId, values); + values.data[outputOffset] = mode; + indices.data[outputOffset] = index; + } +} + +} // namespace native +} // namespace at diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/TensorModeKernel.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/TensorModeKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..b5660747997d4eb1ad56f79ec2d1f519921c05c2 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/TensorModeKernel.h @@ -0,0 +1,19 @@ +#pragma once +#include + +namespace at { +class TensorBase; +} + +namespace at { +namespace native { + +void launch_fused_mode_kernel( + const TensorBase &values, const TensorBase &indices, + const TensorBase &self, int64_t slice_size, int64_t slices); + +void launch_apply_mode_kernel( + const TensorBase &values, const TensorBase &indices, + const TensorBase &self, int64_t dim, int64_t ndim); + +}} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/UniqueCub.cuh b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/UniqueCub.cuh new file mode 100644 index 0000000000000000000000000000000000000000..6e1cccc2e175cb26b1ee12690d67d1514e95f246 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/UniqueCub.cuh @@ -0,0 +1,16 @@ +#include + +namespace at { +namespace native { +namespace internal { + +template +std::tuple unique_cuda_template( + const Tensor& self, + const bool consecutive, + const bool return_inverse, + const bool return_counts); + +} // namespace internal +} // namespace at +} // namespace native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adam_amsgrad_impl.cuh b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adam_amsgrad_impl.cuh new file mode 100644 index 0000000000000000000000000000000000000000..d95eaaeccee2e93a675728e5d8e810403caca978 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adam_amsgrad_impl.cuh @@ -0,0 +1,40 @@ +#pragma once +#include + +namespace at { +namespace native { + +void _fused_adam_amsgrad_cuda_impl_( + at::TensorList params, + at::TensorList grads, + at::TensorList exp_avgs, + at::TensorList exp_avg_sqs, + at::TensorList max_exp_avg_sqs, + at::TensorList state_steps, + const double lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool maximize, + const std::optional& grad_scale, + const std::optional& found_inf); + +void _fused_adam_amsgrad_cuda_impl_( + at::TensorList params, + at::TensorList grads, + at::TensorList exp_avgs, + at::TensorList exp_avg_sqs, + at::TensorList max_exp_avg_sqs, + at::TensorList state_steps, + const at::Tensor& lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool maximize, + const std::optional& grad_scale, + const std::optional& found_inf); + +} // namespace native +} // namespace at diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adam_impl.cuh b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adam_impl.cuh new file mode 100644 index 0000000000000000000000000000000000000000..8369496fb31d74e2a1331f9f88c8cea4b99f1c6f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adam_impl.cuh @@ -0,0 +1,38 @@ +#pragma once +#include + +namespace at { +namespace native { + +void _fused_adam_cuda_impl_( + at::TensorList params, + at::TensorList grads, + at::TensorList exp_avgs, + at::TensorList exp_avg_sqs, + at::TensorList state_steps, + const double lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool maximize, + const std::optional& grad_scale, + const std::optional& found_inf); + +void _fused_adam_cuda_impl_( + at::TensorList params, + at::TensorList grads, + at::TensorList exp_avgs, + at::TensorList exp_avg_sqs, + at::TensorList state_steps, + const at::Tensor& lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool maximize, + const std::optional& grad_scale, + const std::optional& found_inf); + +} // namespace native +} // namespace at diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adamw_amsgrad_impl.cuh b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adamw_amsgrad_impl.cuh new file mode 100644 index 0000000000000000000000000000000000000000..4d1fa405ab9639bd2a449fd3c7ced53af628a7e9 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adamw_amsgrad_impl.cuh @@ -0,0 +1,40 @@ +#pragma once +#include + +namespace at { +namespace native { + +void _fused_adamw_amsgrad_cuda_impl_( + at::TensorList params, + at::TensorList grads, + at::TensorList exp_avgs, + at::TensorList exp_avg_sqs, + at::TensorList max_exp_avg_sqs, + at::TensorList state_steps, + const double lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool maximize, + const std::optional& grad_scale, + const std::optional& found_inf); + +void _fused_adamw_amsgrad_cuda_impl_( + at::TensorList params, + at::TensorList grads, + at::TensorList exp_avgs, + at::TensorList exp_avg_sqs, + at::TensorList max_exp_avg_sqs, + at::TensorList state_steps, + const at::Tensor& lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool maximize, + const std::optional& grad_scale, + const std::optional& found_inf); + +} // namespace native +} // namespace at diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adamw_impl.cuh b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adamw_impl.cuh new file mode 100644 index 0000000000000000000000000000000000000000..32d580c31d118e1888f591ee7f74ee48bce462e7 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adamw_impl.cuh @@ -0,0 +1,38 @@ +#pragma once +#include + +namespace at { +namespace native { + +void _fused_adamw_cuda_impl_( + at::TensorList params, + at::TensorList grads, + at::TensorList exp_avgs, + at::TensorList exp_avg_sqs, + at::TensorList state_steps, + const double lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool maximize, + const std::optional& grad_scale, + const std::optional& found_inf); + +void _fused_adamw_cuda_impl_( + at::TensorList params, + at::TensorList grads, + at::TensorList exp_avgs, + at::TensorList exp_avg_sqs, + at::TensorList state_steps, + const at::Tensor& lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool maximize, + const std::optional& grad_scale, + const std::optional& found_inf); + +} // namespace native +} // namespace at diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/reduction_template.cuh b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/reduction_template.cuh new file mode 100644 index 0000000000000000000000000000000000000000..6d1e861493d4f0fe2d0038c280f35741ff3add5e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/reduction_template.cuh @@ -0,0 +1,681 @@ +namespace at { +namespace cuda { +//windows doesn't like large string literals, so split in two +const std::string reduction_template_0 = R"ESCAPE( + #define C10_HOST_DEVICE __host__ __device__ + #define C10_DEVICE __device__ + #if defined(__clang__) && defined(__HIP__) + #ifndef __forceinline__ + #define __forceinline__ inline __attribute__((always_inline)) + #endif + // until ROCm support for kernel asserts is restored + #define assert(expr) (static_cast(0)) + #endif + + template + __device__ __forceinline__ T WARP_SHFL_DOWN(T value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff) + { + #if defined(__clang__) && defined(__HIP__) + return __shfl_down(value, delta, width); + #else + return __shfl_down_sync(mask, value, delta, width); + #endif + } + + + #if ${complex} + template + __device__ __forceinline__ std::complex WARP_SHFL_DOWN(std::complex value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff) + { + return std::complex( + #if defined(__clang__) && defined(__HIP__) + __shfl_down(value.real(), delta, width), + __shfl_down(value.imag(), delta, width)); + #else + __shfl_down_sync(mask, value.real(), delta, width), + __shfl_down_sync(mask, value.imag(), delta, width)); + #endif + } + #endif + + // aligned vector generates vectorized load/store on CUDA + template + struct alignas(sizeof(scalar_t) * vec_size) aligned_vector { + scalar_t val[vec_size]; + }; + + + C10_HOST_DEVICE static void reduce_fraction(size_t &numerator, size_t &denominator) { + // get GCD of num and denom using Euclid's algorithm. + // Can replace this with std::gcd if we ever support c++17. + size_t a = denominator; + size_t b = numerator; + while (b != 0) { + a %= b; + // swap(a,b) + size_t tmp = a; + a = b; + b = tmp; + } + + // a is now the GCD + numerator /= a; + denominator /= a; + } + + + + + struct ReduceConfig { + //has to match host-side ReduceConfig in the eager code + static constexpr int BLOCK_X = 0; + static constexpr int BLOCK_Y = 1; + static constexpr int CTA = 2; + + static constexpr int input_vec_size = 4; + int element_size_bytes; + int num_inputs; + int num_outputs; + int step_input = 1; + int step_output = 1; + int ctas_per_output = 1; + int input_mult[3] = {0, 0, 0}; + int output_mult[2] = {0, 0}; + + int block_width; + int block_height; + int num_threads; + + bool vectorize_input = false; + int output_vec_size = 1; + + C10_HOST_DEVICE bool should_block_x_reduce() const { + return input_mult[BLOCK_X] != 0; + } + + C10_HOST_DEVICE bool should_block_y_reduce() const { + return input_mult[BLOCK_Y] != 0; + } + + C10_HOST_DEVICE bool should_global_reduce() const { + return input_mult[CTA] != 0; + } + + C10_DEVICE bool should_store(int output_idx) const { + return output_idx < num_outputs && + (!should_block_x_reduce() || threadIdx.x == 0) && + (!should_block_y_reduce() || threadIdx.y == 0); + } + + C10_DEVICE bool should_reduce_tail() const { + return (!should_block_y_reduce() || threadIdx.y == 0) && + (!should_global_reduce() || blockIdx.y == 0); + } + + C10_HOST_DEVICE int input_idx() const { + int lane = threadIdx.x; + int warp = threadIdx.y; + int cta2 = blockIdx.y; + return (lane * input_mult[BLOCK_X] + + warp * input_mult[BLOCK_Y] + + cta2 * input_mult[CTA]); + } + + template + C10_HOST_DEVICE int output_idx() const { + int lane = threadIdx.x; + int warp = threadIdx.y; + int cta1 = blockIdx.x; + return (lane * output_mult[BLOCK_X] + + warp * output_mult[BLOCK_Y] + + cta1 * step_output) * output_vec_size; + } + + C10_DEVICE int shared_memory_offset(int offset) const { + return threadIdx.x + (threadIdx.y + offset) * blockDim.x; + } + + C10_DEVICE int staging_memory_offset(int cta2) const { + int offset = cta2 + blockIdx.x * gridDim.y; + if (!should_block_x_reduce()) { + offset = threadIdx.x + offset * blockDim.x; + } + return offset; + } + + + }; + + +//TODO this will need to be different for more generic reduction functions +namespace reducer { + + using scalar_t = ${scalar_type}; + using arg_t = ${reduction_accum_type}; + using out_scalar_t = ${result_type}; + + + inline __device__ ${functor} + + inline __device__ out_scalar_t project(arg_t arg) { + return (out_scalar_t) arg; + } + + inline __device__ arg_t warp_shfl_down(arg_t arg, int offset) { + return WARP_SHFL_DOWN(arg, offset); + } + + inline __device__ arg_t translate_idx(arg_t acc, int64_t /*idx*/) { + return acc; + } + + // wrap a normal reduction that ignores the index + inline __device__ arg_t reduce(arg_t acc, arg_t val, int64_t idx) { + return combine(acc, val); + } +} + + +struct ReduceJitOp { + using scalar_t = ${scalar_type}; + using arg_t = ${reduction_accum_type}; + using out_scalar_t = ${result_type}; + + using InputCalculator = OffsetCalculator<1>; + using OutputCalculator = OffsetCalculator<2>; + +// static constexpr bool can_accumulate_in_output = +// std::is_convertible::value +// && std::is_convertible::value; + + static constexpr int input_vec_size = ReduceConfig::input_vec_size; + + arg_t ident; + ReduceConfig config; + InputCalculator input_calc; + OutputCalculator output_calc; + const void* src; + const char* dst[2]; //it accepts at most two destinations + // acc_buf used for accumulation among sub Tensor Iterator when accumulation on + // output is not permissible + void* acc_buf; + // cta_buf used for accumulation between blocks during global reduction + void* cta_buf; + int* semaphores; + int64_t base_idx; + bool accumulate; + bool final_output; + int noutputs; + + + C10_DEVICE void run() const { + extern __shared__ char shared_memory[]; + uint32_t output_idx = config.output_idx<${output_vec_size}>(); + uint32_t input_idx = config.input_idx(); + auto base_offsets1 = output_calc.get(output_idx)[1]; + + using arg_vec_t = Array; + arg_vec_t value; + + if (output_idx < config.num_outputs && input_idx < config.num_inputs) { + const scalar_t* input_slice = (const scalar_t*)((const char*)src + base_offsets1); + + value = thread_reduce<${output_vec_size}>(input_slice); + } + + if (config.should_block_y_reduce()) { + value = block_y_reduce<${output_vec_size}>(value, shared_memory); + } + if (config.should_block_x_reduce()) { + value = block_x_reduce<${output_vec_size}>(value, shared_memory); + } + + using out_ptr_vec_t = Array; + using offset_vec_t = Array; + offset_vec_t base_offsets; + out_ptr_vec_t out; + + #pragma unroll + for (int i = 0; i < ${output_vec_size}; i++) { + base_offsets[i] = output_calc.get(output_idx + i)[0]; + out[i] = (out_scalar_t*)((char*)dst[0] + base_offsets[i]); + } + + arg_vec_t* acc = nullptr; + if (acc_buf != nullptr) { + size_t numerator = sizeof(arg_t); + size_t denominator = sizeof(out_scalar_t); + reduce_fraction(numerator, denominator); + acc = (arg_vec_t*)((char*)acc_buf + (base_offsets[0] * numerator / denominator)); + } + + if (config.should_global_reduce()) { + value = global_reduce<${output_vec_size}>(value, acc, shared_memory); + } else if (config.should_store(output_idx)) { + if (accumulate) { + #pragma unroll + for (int i = 0; i < ${output_vec_size}; i++) { + value[i] = reducer::translate_idx(value[i], base_idx); + } + } + + if (acc == nullptr) { + if (accumulate) { + value = accumulate_in_output<${output_vec_size}>(out, value); + } + if (final_output) { + set_results_to_output<${output_vec_size}>(value, base_offsets); + } else { + #pragma unroll + for (int i = 0; i < ${output_vec_size}; i++) { + *(out[i]) = get_accumulated_output(out[i], value[i]); + } + } + } else { + if (accumulate) { + #pragma unroll + for (int i = 0; i < ${output_vec_size}; i++) { + value[i] = reducer::combine((*acc)[i], value[i]); + } + } + if (final_output) { + set_results_to_output<${output_vec_size}>(value, base_offsets); + } else { + *acc = value; + } + } + } + } + + template + C10_DEVICE Array thread_reduce(const scalar_t* data) const { + if (config.vectorize_input) { + assert(output_vec_size == 1); + // reduce at the header of input_slice where memory is not aligned, + // so that thread_reduce will have an aligned memory to work on. + return {input_vectorized_thread_reduce_impl(data)}; + } else { + uint32_t element_stride = input_calc.strides_[0][0] / sizeof(scalar_t); + bool is_contiguous = (input_calc.dims == 1 && element_stride == 1); + if (is_contiguous) { + return thread_reduce_impl(data, [](uint32_t idx) { return idx; }); + } else if (input_calc.dims == 1) { + return thread_reduce_impl(data, [&](uint32_t idx) { return idx * element_stride; }); + } else { + return thread_reduce_impl(data, [&](uint32_t idx) { return input_calc.get(idx)[0] / sizeof(scalar_t); }); + } + } + } + + C10_DEVICE arg_t input_vectorized_thread_reduce_impl(const scalar_t* data) const { + uint32_t end = config.num_inputs; + + // Handle the head of input slice where data is not aligned + arg_t value = ident; + constexpr int align_bytes = alignof(aligned_vector); + constexpr int align_elements = align_bytes / sizeof(scalar_t); + int shift = ((int64_t)data) % align_bytes / sizeof(scalar_t); + if (shift > 0) { + data -= shift; + end += shift; + if(threadIdx.x >= shift && threadIdx.x < align_elements && config.should_reduce_tail()){ + value = reducer::reduce(value, data[threadIdx.x], threadIdx.x - shift); + } + end -= align_elements; + data += align_elements; + shift = align_elements - shift; + } + + // Do the vectorized reduction + using load_t = aligned_vector; + + uint32_t idx = config.input_idx(); + const uint32_t stride = config.step_input; + + // Multiple accumulators to remove dependency between unrolled loops. + arg_t value_list[input_vec_size]; + value_list[0] = value; + + #pragma unroll + for (int i = 1; i < input_vec_size; i++) { + value_list[i] = ident; + } + + scalar_t values[input_vec_size]; + + load_t *values_vector = reinterpret_cast(&values[0]); + + while (idx * input_vec_size + input_vec_size - 1 < end) { + *values_vector = reinterpret_cast(data)[idx]; + #pragma unroll + for (uint32_t i = 0; i < input_vec_size; i++) { + value_list[i] = reducer::reduce(value_list[i], values[i], shift + idx * input_vec_size + i); + } + idx += stride; + } + + // tail + uint32_t tail_start = end - end % input_vec_size; + if (config.should_reduce_tail()) { + int idx = tail_start + threadIdx.x; + if (idx < end) { + value_list[0] = reducer::reduce(value_list[0], data[idx], idx + shift); + } + } + + // combine accumulators + #pragma unroll + for (int i = 1; i < input_vec_size; i++) { + value_list[0] = reducer::combine(value_list[0], value_list[i]); + } + return value_list[0]; + } + + template + C10_DEVICE Array thread_reduce_impl(const scalar_t* data_, offset_calc_t calc) const { + uint32_t idx = config.input_idx(); + const uint32_t end = config.num_inputs; + const uint32_t stride = config.step_input; + const int vt0=${vt0}; + + using arg_vec_t = Array; + using load_t = aligned_vector; + const load_t* data = reinterpret_cast(data_); + + // Multiple accumulators to remove dependency between unrolled loops. + arg_vec_t value_list[vt0]; + + #pragma unroll + for (int i = 0; i < vt0; i++) { + #pragma unroll + for (int j = 0; j < output_vec_size; j++) { + value_list[i][j] = ident; + } + } + + load_t values[vt0]; + + while (idx + (vt0 - 1) * stride < end) { + #pragma unroll + for (uint32_t i = 0; i < vt0; i++) { + values[i] = data[calc(idx + i * stride) / output_vec_size]; + } + #pragma unroll + for (uint32_t i = 0; i < vt0; i++) { + #pragma unroll + for (uint32_t j = 0; j < output_vec_size; j++) { + value_list[i][j] = reducer::reduce(value_list[i][j], values[i].val[j], idx + i * stride); + } + } + idx += stride * vt0; + } + + // tail + int idx_ = idx; + #pragma unroll + for (uint32_t i = 0; i < vt0; i++) { + if (idx >= end) { + break; + } + values[i] = data[calc(idx) / output_vec_size]; + idx += stride; + } + idx = idx_; + #pragma unroll + for (uint32_t i = 0; i < vt0; i++) { + if (idx >= end) { + break; + } + #pragma unroll + for (uint32_t j = 0; j < output_vec_size; j++) { + value_list[i][j] = reducer::reduce(value_list[i][j], values[i].val[j], idx); + } + idx += stride; + } + + // combine accumulators + #pragma unroll + for (int i = 1; i < vt0; i++) { + #pragma unroll + for (uint32_t j = 0; j < output_vec_size; j++) { + value_list[0][j] = reducer::combine(value_list[0][j], value_list[i][j]); + } + } + return value_list[0]; + } + template + C10_DEVICE Array block_x_reduce(Array value, char* shared_memory) const { + using args_vec_t = Array; + int dim_x = blockDim.x; + args_vec_t* shared = (args_vec_t*)shared_memory; + if (dim_x > warpSize) { + int address_base = threadIdx.x + threadIdx.y*blockDim.x; + shared[address_base] = value; + for (int offset = dim_x/2; offset >= warpSize; offset >>= 1) { + __syncthreads(); + if (threadIdx.x < offset && threadIdx.x + offset < blockDim.x) { + args_vec_t other = shared[address_base + offset]; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = reducer::combine(value[i], other[i]); + } + shared[address_base] = value; + } + } + dim_x = warpSize; + } + + __syncthreads(); + + for (int offset = 1; offset < dim_x; offset <<= 1) { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + arg_t other = reducer::warp_shfl_down(value[i], offset); + value[i] = reducer::combine(value[i], other); + } + } + return value; + } + + template + C10_DEVICE Array block_y_reduce(Array value, char* shared_memory) const { + using args_vec_t = Array; + args_vec_t* shared = (args_vec_t*)shared_memory; + shared[config.shared_memory_offset(0)] = value; + for (int offset = blockDim.y / 2; offset > 0; offset >>= 1) { + __syncthreads(); + if (threadIdx.y < offset && threadIdx.y + offset < blockDim.y) { + args_vec_t other = shared[config.shared_memory_offset(offset)]; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = reducer::combine(value[i], other[i]); + } + shared[config.shared_memory_offset(0)] = value; + } + } + return value; + } + )ESCAPE"; + + const std::string reduction_template_1 = R"ESCAPE( + + C10_DEVICE bool mark_block_finished() const { + __shared__ bool is_last_block_done_shared; + + __syncthreads(); + if (threadIdx.x == 0 && threadIdx.y == 0) { + int prev_blocks_finished = atomicAdd(&semaphores[blockIdx.x], 1); + is_last_block_done_shared = (prev_blocks_finished == gridDim.y - 1); + } + + __syncthreads(); + + return is_last_block_done_shared; + } + + template + C10_DEVICE Array accumulate_in_output( + Array out, + Array value + ) const { + Array ret; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + ret[i] = reducer::combine(*(out[i]), value[i]); + } + return ret; + } + + + C10_DEVICE out_scalar_t get_accumulated_output( + out_scalar_t* out, arg_t value + ) const { + assert(!final_output); + return (out_scalar_t)value; + } + + template + C10_DEVICE void set_results(const T x, const uint32_t base_offset) const { + assert(noutputs == 1); + auto res = (out_scalar_t*)((char*)dst[0] + base_offset); + *res = x; + } + +//TODO - multi-output reduction - we won't be able to use thrust::pair +//just explicitly specify typed output reads/writes +//Currently implemented for max of two outputs +// template +// C10_DEVICE void set_results(const thrust::pair x, const index_t base_offset) const { +// if (noutputs >= 1) { +// auto res0 = (T1*)((char*)dst[0] + base_offset); +// *res0 = x.first; +// } +// if (noutputs >= 2) { +// // base offset is computed assuming element size being sizeof(T1), so we need to make a +// // correction to obtain the correct base offset +// auto res1 = (T2*) ((char *) dst[1] + base_offset / sizeof(T1) * sizeof(T2)); +// *res1 = x.second; +// } +// } + + template + C10_DEVICE void set_results_to_output(Array value, Array base_offset) const { + assert(final_output); + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + set_results(reducer::project(value[i]), base_offset[i]); + } + } + + template + C10_DEVICE Array global_reduce(Array value, Array *acc, char* shared_memory) const { + using arg_vec_t = Array; + using out_ptr_vec_t = Array; + using offset_vec_t = Array; + + arg_vec_t* reduce_buffer = (arg_vec_t*)cta_buf; + uint32_t output_idx = config.output_idx(); + offset_vec_t base_offsets; + out_ptr_vec_t out; + + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + base_offsets[i] = output_calc.get(output_idx + i)[0]; + out[i] = (out_scalar_t*)((char*)dst[0] + base_offsets[i]); + } + + bool should_store = config.should_store(output_idx); + if (should_store) { + uint32_t offset = config.staging_memory_offset(blockIdx.y); + reduce_buffer[offset] = value; + } + + __threadfence(); // make sure writes are globally visible + __syncthreads(); // if multiple warps in this block wrote to staging, make sure they're all done + bool is_last_block_done = mark_block_finished(); + + if (is_last_block_done) { + __threadfence(); //complete acquire pattern + value = ident; + if (config.should_block_x_reduce()) { + uint32_t input_offset = threadIdx.x + threadIdx.y * blockDim.x; + uint32_t step = blockDim.x * blockDim.y; + for (; input_offset < config.ctas_per_output; input_offset += step) { + uint32_t idx = config.staging_memory_offset(input_offset); + arg_vec_t next = reduce_buffer[idx]; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = reducer::combine(value[i], next[i]); + } + } + } else { + uint32_t input_offset = threadIdx.y; + uint32_t step = blockDim.y; + for (; input_offset < config.ctas_per_output; input_offset += step) { + uint32_t idx = config.staging_memory_offset(input_offset); + arg_vec_t next = reduce_buffer[idx]; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = reducer::combine(value[i], next[i]); + } + } + } + value = block_y_reduce(value, shared_memory); + if (config.should_block_x_reduce()) { + value = block_x_reduce(value, shared_memory); + } + if (should_store) { + if (accumulate) { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = reducer::translate_idx(value[i], base_idx); + } + } + + if (acc == nullptr) { + if (accumulate) { + value = accumulate_in_output(out, value); + } + if (final_output) { + set_results_to_output(value, base_offsets); + } else { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + *(out[i]) = get_accumulated_output(out[i], value[i]); + } + } + } else { + if (accumulate) { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = reducer::combine((*acc)[i], value[i]); + } + } + if (final_output) { + set_results_to_output(value, base_offsets); + } else { + *acc = value; + } + } + } + } + + return value; + } +}; + +extern "C" +__launch_bounds__(${max_threads_lb}, 4) +__global__ void reduction_${name}_kernel(ReduceJitOp r){ + r.run(); +} +)ESCAPE"; + +const std::string reduction_template = reduction_template_0 + reduction_template_1; + + +const std::string &get_reduction_template() { + return reduction_template; +} + +}} diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/im2col_shape_check.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/im2col_shape_check.h new file mode 100644 index 0000000000000000000000000000000000000000..8a6fa47ba10f108d4437ff46bc9f4610e9f37dc4 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/im2col_shape_check.h @@ -0,0 +1,232 @@ +#pragma once +#include +#include +#include + +namespace at::native { + +inline void col2im_shape_check( + const Tensor& input, + const Tensor& grad_output, + int64_t output_height, + int64_t output_width, + int64_t kernel_height, + int64_t kernel_width, + int64_t dilation_height, + int64_t dilation_width, + int64_t pad_height, + int64_t pad_width, + int64_t stride_height, + int64_t stride_width) { + TORCH_CHECK( + kernel_width > 0 && kernel_height > 0, + "kernel size should be greater than zero, but got kernel_height: ", + kernel_height, + " kernel_width: ", + kernel_width); + TORCH_CHECK( + stride_width > 0 && stride_height > 0, + "stride should be greater than zero, but got stride_height: ", + stride_height, + " stride_width: ", + stride_width); + TORCH_CHECK( + dilation_width > 0 && dilation_height > 0, + "dilation should be greater than zero, but got dilation_height: ", + dilation_height, + " dilation_width: ", + dilation_width); + TORCH_CHECK( + pad_width >= 0 && pad_height >= 0, + "padding should be non-negative, but got pad_height: ", + pad_height, + " pad_width: ", + pad_width); + + + int64_t ndim = input.ndimension(); + // allow dim=0 only the batch dimension. + TORCH_CHECK( + (ndim == 2 && input.size(0) != 0 && input.size(1) != 0) || + (ndim == 3 && input.size(1) != 0 && input.size(2) != 0), + "Expected 2D or 3D (batch mode) tensor for input with possibly 0 batch size and non-zero dimensions for input, but got: ", + input.sizes()); + + int64_t batch_dim = (ndim == 3) ? 0 : -1; + int64_t n_input_plane = input.size(batch_dim + 1); + + if (n_input_plane % (kernel_width * kernel_height) != 0) { + AT_ERROR( + "Expected size of input's dimension 1 to be divisible by the " + "product of kernel_size, but got input.size(1)=", + n_input_plane, + " and kernel_size=(", + kernel_height, + ", ", + kernel_width, + ")."); + } + + int64_t input_length = input.size(batch_dim + 2); + int64_t n_blocks_height = + div_rtn( + output_height + 2 * pad_height - + dilation_height * (kernel_height - 1) - 1, + stride_height) + + 1; + int64_t n_blocks_width = div_rtn( + output_width + 2 * pad_width - + dilation_width * (kernel_width - 1) - 1, + stride_width) + + 1; + + if (input_length != (n_blocks_height * n_blocks_width)) { + AT_ERROR( + "Given output_size=(", + output_height, + ", ", + output_width, + "), kernel_size=(", + kernel_height, + ", ", + kernel_width, + "), dilation=(", + dilation_height, + ", ", + dilation_width, + "), padding=(", + pad_height, + ", ", + pad_width, + "), stride=(", + stride_height, + ", ", + stride_width, + "), expected size of input's dimension 2 to match the calculated number of ", + "sliding blocks ", + n_blocks_height, + " * ", + n_blocks_width, + " = ", + (n_blocks_height * n_blocks_width), + ", but got input.size(2)=", + input_length, + "."); + } + + TORCH_CHECK( + n_blocks_height >= 1 && n_blocks_width >= 1, + "Given output_size=(", output_height, ", ", output_width, "), ", + "kernel_size=(", kernel_height, ", ", kernel_width, "), ", + "dilation=(", dilation_height, ", ", dilation_width, "), ", + "padding=(", pad_height, ", ", pad_width, "), ", + "stride=(", stride_height, ", ", stride_width, "), ", + "calculated shape of the array of sliding blocks as ", + "(", n_blocks_height, ", ", n_blocks_width, "), ", + "which is too small (non-positive)"); + + if (output_width < 1 || output_height < 1) { + AT_ERROR( + "Expected output spatial size to be positive, but got: output_size=(", + output_height, + ", ", + output_width, + ")."); + } +} + +inline void im2col_shape_check( + const Tensor& input, + const Tensor& grad_output, + int64_t kernel_height, + int64_t kernel_width, + int64_t dilation_height, + int64_t dilation_width, + int64_t pad_height, + int64_t pad_width, + int64_t stride_height, + int64_t stride_width) { + TORCH_CHECK( + kernel_width > 0 && kernel_height > 0, + "kernel size should be greater than zero, but got kernel_height: ", + kernel_height, + " kernel_width: ", + kernel_width); + + TORCH_CHECK( + dilation_width > 0 && dilation_height > 0, + "dilation should be greater than zero, but got dilation_height: ", + dilation_height, + " dilation_width: ", + dilation_width); + + TORCH_CHECK( + pad_width >= 0 && pad_height >= 0, + "padding should be non-negative, but got pad_height: ", + pad_height, + " pad_width: ", + pad_width); + + TORCH_CHECK( + stride_width > 0 && stride_height > 0, + "stride should be greater than zero, but got stride_height: ", + stride_height, + " stride_width: ", + stride_width); + + int64_t ndim = input.ndimension(); + + // allow dim=0 only the batch dimension. + bool valid_dims = input.size(1) != 0 && input.size(2) != 0; + TORCH_CHECK( + (ndim == 3 && input.size(0) && valid_dims) || + (ndim == 4 && valid_dims && input.size(3) != 0), + "Expected 3D or 4D (batch mode) tensor with possibly 0 batch size and other non-zero dimensions for input, but got: ", + input.sizes()); + + int64_t dim_batch = 0; + + if (ndim == 3) { + dim_batch = -1; + } + + int64_t input_height = input.size(dim_batch + 2); + int64_t input_width = input.size(dim_batch + 3); + int64_t output_height = div_rtn( + input_height + 2 * pad_height - + (dilation_height * (kernel_height - 1) + 1), + stride_height) + + 1; + int64_t output_width = div_rtn( + input_width + 2 * pad_width - + (dilation_width * (kernel_width - 1) + 1), + stride_width) + + 1; + + if (output_height < 1 || output_width < 1) { + AT_ERROR( + "Given input with spatial size (", + input_height, + ", ", + input_height, + "), kernel_size=(", + kernel_height, + ", ", + kernel_width, + "), dilation=(", + dilation_height, + ", ", + dilation_width, + "), padding=(", + pad_height, + ", ", + pad_width, + "), calculated shape of the array of sliding blocks as (", + output_height, + ", ", + output_width, + "), but its components must be at least one."); + } +} + +} // namespace at::native